[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...

gnash-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...

From:	Benjamin Wolsey
Subject:	[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...
Date:	Fri, 28 Mar 2008 13:52:31 +0000
CVSROOT:        /sources/gnash
Module name:    gnash
Changes by:     Benjamin Wolsey <bwy>   08/03/28 13:52:31

Modified files:
        .              : ChangeLog 
        libbase        : utf8.cpp utf8.h 
        server/vm      : ASHandlers.cpp 

Log message:
                * libbase/utf8.{cpp,h}: add a utf8::invalid constant.
                * server/vm/ASHandlers.cpp: use utf8 code for guessEncoding.
        
        Still not really sure how effective the guessing is, just checked for
        regressions.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/gnash/ChangeLog?cvsroot=gnash&r1=1.6057&r2=1.6058
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.cpp?cvsroot=gnash&r1=1.9&r2=1.10
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.h?cvsroot=gnash&r1=1.14&r2=1.15
http://cvs.savannah.gnu.org/viewcvs/gnash/server/vm/ASHandlers.cpp?cvsroot=gnash&r1=1.221&r2=1.222

Patches:
Index: ChangeLog
===================================================================
RCS file: /sources/gnash/gnash/ChangeLog,v
retrieving revision 1.6057
retrieving revision 1.6058
diff -u -b -r1.6057 -r1.6058
--- ChangeLog   28 Mar 2008 12:24:04 -0000      1.6057
+++ ChangeLog   28 Mar 2008 13:52:29 -0000      1.6058
@@ -1,5 +1,10 @@
 2008-03-28 Benjamin Wolsey <address@hidden>
 
+       * libbase/utf8.{cpp,h}: add a utf8::invalid constant.
+       * server/vm/ASHandlers.cpp: use utf8 code for guessEncoding.
+
+2008-03-28 Benjamin Wolsey <address@hidden>
+
        * server/asobj/string.cpp: (charAt) prevent another potential
          out-of-bounds string access.
 

Index: libbase/utf8.cpp
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.cpp,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- libbase/utf8.cpp    6 Feb 2008 15:21:34 -0000       1.9
+++ libbase/utf8.cpp    28 Mar 2008 13:52:30 -0000      1.10
@@ -23,6 +23,10 @@
 
 #include "utf8.h"
 
+// This isn't actually an invalid character; it's a valid char that
+// looks like an inverted question mark.
+#define INVALID_CHAR 0x0FFFD
+
 std::wstring
 utf8::decodeCanonicalString(const std::string& str, int version)
 {
@@ -35,6 +39,11 @@
        {
                while (boost::uint32_t code = decodeNextUnicodeCharacter(it))
                {
+                   if (code == utf8::invalid)
+                   {
+                           wstr.push_back(static_cast<wchar_t>(INVALID_CHAR));
+                           continue;       
+                   }
                        wstr.push_back(static_cast<wchar_t>(code));
                }
        }
@@ -97,10 +106,6 @@
        // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
        // 0xFFFF } then we ignore them; they are not valid in UTF-8.
 
-// This isn't actually an invalid character; it's a valid char that
-// looks like an inverted question mark.
-#define INVALID 0x0FFFD
-
 #define FIRST_BYTE(mask, shift)                \
        /* Post-increment iterator */ \
        uc = (*it++ & (mask)) << (shift);
@@ -108,7 +113,7 @@
 #define NEXT_BYTE(shift)                                               \
                                        \
        if (*it == 0) return 0; /* end of buffer, do not advance */     \
-       if ((*it & 0xC0) != 0x80) return INVALID; /* standard check */  \
+       if ((*it & 0xC0) != 0x80) return utf8::invalid; /* standard check */    \
        /* Post-increment iterator: */          \
        uc |= (*it++ & 0x3F) << shift;
 
@@ -123,7 +128,7 @@
                // Two-byte sequence.
                FIRST_BYTE(0x1F, 6);
                NEXT_BYTE(0);
-               if (uc < 0x80) return INVALID;  // overlong
+               if (uc < 0x80) return utf8::invalid;    // overlong
                return uc;
        }
        else if ((*it & 0xF0) == 0xE0)
@@ -132,9 +137,9 @@
                FIRST_BYTE(0x0F, 12);
                NEXT_BYTE(6);
                NEXT_BYTE(0);
-               if (uc < 0x800) return INVALID; // overlong
-               if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID;     // not valid ISO 10646
-               if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID;     // not valid ISO 10646
+               if (uc < 0x800) return utf8::invalid;   // overlong
+               if (uc >= 0x0D800 && uc <= 0x0DFFF) return utf8::invalid;       // not valid ISO 10646
+               if (uc == 0x0FFFE || uc == 0x0FFFF) return utf8::invalid;       // not valid ISO 10646
                return uc;
        }
        else if ((*it & 0xF8) == 0xF0)
@@ -144,7 +149,7 @@
                NEXT_BYTE(12);
                NEXT_BYTE(6);
                NEXT_BYTE(0);
-               if (uc < 0x010000) return INVALID;      // overlong
+               if (uc < 0x010000) return utf8::invalid;        // overlong
                return uc;
        }
        else if ((*it & 0xFC) == 0xF8)
@@ -155,7 +160,7 @@
                NEXT_BYTE(12);
                NEXT_BYTE(6);
                NEXT_BYTE(0);
-               if (uc < 0x0200000) return INVALID;     // overlong
+               if (uc < 0x0200000) return utf8::invalid;       // overlong
                return uc;
        }
        else if ((*it & 0xFE) == 0xFC)
@@ -167,14 +172,14 @@
                NEXT_BYTE(12);
                NEXT_BYTE(6);
                NEXT_BYTE(0);
-               if (uc < 0x04000000) return INVALID;    // overlong
+               if (uc < 0x04000000) return utf8::invalid;      // overlong
                return uc;
        }
        else
        {
                // Invalid.
                it++;
-               return INVALID;
+               return utf8::invalid;
        }
 }
 

Index: libbase/utf8.h
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.h,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- libbase/utf8.h      27 Mar 2008 16:12:36 -0000      1.14
+++ libbase/utf8.h      28 Mar 2008 13:52:30 -0000      1.15
@@ -53,6 +53,8 @@
 /// gnash::edit_text_character, ord() and chr().
 namespace utf8
 {
+    static const boost::uint32_t invalid = -1;
+
        /// Converts a std::string with multibyte characters into a std::wstring.
        //
        /// @return a version-dependent wstring.

Index: server/vm/ASHandlers.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/vm/ASHandlers.cpp,v
retrieving revision 1.221
retrieving revision 1.222
diff -u -b -r1.221 -r1.222
--- server/vm/ASHandlers.cpp    27 Mar 2008 14:32:03 -0000      1.221
+++ server/vm/ASHandlers.cpp    28 Mar 2008 13:52:30 -0000      1.222
@@ -1538,45 +1538,34 @@
 
     std::string::const_iterator it = str.begin();
     length = 0;
-    int index = 0;
     
     // First, assume it's UTF8 and try to be wrong.
     while (it != str.end() && is_sought)
     {
-        int c = static_cast<int>(*it);
+        ++length;
 
-        if (width)
-        {
-            --width;
-            if ((c & 0xB0) != 0x80)
+        offsets.push_back(it - str.begin()); // current position
+
+        // Advances the iterator to point to the next 
+        boost::uint32_t c = utf8::decodeNextUnicodeCharacter(it);
+
+        if (c == utf8::invalid)
             {
                 is_sought = false;
+            break;
             }
-            continue;
-        }
-        ++length;
-        offsets.push_back(index); //[length - 1] = index;
-
-        if ((c & 0xC0) == 0x80) continue; // A 1 byte character.
-        else if ((c & 0xE0) == 0xC0) width = 1;
-        else if ((c & 0xF0) == 0xE0) width = 2;
-        else if ((c & 0xF8) == 0xF0) width = 3;
-        else if (c & 0x80) is_sought = false;
-            
-        ++it;
-        ++index;
     }
 
-    offsets.push_back(index); // [length - 1] = index;
+    offsets.push_back(it - str.begin()); // current position
 
-    if (!width && is_sought)
+    if (it == str.end() && is_sought)
     {
-        // No width left, so it's almost certainly UTF8.
+        // No characters left, so it's almost certainly UTF8.
         return ENCGUESS_UNICODE;
     }
 
     it = str.begin();
-    index = 0;
+    int index = 0;
     is_sought = true;
     width = 0;
     length = 0;
[Prev in Thread]
Current Thread
[Next in Thread]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h..., Benjamin Wolsey <=
Prev by Date: [Gnash-commit] gnash ChangeLog server/asobj/string.cpp
Next by Date: [Gnash-commit] gnash ChangeLog libbase/utf8.h
Previous by thread: [Gnash-commit] gnash ChangeLog libamf/amf.h libamf/lcshm.h lib...
Next by thread: [Gnash-commit] gnash ChangeLog backend/render_handler_cairo.h ...
Index(es):
- Date
- Thread