[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...
From: |
Benjamin Wolsey |
Subject: |
[Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h... |
Date: |
Fri, 28 Mar 2008 13:52:31 +0000 |
CVSROOT: /sources/gnash
Module name: gnash
Changes by: Benjamin Wolsey <bwy> 08/03/28 13:52:31
Modified files:
. : ChangeLog
libbase : utf8.cpp utf8.h
server/vm : ASHandlers.cpp
Log message:
* libbase/utf8.{cpp,h}: add a utf8::invalid constant.
* server/vm/ASHandlers.cpp: use utf8 code for guessEncoding.
Still not really sure how effective the guessing is, just checked for
regressions.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/gnash/ChangeLog?cvsroot=gnash&r1=1.6057&r2=1.6058
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.cpp?cvsroot=gnash&r1=1.9&r2=1.10
http://cvs.savannah.gnu.org/viewcvs/gnash/libbase/utf8.h?cvsroot=gnash&r1=1.14&r2=1.15
http://cvs.savannah.gnu.org/viewcvs/gnash/server/vm/ASHandlers.cpp?cvsroot=gnash&r1=1.221&r2=1.222
Patches:
Index: ChangeLog
===================================================================
RCS file: /sources/gnash/gnash/ChangeLog,v
retrieving revision 1.6057
retrieving revision 1.6058
diff -u -b -r1.6057 -r1.6058
--- ChangeLog 28 Mar 2008 12:24:04 -0000 1.6057
+++ ChangeLog 28 Mar 2008 13:52:29 -0000 1.6058
@@ -1,5 +1,10 @@
2008-03-28 Benjamin Wolsey <address@hidden>
+ * libbase/utf8.{cpp,h}: add a utf8::invalid constant.
+ * server/vm/ASHandlers.cpp: use utf8 code for guessEncoding.
+
+2008-03-28 Benjamin Wolsey <address@hidden>
+
* server/asobj/string.cpp: (charAt) prevent another potential
out-of-bounds string access.
Index: libbase/utf8.cpp
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.cpp,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -b -r1.9 -r1.10
--- libbase/utf8.cpp 6 Feb 2008 15:21:34 -0000 1.9
+++ libbase/utf8.cpp 28 Mar 2008 13:52:30 -0000 1.10
@@ -23,6 +23,10 @@
#include "utf8.h"
+// This isn't actually an invalid character; it's a valid char that
+// looks like an inverted question mark.
+#define INVALID_CHAR 0x0FFFD
+
std::wstring
utf8::decodeCanonicalString(const std::string& str, int version)
{
@@ -35,6 +39,11 @@
{
while (boost::uint32_t code = decodeNextUnicodeCharacter(it))
{
+ if (code == utf8::invalid)
+ {
+ wstr.push_back(static_cast<wchar_t>(INVALID_CHAR));
+ continue;
+ }
wstr.push_back(static_cast<wchar_t>(code));
}
}
@@ -97,10 +106,6 @@
// If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
// 0xFFFF } then we ignore them; they are not valid in UTF-8.
-// This isn't actually an invalid character; it's a valid char that
-// looks like an inverted question mark.
-#define INVALID 0x0FFFD
-
#define FIRST_BYTE(mask, shift) \
/* Post-increment iterator */ \
uc = (*it++ & (mask)) << (shift);
@@ -108,7 +113,7 @@
#define NEXT_BYTE(shift) \
\
if (*it == 0) return 0; /* end of buffer, do not advance */ \
- if ((*it & 0xC0) != 0x80) return INVALID; /* standard check */ \
+ if ((*it & 0xC0) != 0x80) return utf8::invalid; /* standard check */
\
/* Post-increment iterator: */ \
uc |= (*it++ & 0x3F) << shift;
@@ -123,7 +128,7 @@
// Two-byte sequence.
FIRST_BYTE(0x1F, 6);
NEXT_BYTE(0);
- if (uc < 0x80) return INVALID; // overlong
+ if (uc < 0x80) return utf8::invalid; // overlong
return uc;
}
else if ((*it & 0xF0) == 0xE0)
@@ -132,9 +137,9 @@
FIRST_BYTE(0x0F, 12);
NEXT_BYTE(6);
NEXT_BYTE(0);
- if (uc < 0x800) return INVALID; // overlong
- if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID; // not valid ISO 10646
- if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID; // not valid ISO 10646
+ if (uc < 0x800) return utf8::invalid; // overlong
+ if (uc >= 0x0D800 && uc <= 0x0DFFF) return utf8::invalid; // not valid ISO 10646
+ if (uc == 0x0FFFE || uc == 0x0FFFF) return utf8::invalid; // not valid ISO 10646
return uc;
}
else if ((*it & 0xF8) == 0xF0)
@@ -144,7 +149,7 @@
NEXT_BYTE(12);
NEXT_BYTE(6);
NEXT_BYTE(0);
- if (uc < 0x010000) return INVALID; // overlong
+ if (uc < 0x010000) return utf8::invalid; // overlong
return uc;
}
else if ((*it & 0xFC) == 0xF8)
@@ -155,7 +160,7 @@
NEXT_BYTE(12);
NEXT_BYTE(6);
NEXT_BYTE(0);
- if (uc < 0x0200000) return INVALID; // overlong
+ if (uc < 0x0200000) return utf8::invalid; // overlong
return uc;
}
else if ((*it & 0xFE) == 0xFC)
@@ -167,14 +172,14 @@
NEXT_BYTE(12);
NEXT_BYTE(6);
NEXT_BYTE(0);
- if (uc < 0x04000000) return INVALID; // overlong
+ if (uc < 0x04000000) return utf8::invalid; // overlong
return uc;
}
else
{
// Invalid.
it++;
- return INVALID;
+ return utf8::invalid;
}
}
Index: libbase/utf8.h
===================================================================
RCS file: /sources/gnash/gnash/libbase/utf8.h,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- libbase/utf8.h 27 Mar 2008 16:12:36 -0000 1.14
+++ libbase/utf8.h 28 Mar 2008 13:52:30 -0000 1.15
@@ -53,6 +53,8 @@
/// gnash::edit_text_character, ord() and chr().
namespace utf8
{
+ static const boost::uint32_t invalid = -1;
+
/// Converts a std::string with multibyte characters into a std::wstring.
//
/// @return a version-dependent wstring.
Index: server/vm/ASHandlers.cpp
===================================================================
RCS file: /sources/gnash/gnash/server/vm/ASHandlers.cpp,v
retrieving revision 1.221
retrieving revision 1.222
diff -u -b -r1.221 -r1.222
--- server/vm/ASHandlers.cpp 27 Mar 2008 14:32:03 -0000 1.221
+++ server/vm/ASHandlers.cpp 28 Mar 2008 13:52:30 -0000 1.222
@@ -1538,45 +1538,34 @@
std::string::const_iterator it = str.begin();
length = 0;
- int index = 0;
// First, assume it's UTF8 and try to be wrong.
while (it != str.end() && is_sought)
{
- int c = static_cast<int>(*it);
+ ++length;
- if (width)
- {
- --width;
- if ((c & 0xB0) != 0x80)
+ offsets.push_back(it - str.begin()); // current position
+
+ // Advances the iterator to point to the next
+ boost::uint32_t c = utf8::decodeNextUnicodeCharacter(it);
+
+ if (c == utf8::invalid)
{
is_sought = false;
+ break;
}
- continue;
- }
- ++length;
- offsets.push_back(index); //[length - 1] = index;
-
- if ((c & 0xC0) == 0x80) continue; // A 1 byte character.
- else if ((c & 0xE0) == 0xC0) width = 1;
- else if ((c & 0xF0) == 0xE0) width = 2;
- else if ((c & 0xF8) == 0xF0) width = 3;
- else if (c & 0x80) is_sought = false;
-
- ++it;
- ++index;
}
- offsets.push_back(index); // [length - 1] = index;
+ offsets.push_back(it - str.begin()); // current position
- if (!width && is_sought)
+ if (it == str.end() && is_sought)
{
- // No width left, so it's almost certainly UTF8.
+ // No characters left, so it's almost certainly UTF8.
return ENCGUESS_UNICODE;
}
it = str.begin();
- index = 0;
+ int index = 0;
is_sought = true;
width = 0;
length = 0;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [Gnash-commit] gnash ChangeLog libbase/utf8.cpp libbase/utf8.h...,
Benjamin Wolsey <=