[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0
From: Daiki Ueno
Subject: [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0
Date: Fri, 10 Oct 2014 22:59:51 +0900
* lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value.
* lib/uniwbrk.in.h (WBP_RI): New enumeration value.
* lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c.
Normalize table index skipping ignored properties.
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI. Remove
WBP_EXTEND and WBP_FORMAT, which are now computed without using
the table.
* lib/uniwbrk/wbrktable.h: Adjust table size.
* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_RI.
* lib/unigbrk.in.h (GBP_RI): New enumeration value.
* lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK):
Support rule GB8a.
(UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI.
* tests/unigbrk/test-uc-gbrk-prop.c
(graphemebreakproperty_to_string): Support GBP_RI.
* tests/unigbrk/test-uc-is-grapheme-break.c
(graphemebreakproperty_to_string): Support GBP_RI.
* lib/gen-uni-tables.c (LBP_RI): New enumeration value.
(get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp)
(output_lbp): Support LBP_RI.
(WBP_RI): New enumeration value.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support WBP_RI.
(GBP_RI): New enumeration value.
(output_gbp_test, fill_org_gbp): Support GBP_RI.
---
lib/gen-uni-tables.c | 31 +++++++++++++++++++++++++-----
lib/unigbrk.in.h | 3 ++-
lib/unigbrk/uc-is-grapheme-break.c | 9 +++++++--
lib/unilbrk/lbrktables.h | 1 +
lib/uniwbrk.in.h | 3 ++-
lib/uniwbrk/u-wordbreaks.h | 32 ++++++++++++++++++++-----------
lib/uniwbrk/wbrktable.c | 24 +++++++++++------------
lib/uniwbrk/wbrktable.h | 2 +-
tests/unigbrk/test-uc-gbrk-prop.c | 1 +
tests/unigbrk/test-uc-is-grapheme-break.c | 1 +
tests/uniwbrk/test-uc-wordbreaks.c | 1 +
11 files changed, 75 insertions(+), 33 deletions(-)
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index 16af39f..ce63ae4 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 6.1.0
+ 6.2.0
*/
#include <stdbool.h>
@@ -6249,6 +6249,7 @@ enum
LBP_JL = 22, /* Hangul L Jamo */
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
+ LBP_RI = 34, /* regional indicator */
LBP_SA = 31, /* complex context (South East Asian) */
LBP_XX = 32 /* unknown */
};
@@ -6708,6 +6709,10 @@ get_lbp (unsigned int ch)
if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
attr |= (int64_t) 1 << LBP_JT;
+ /* regional indicator */
+ if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
+ attr |= (int64_t) 1 << LBP_RI;
+
/* complex context (South East Asian) */
if (((unicode_attributes[ch].category[0] == 'C'
&& unicode_attributes[ch].category[1] == 'f')
@@ -6860,7 +6865,7 @@ get_lbp (unsigned int ch)
|| ch == 0x2064 /* INVISIBLE PLUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x110BD /* KAITHI NUMBER SIGN */)
- if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) | ((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) | ((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) | ((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) | ((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) | ((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) | ((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) | ((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) | ((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
{
/* ambiguous (alphabetic) ? */
if ((unicode_width[ch] != NULL
@@ -6985,6 +6990,7 @@ debug_output_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
@@ -7100,6 +7106,7 @@ fill_org_lbp (const char *linebreak_filename)
TRY(LBP_JL)
TRY(LBP_JV)
TRY(LBP_JT)
+ TRY(LBP_RI)
TRY(LBP_SA)
TRY(LBP_XX)
#undef TRY
@@ -7182,6 +7189,7 @@ debug_output_org_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
@@ -7356,6 +7364,7 @@ output_lbp (FILE *stream1, FILE *stream2)
CASE(LBP_JL);
CASE(LBP_JV);
CASE(LBP_JT);
+ CASE(LBP_RI);
CASE(LBP_SA);
CASE(LBP_XX);
#undef CASE
@@ -7455,7 +7464,8 @@ enum
WBP_MIDLETTER = 4,
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
- WBP_EXTENDNUMLET = 7
+ WBP_EXTENDNUMLET = 7,
+ WBP_RI = 13
};
/* Returns the word breaking property for ch, as a bit mask. */
@@ -7523,6 +7533,9 @@ get_wbp (unsigned int ch)
if (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Pc") == 0)
attr |= 1 << WBP_EXTENDNUMLET;
+
+ if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
+ attr |= 1 << WBP_RI;
}
if (attr == 0)
@@ -7568,7 +7581,9 @@ debug_output_wbp (FILE *stream)
fprintf (stream, " Numeric");
if (attr & (1 << WBP_EXTENDNUMLET))
fprintf (stream, " ExtendNumLet");
- fprintf (stream, "\n");
+ if (attr & (1 << WBP_RI))
+ fprintf (stream, " Regional_Indicator");
+ fprintf (stream, "\n");
}
}
}
@@ -7653,6 +7668,7 @@ fill_org_wbp (const char *wordbreakproperty_filename)
PROP ("MidNum", WBP_MIDNUM)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+ PROP ("Regional_Indicator", WBP_RI)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7699,6 +7715,7 @@ debug_output_org_wbp (FILE *stream)
PROP ("MidNum", WBP_MIDNUM)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+ PROP ("Regional_Indicator", WBP_RI)
#undef PROP
fprintf (stream, " ??");
fprintf (stream, "\n");
@@ -7851,6 +7868,7 @@ output_wbp (FILE *stream)
CASE(WBP_MIDNUM);
CASE(WBP_NUMERIC);
CASE(WBP_EXTENDNUMLET);
+ CASE(WBP_RI);
#undef CASE
default:
abort ();
@@ -7931,7 +7949,8 @@ enum
GBP_V = 8,
GBP_T = 9,
GBP_LV = 10,
- GBP_LVT = 11
+ GBP_LVT = 11,
+ GBP_RI = 12
};
/* Construction of sparse 3-level tables. */
@@ -8002,6 +8021,7 @@ output_gbp_test (const char *filename)
CASE (GBP_T)
CASE (GBP_LV)
CASE (GBP_LVT)
+ CASE (GBP_RI)
#undef CASE
default:
abort ();
@@ -8199,6 +8219,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
PROP ("T", GBP_T)
PROP ("LV", GBP_LV)
PROP ("LVT", GBP_LVT)
+ PROP ("Regional_Indicator", GBP_RI)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h
index 8335e5a..a708a8c 100644
--- a/lib/unigbrk.in.h
+++ b/lib/unigbrk.in.h
@@ -51,7 +51,8 @@ enum
GBP_V = 8,
GBP_T = 9,
GBP_LV = 10,
- GBP_LVT = 11
+ GBP_LVT = 11,
+ GBP_RI = 12
};
/* Return the Grapheme_Cluster_Break property of a Unicode character. */
diff --git a/lib/unigbrk/uc-is-grapheme-break.c b/lib/unigbrk/uc-is-grapheme-break.c
index 0e61e79..7d1759c 100644
--- a/lib/unigbrk/uc-is-grapheme-break.c
+++ b/lib/unigbrk/uc-is-grapheme-break.c
@@ -47,6 +47,9 @@
/* GB8 */ \
((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false : \
\
+ /* GB8a */ \
+ (A) == GBP_RI && (B) == GBP_RI ? false : \
+ \
/* GB9 */ \
(B) == GBP_EXTEND ? false : \
\
@@ -71,9 +74,10 @@
| (UC_IS_GRAPHEME_BREAK(A, GBP_V) << GBP_V) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_T) << GBP_T) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_LV) << GBP_LV) \
- | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT))
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI))
-static const unsigned short int gb_table[12] =
+static const unsigned short int gb_table[13] =
{
UC_GRAPHEME_BREAKS_FOR(0), /* GBP_OTHER */
UC_GRAPHEME_BREAKS_FOR(1), /* GBP_CR */
@@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] =
UC_GRAPHEME_BREAKS_FOR(9), /* GBP_T */
UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */
UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */
+ UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */
};
bool
diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h
index 9014573..1467926 100644
--- a/lib/unilbrk/lbrktables.h
+++ b/lib/unilbrk/lbrktables.h
@@ -59,6 +59,7 @@ enum
LBP_JL = 22, /* Hangul L Jamo */
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
+ LBP_RI = 34, /* regional indicator */
LBP_SA = 31, /* complex context (South East Asian) */
LBP_XX = 32 /* unknown */
};
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index ab4b532..c272d48 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -49,7 +49,8 @@ enum
WBP_MIDLETTER = 4,
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
- WBP_EXTENDNUMLET = 7
+ WBP_EXTENDNUMLET = 7,
+ WBP_RI = 13
};
/* Return the Word_Break property of a Unicode character. */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 33ca7eb..1d7f951 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p)
if (last_char_prop == WBP_CR && prop == WBP_LF)
/* *p = 0 */;
/* Break before and after newlines. */
- else if (last_char_prop >= WBP_NEWLINE
- /* same as:
- last_char_prop == WBP_CR
- || last_char_prop == WBP_LF
- || last_char_prop == WBP_NEWLINE */
- || prop >= WBP_NEWLINE
- /* same as:
- prop == WBP_CR
- || prop == WBP_LF
- || prop == WBP_NEWLINE */)
+ else if ((last_char_prop == WBP_CR
+ || last_char_prop == WBP_LF
+ || last_char_prop == WBP_NEWLINE)
+ || (prop == WBP_CR
+ || prop == WBP_LF
+ || prop == WBP_NEWLINE))
*p = 1;
/* Ignore Format and Extend characters. */
else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
@@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p)
(ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ Regional_Indicator × Regional_Indicator (WB13c)
*/
/* No break across certain punctuation. Also, disable word
breaks that were recognized earlier (due to lookahead of
@@ -103,8 +100,21 @@ FUNC (const UNIT *s, size_t n, char *p)
}
else
{
+ /* Normalize property value to table index,
+ skipping 5 properties: WBP_EXTEND,
+ WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
+ WBP_LF. */
+ int last_compchar_prop_index = last_compchar_prop;
+ int prop_index = prop;
+
+ if (last_compchar_prop_index >= WBP_EXTEND)
+ last_compchar_prop_index -= 5;
+
+ if (prop_index >= WBP_EXTEND)
+ prop_index -= 5;
+
/* Perform a single table lookup. */
- if (uniwbrk_table[last_compchar_prop][prop])
+ if (uniwbrk_table[last_compchar_prop_index][prop_index])
*p = 1;
/* else *p = 0; */
}
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 7cbe4d6..04bd0e5 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -32,21 +32,21 @@
(ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ Regional_Indicator × Regional_Indicator (WB13c)
*/
-const unsigned char uniwbrk_table[10][8] =
+const unsigned char uniwbrk_table[9][9] =
{ /* current: OTHER MIDNUMLET NUMERIC */
/* KATAKANA MIDLETTER EXTENDNUMLET */
- /* ALETTER MIDNUM */
+ /* ALETTER MIDNUM RI */
/* last */
- /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 },
- /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0 },
- /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0 },
- /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0 },
- /* WBP_EXTEND */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_FORMAT */ { 1, 1, 1, 1, 1, 1, 1, 1 }
+ /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1 },
+ /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 },
+ /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1 },
+ /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1 },
+ /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 }
};
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 1b48adf..50b7823 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-extern const unsigned char uniwbrk_table[10][8];
+extern const unsigned char uniwbrk_table[9][9];
diff --git a/tests/unigbrk/test-uc-gbrk-prop.c b/tests/unigbrk/test-uc-gbrk-prop.c
index 1c71280..4bfbdba 100644
--- a/tests/unigbrk/test-uc-gbrk-prop.c
+++ b/tests/unigbrk/test-uc-gbrk-prop.c
@@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp)
CASE(T)
CASE(LV)
CASE(LVT)
+ CASE(RI)
}
abort ();
}
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c b/tests/unigbrk/test-uc-is-grapheme-break.c
index a93f6f2..dbaf3dc 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp)
CASE(T)
CASE(LV)
CASE(LVT)
+ CASE(RI)
}
abort ();
}
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c b/tests/uniwbrk/test-uc-wordbreaks.c
index 87e0e05..710f583 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -44,6 +44,7 @@ wordbreakproperty_to_string (int wbp)
CASE(MIDNUM)
CASE(NUMERIC)
CASE(EXTENDNUMLET)
+ CASE(RI)
}
abort ();
}
--
2.1.1
- [bug-libunistring] [PATCH 0/8] Update libunistring-related modules to Unicode 7.0.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 1/8] gen-uni-tables: Check out-of-range values added to 3-level tables, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 2/8] unictype/joininggroup-of: Switch to 3-level table, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 4/8] uniwbrk/u32-wordbreaks-tests: Test using WordBreakTest.txt from UCD, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 3/8] uniwbrk: Ignore Extended/Format at the beginning of the line, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 5/8] Update to Unicode 6.1.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 7/8] Update to Unicode 6.3.0, Daiki Ueno, 2014/10/10
- [bug-libunistring] [PATCH 6/8] Update to Unicode 6.2.0,
Daiki Ueno <=
- [bug-libunistring] [PATCH 8/8] Update to Unicode 7.0.0, Daiki Ueno, 2014/10/10