[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH v2 08/10] Update to Unicode 6.2.0
From: Daiki Ueno
Subject: [PATCH v2 08/10] Update to Unicode 6.2.0
Date: Thu, 23 Oct 2014 17:01:39 +0900
* lib/unilbrk/lbrktables.h (LBP_RI): New enumeration value.
(unilbrk_table): Adjust table size.
* lib/unilbrk/lbrktables.c (unilbrk_table): Add a row and column
for LBP_RI.
* lib/uniwbrk.in.h (WBP_RI): New enumeration value.
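* lib/uniwbrk/u-wordbreaks.h (FUNC): Support rule WB13c.
Test for the CR, LF and Newline properties explicitly instead of
by range comparison.  Break after Extend and Format characters
without consulting the table.
Normalize table index skipping ignored properties.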
* lib/uniwbrk/wbrktable.c (uniwbrk_table): Support WBP_RI. Remove
the rows for WBP_EXTEND and WBP_FORMAT, which are now handled
without using the table.
* lib/uniwbrk/wbrktable.h: Adjust table size.
* tests/uniwbrk/test-uc-wordbreaks.c
(wordbreakproperty_to_string): Support WBP_RI.
* lib/unigbrk.in.h (GBP_RI): New enumeration value.
* lib/unigbrk/uc-is-grapheme-break.c (UC_IS_GRAPHEME_BREAK):
Support rule GB8a.
(UC_GRAPHEME_BREAKS_FOR, gb_table): Support GBP_RI.
* tests/unigbrk/test-uc-gbrk-prop.c
(graphemebreakproperty_to_string): Support GBP_RI.
* tests/unigbrk/test-uc-is-grapheme-break.c
(graphemebreakproperty_to_string): Support GBP_RI.
* lib/gen-uni-tables.c (LBP_RI): New enumeration value.
(get_lbp, debug_output_lbp, fill_org_lbp, debug_output_org_lbp)
(output_lbp): Support LBP_RI.
(WBP_RI): New enumeration value.
(get_wbp, debug_output_wbp, fill_org_wbp, debug_output_org_wbp)
(output_wbp): Support WBP_RI.
(GBP_RI): New enumeration value.
(output_gbp_test, fill_org_gbp): Support GBP_RI.
---
lib/gen-uni-tables.c | 49 ++++++++++++++++++--------
lib/unigbrk.in.h | 3 +-
lib/unigbrk/uc-is-grapheme-break.c | 9 +++--
lib/unilbrk/lbrktables.c | 57 ++++++++++++++++---------------
lib/unilbrk/lbrktables.h | 21 ++++++------
lib/uniwbrk.in.h | 3 +-
lib/uniwbrk/u-wordbreaks.h | 36 +++++++++++++------
lib/uniwbrk/wbrktable.c | 24 ++++++-------
lib/uniwbrk/wbrktable.h | 2 +-
tests/unigbrk/test-uc-gbrk-prop.c | 1 +
tests/unigbrk/test-uc-is-grapheme-break.c | 1 +
tests/uniwbrk/test-uc-wordbreaks.c | 1 +
12 files changed, 127 insertions(+), 80 deletions(-)
diff --git a/lib/gen-uni-tables.c b/lib/gen-uni-tables.c
index ec1aba5..f833777 100644
--- a/lib/gen-uni-tables.c
+++ b/lib/gen-uni-tables.c
@@ -32,7 +32,7 @@
/usr/local/share/Unidata/CompositionExclusions.txt \
/usr/local/share/Unidata/SpecialCasing.txt \
/usr/local/share/Unidata/CaseFolding.txt \
- 6.1.0
+ 6.2.0
*/
#include <stdbool.h>
@@ -6213,22 +6213,22 @@ output_width_property_test (const char *filename)
enum
{
- /* Values >= 26 are resolved at run time. */
- LBP_BK = 26, /* mandatory break */
+ /* Values >= 27 are resolved at run time. */
+ LBP_BK = 27, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 27, /* attached characters and combining marks */
+ LBP_CM = 28, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK
*/
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 28, /* zero width space */
+ LBP_ZW = 29, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 29, /* space */
+ LBP_SP = 30, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 30, /* contingent break opportunity */
+ LBP_CB = 31, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -6241,7 +6241,7 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
/*LBP_CJ, conditional Japanese starter, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
@@ -6251,8 +6251,9 @@ enum
LBP_JL = 22, /* Hangul L Jamo */
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
- LBP_SA = 32, /* complex context (South East Asian) */
- LBP_XX = 33 /* unknown */
+ LBP_RI = 26, /* regional indicator */
+ LBP_SA = 33, /* complex context (South East Asian) */
+ LBP_XX = 34 /* unknown */
};
/* Returns the line breaking classification for ch, as a bit mask. */
@@ -6710,6 +6711,10 @@ get_lbp (unsigned int ch)
if ((ch >= 0x11A8 && ch <= 0x11FF) || (ch >= 0xD7CB && ch <= 0xD7FB))
attr |= (int64_t) 1 << LBP_JT;
+ /* regional indicator */
+ if (ch >= 0x1F1E6 && ch <= 0x1F1FF)
+ attr |= (int64_t) 1 << LBP_RI;
+
/* complex context (South East Asian) */
if (((unicode_attributes[ch].category[0] == 'C'
&& unicode_attributes[ch].category[1] == 'f')
@@ -6862,7 +6867,7 @@ get_lbp (unsigned int ch)
|| ch == 0x2064 /* INVISIBLE PLUS */
/* Extra characters for compatibility with Unicode LineBreak.txt. */
|| ch == 0x110BD /* KAITHI NUMBER SIGN */)
- if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) |
((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) |
((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) |
((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) |
((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) |
((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) |
((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) |
((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) |
((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_SA) | ((int64_t) 1 << LBP_ID))))
+ if (!(attr & (((int64_t) 1 << LBP_GL) | ((int64_t) 1 << LBP_B2) |
((int64_t) 1 << LBP_BA) | ((int64_t) 1 << LBP_BB) | ((int64_t) 1 << LBP_HY) |
((int64_t) 1 << LBP_CB) | ((int64_t) 1 << LBP_CL) | ((int64_t) 1 << LBP_CP) |
((int64_t) 1 << LBP_EX) | ((int64_t) 1 << LBP_IN) | ((int64_t) 1 << LBP_NS) |
((int64_t) 1 << LBP_OP) | ((int64_t) 1 << LBP_QU) | ((int64_t) 1 << LBP_IS) |
((int64_t) 1 << LBP_NU) | ((int64_t) 1 << LBP_PO) | ((int64_t) 1 << LBP_PR) |
((int64_t) 1 << LBP_SY) | ((int64_t) 1 << LBP_H2) | ((int64_t) 1 << LBP_H3) |
((int64_t) 1 << LBP_HL) | ((int64_t) 1 << LBP_JL) | ((int64_t) 1 << LBP_JV) |
((int64_t) 1 << LBP_JT) | ((int64_t) 1 << LBP_RI) | ((int64_t) 1 << LBP_SA) |
((int64_t) 1 << LBP_ID))))
{
/* ambiguous (alphabetic) ? */
if ((unicode_width[ch] != NULL
@@ -6987,6 +6992,7 @@ debug_output_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
@@ -7102,6 +7108,7 @@ fill_org_lbp (const char *linebreak_filename)
TRY(LBP_JL)
TRY(LBP_JV)
TRY(LBP_JT)
+ TRY(LBP_RI)
TRY(LBP_SA)
TRY(LBP_XX)
#undef TRY
@@ -7184,6 +7191,7 @@ debug_output_org_lbp (FILE *stream)
PRINT_BIT(attr,LBP_JL);
PRINT_BIT(attr,LBP_JV);
PRINT_BIT(attr,LBP_JT);
+ PRINT_BIT(attr,LBP_RI);
PRINT_BIT(attr,LBP_SA);
PRINT_BIT(attr,LBP_XX);
#undef PRINT_BIT
@@ -7358,6 +7366,7 @@ output_lbp (FILE *stream1, FILE *stream2)
CASE(LBP_JL);
CASE(LBP_JV);
CASE(LBP_JT);
+ CASE(LBP_RI);
CASE(LBP_SA);
CASE(LBP_XX);
#undef CASE
@@ -7457,7 +7466,8 @@ enum
WBP_MIDLETTER = 4,
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
- WBP_EXTENDNUMLET = 7
+ WBP_EXTENDNUMLET = 7,
+ WBP_RI = 13
};
/* Returns the word breaking property for ch, as a bit mask. */
@@ -7525,6 +7535,9 @@ get_wbp (unsigned int ch)
if (unicode_attributes[ch].category != NULL
&& strcmp (unicode_attributes[ch].category, "Pc") == 0)
attr |= 1 << WBP_EXTENDNUMLET;
+
+ if (((get_lbp (ch) >> LBP_RI) & 1) != 0)
+ attr |= 1 << WBP_RI;
}
if (attr == 0)
@@ -7570,7 +7583,9 @@ debug_output_wbp (FILE *stream)
fprintf (stream, " Numeric");
if (attr & (1 << WBP_EXTENDNUMLET))
fprintf (stream, " ExtendNumLet");
- fprintf (stream, "\n");
+ if (attr & (1 << WBP_RI))
+ fprintf (stream, " Regional_Indicator");
+ fprintf (stream, "\n");
}
}
}
@@ -7655,6 +7670,7 @@ fill_org_wbp (const char *wordbreakproperty_filename)
PROP ("MidNum", WBP_MIDNUM)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+ PROP ("Regional_Indicator", WBP_RI)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in '%s'\n", propname,
@@ -7701,6 +7717,7 @@ debug_output_org_wbp (FILE *stream)
PROP ("MidNum", WBP_MIDNUM)
PROP ("Numeric", WBP_NUMERIC)
PROP ("ExtendNumLet", WBP_EXTENDNUMLET)
+ PROP ("Regional_Indicator", WBP_RI)
#undef PROP
fprintf (stream, " ??");
fprintf (stream, "\n");
@@ -7853,6 +7870,7 @@ output_wbp (FILE *stream)
CASE(WBP_MIDNUM);
CASE(WBP_NUMERIC);
CASE(WBP_EXTENDNUMLET);
+ CASE(WBP_RI);
#undef CASE
default:
abort ();
@@ -7933,7 +7951,8 @@ enum
GBP_V = 8,
GBP_T = 9,
GBP_LV = 10,
- GBP_LVT = 11
+ GBP_LVT = 11,
+ GBP_RI = 12
};
/* Construction of sparse 3-level tables. */
@@ -8004,6 +8023,7 @@ output_gbp_test (const char *filename)
CASE (GBP_T)
CASE (GBP_LV)
CASE (GBP_LVT)
+ CASE (GBP_RI)
#undef CASE
default:
abort ();
@@ -8201,6 +8221,7 @@ fill_org_gbp (const char *graphemebreakproperty_filename)
PROP ("T", GBP_T)
PROP ("LV", GBP_LV)
PROP ("LVT", GBP_LVT)
+ PROP ("Regional_Indicator", GBP_RI)
#undef PROP
{
fprintf (stderr, "unknown property value '%s' in %s:%d\n", propname,
diff --git a/lib/unigbrk.in.h b/lib/unigbrk.in.h
index 8335e5a..a708a8c 100644
--- a/lib/unigbrk.in.h
+++ b/lib/unigbrk.in.h
@@ -51,7 +51,8 @@ enum
GBP_V = 8,
GBP_T = 9,
GBP_LV = 10,
- GBP_LVT = 11
+ GBP_LVT = 11,
+ GBP_RI = 12
};
/* Return the Grapheme_Cluster_Break property of a Unicode character. */
diff --git a/lib/unigbrk/uc-is-grapheme-break.c
b/lib/unigbrk/uc-is-grapheme-break.c
index 0e61e79..7d1759c 100644
--- a/lib/unigbrk/uc-is-grapheme-break.c
+++ b/lib/unigbrk/uc-is-grapheme-break.c
@@ -47,6 +47,9 @@
/* GB8 */ \
((A) == GBP_LVT || (A) == GBP_T) && (B) == GBP_T ? false : \
\
+ /* GB8a */ \
+ (A) == GBP_RI && (B) == GBP_RI ? false : \
+ \
/* GB9 */ \
(B) == GBP_EXTEND ? false : \
\
@@ -71,9 +74,10 @@
| (UC_IS_GRAPHEME_BREAK(A, GBP_V) << GBP_V) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_T) << GBP_T) \
| (UC_IS_GRAPHEME_BREAK(A, GBP_LV) << GBP_LV) \
- | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT))
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_LVT) << GBP_LVT) \
+ | (UC_IS_GRAPHEME_BREAK(A, GBP_RI) << GBP_RI))
-static const unsigned short int gb_table[12] =
+static const unsigned short int gb_table[13] =
{
UC_GRAPHEME_BREAKS_FOR(0), /* GBP_OTHER */
UC_GRAPHEME_BREAKS_FOR(1), /* GBP_CR */
@@ -87,6 +91,7 @@ static const unsigned short int gb_table[12] =
UC_GRAPHEME_BREAKS_FOR(9), /* GBP_T */
UC_GRAPHEME_BREAKS_FOR(10), /* GBP_LV */
UC_GRAPHEME_BREAKS_FOR(11), /* GBP_LVT */
+ UC_GRAPHEME_BREAKS_FOR(12), /* GBP_RI */
};
bool
diff --git a/lib/unilbrk/lbrktables.c b/lib/unilbrk/lbrktables.c
index d60321d..f4a55a3 100644
--- a/lib/unilbrk/lbrktables.c
+++ b/lib/unilbrk/lbrktables.c
@@ -23,36 +23,37 @@
/* Define unilbrkprop, table of line breaking properties. */
#include "unilbrk/lbrkprop2.h"
-const unsigned char unilbrk_table[26][26] =
+const unsigned char unilbrk_table[27][27] =
{
/* after */
- /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID
JL JV JT HL */
-/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, },
-/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, },
-/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, },
-/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, },
-/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, },
-/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, },
-/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D,
D, D, D, D, },
-/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D,
D, D, D, I, },
-/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, },
-/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, },
-/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, },
-/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
P, P, P, P, },
-/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, },
-/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, },
-/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D,
D, D, D, I, },
-/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, },
-/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I,
I, I, I, I, },
-/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, },
-/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, },
-/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, I, I, D, },
-/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, I, D, },
-/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, D, D, },
-/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D,
I, I, D, D, },
-/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, I, I, D, },
-/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, I, D, },
-/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, },
+ /* WJ GL B2 BA BB HY CL CP EX IN NS OP QU IS NU PO PR SY AL H2 H3 ID
JL JV JT HL RI */
+/* WJ */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, I, },
+/* GL */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, I, },
+/* B2 */ { P, I, P, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* BA */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* BB */ { P, I, I, I, I, I, P, P, P, I, I, I, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, I, },
+/* HY */ { P, D, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* CL */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, D, I, I, P, D, D, D, D,
D, D, D, D, D, },
+/* CP */ { P, I, D, I, D, I, P, P, P, D, P, D, I, P, I, I, I, P, I, D, D, D,
D, D, D, I, D, },
+/* EX */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* IN */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* NS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P,
P, P, P, P, P, },
+/* QU */ { P, I, I, I, I, I, P, P, P, I, I, P, I, P, I, I, I, P, I, I, I, I,
I, I, I, I, I, },
+/* IS */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* NU */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, I, I, P, I, D, D, D,
D, D, D, I, D, },
+/* PO */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, D, },
+/* PR */ { P, I, D, I, D, I, P, P, P, D, I, I, I, P, I, D, D, P, I, I, I, I,
I, I, I, I, D, },
+/* SY */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, I, D, D, P, D, D, D, D,
D, D, D, D, D, },
+/* AL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, D, },
+/* H2 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, I, I, D, D, },
+/* H3 */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, I, D, D, },
+/* ID */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, D, D, D, },
+/* JL */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, I, I, D,
I, I, D, D, D, },
+/* JV */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, I, I, D, D, },
+/* JT */ { P, I, D, I, D, I, P, P, P, I, I, D, I, P, D, I, D, P, D, D, D, D,
D, D, I, D, D, },
+/* HL */ { P, I, D, I, D, I, P, P, P, I, I, I, I, P, I, D, D, P, I, D, D, D,
D, D, D, I, D, },
+/* RI */ { P, I, D, I, D, I, P, P, P, D, I, D, I, P, D, D, D, P, D, D, D, D,
D, D, I, D, I, },
/* "" */
/* before */
};
diff --git a/lib/unilbrk/lbrktables.h b/lib/unilbrk/lbrktables.h
index 95bb502..9c76ad7 100644
--- a/lib/unilbrk/lbrktables.h
+++ b/lib/unilbrk/lbrktables.h
@@ -21,22 +21,22 @@
enum
{
- /* Values >= 26 are resolved at run time. */
- LBP_BK = 26, /* mandatory break */
+ /* Values >= 27 are resolved at run time. */
+ LBP_BK = 27, /* mandatory break */
/*LBP_CR, carriage return - not used here because it's a DOSism */
/*LBP_LF, line feed - not used here because it's a DOSism */
- LBP_CM = 27, /* attached characters and combining marks */
+ LBP_CM = 28, /* attached characters and combining marks */
/*LBP_NL, next line - not used here because it's equivalent to LBP_BK
*/
/*LBP_SG, surrogates - not used here because they are not characters */
LBP_WJ = 0, /* word joiner */
- LBP_ZW = 28, /* zero width space */
+ LBP_ZW = 29, /* zero width space */
LBP_GL = 1, /* non-breaking (glue) */
- LBP_SP = 29, /* space */
+ LBP_SP = 30, /* space */
LBP_B2 = 2, /* break opportunity before and after */
LBP_BA = 3, /* break opportunity after */
LBP_BB = 4, /* break opportunity before */
LBP_HY = 5, /* hyphen */
- LBP_CB = 30, /* contingent break opportunity */
+ LBP_CB = 31, /* contingent break opportunity */
LBP_CL = 6, /* closing punctuation */
LBP_CP = 7, /* closing parenthesis */
LBP_EX = 8, /* exclamation/interrogation */
@@ -49,7 +49,7 @@ enum
LBP_PO = 15, /* postfix (numeric) */
LBP_PR = 16, /* prefix (numeric) */
LBP_SY = 17, /* symbols allowing breaks */
- LBP_AI = 31, /* ambiguous (alphabetic or ideograph) */
+ LBP_AI = 32, /* ambiguous (alphabetic or ideograph) */
LBP_AL = 18, /* ordinary alphabetic and symbol characters */
/*LBP_CJ, conditional Japanese starters, resolved to NS */
LBP_H2 = 19, /* Hangul LV syllable */
@@ -59,8 +59,9 @@ enum
LBP_JL = 22, /* Hangul L Jamo */
LBP_JV = 23, /* Hangul V Jamo */
LBP_JT = 24, /* Hangul T Jamo */
- LBP_SA = 32, /* complex context (South East Asian) */
- LBP_XX = 33 /* unknown */
+ LBP_RI = 26, /* regional indicator */
+ LBP_SA = 33, /* complex context (South East Asian) */
+ LBP_XX = 34 /* unknown */
};
#include "lbrkprop1.h"
@@ -91,7 +92,7 @@ unilbrkprop_lookup (ucs4_t uc)
#define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */
-extern const unsigned char unilbrk_table[26][26];
+extern const unsigned char unilbrk_table[27][27];
/* We don't support line breaking of complex-context dependent characters
(Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
diff --git a/lib/uniwbrk.in.h b/lib/uniwbrk.in.h
index ab4b532..c272d48 100644
--- a/lib/uniwbrk.in.h
+++ b/lib/uniwbrk.in.h
@@ -49,7 +49,8 @@ enum
WBP_MIDLETTER = 4,
WBP_MIDNUM = 5,
WBP_NUMERIC = 6,
- WBP_EXTENDNUMLET = 7
+ WBP_EXTENDNUMLET = 7,
+ WBP_RI = 13
};
/* Return the Word_Break property of a Unicode character. */
diff --git a/lib/uniwbrk/u-wordbreaks.h b/lib/uniwbrk/u-wordbreaks.h
index 33ca7eb..04d2738 100644
--- a/lib/uniwbrk/u-wordbreaks.h
+++ b/lib/uniwbrk/u-wordbreaks.h
@@ -55,16 +55,12 @@ FUNC (const UNIT *s, size_t n, char *p)
if (last_char_prop == WBP_CR && prop == WBP_LF)
/* *p = 0 */;
/* Break before and after newlines. */
- else if (last_char_prop >= WBP_NEWLINE
- /* same as:
- last_char_prop == WBP_CR
- || last_char_prop == WBP_LF
- || last_char_prop == WBP_NEWLINE */
- || prop >= WBP_NEWLINE
- /* same as:
- prop == WBP_CR
- || prop == WBP_LF
- || prop == WBP_NEWLINE */)
+ else if ((last_char_prop == WBP_CR
+ || last_char_prop == WBP_LF
+ || last_char_prop == WBP_NEWLINE)
+ || (prop == WBP_CR
+ || prop == WBP_LF
+ || prop == WBP_NEWLINE))
*p = 1;
/* Ignore Format and Extend characters. */
else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
@@ -85,6 +81,7 @@ FUNC (const UNIT *s, size_t n, char *p)
(ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ Regional_Indicator × Regional_Indicator (WB13c)
*/
/* No break across certain punctuation. Also, disable word
breaks that were recognized earlier (due to lookahead of
@@ -101,10 +98,27 @@ FUNC (const UNIT *s, size_t n, char *p)
*last_compchar_ptr = 0;
/* *p = 0; */
}
+ /* Break after Format and Extend characters. */
+ else if (last_compchar_prop == WBP_EXTEND
+ || last_compchar_prop == WBP_FORMAT)
+ *p = 1;
else
{
+ /* Normalize property value to table index,
+ skipping 5 properties: WBP_EXTEND,
+ WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
+ WBP_LF. */
+ int last_compchar_prop_index = last_compchar_prop;
+ int prop_index = prop;
+
+ if (last_compchar_prop_index >= WBP_EXTEND)
+ last_compchar_prop_index -= 5;
+
+ if (prop_index >= WBP_EXTEND)
+ prop_index -= 5;
+
/* Perform a single table lookup. */
- if (uniwbrk_table[last_compchar_prop][prop])
+ if (uniwbrk_table[last_compchar_prop_index][prop_index])
*p = 1;
/* else *p = 0; */
}
diff --git a/lib/uniwbrk/wbrktable.c b/lib/uniwbrk/wbrktable.c
index 7cbe4d6..04bd0e5 100644
--- a/lib/uniwbrk/wbrktable.c
+++ b/lib/uniwbrk/wbrktable.c
@@ -32,21 +32,21 @@
(ALetter | Numeric | Katakana) × ExtendNumLet (WB13a)
ExtendNumLet × ExtendNumLet (WB13a)
ExtendNumLet × (ALetter | Numeric | Katakana) (WB13b)
+ Regional_Indicator × Regional_Indicator (WB13c)
*/
-const unsigned char uniwbrk_table[10][8] =
+const unsigned char uniwbrk_table[9][9] =
{ /* current: OTHER MIDNUMLET NUMERIC */
/* KATAKANA MIDLETTER EXTENDNUMLET */
- /* ALETTER MIDNUM */
+ /* ALETTER MIDNUM RI */
/* last */
- /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0 },
- /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0 },
- /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0 },
- /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0 },
- /* WBP_EXTEND */ { 1, 1, 1, 1, 1, 1, 1, 1 },
- /* WBP_FORMAT */ { 1, 1, 1, 1, 1, 1, 1, 1 }
+ /* WBP_OTHER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1
},
+ /* WBP_KATAKANA */ { 1, 0, 1, 1, 1, 1, 1, 0, 1
},
+ /* WBP_ALETTER */ { 1, 1, 0, 1, 1, 1, 0, 0, 1
},
+ /* WBP_MIDNUMLET */ { 1, 1, 1, 1, 1, 1, 1, 1, 1
},
+ /* WBP_MIDLETTER */ { 1, 1, 1, 1, 1, 1, 1, 1, 1
},
+ /* WBP_MIDNUM */ { 1, 1, 1, 1, 1, 1, 1, 1, 1
},
+ /* WBP_NUMERIC */ { 1, 1, 0, 1, 1, 1, 0, 0, 1
},
+ /* WBP_EXTENDNUMLET */ { 1, 0, 0, 1, 1, 1, 0, 0, 1
},
+ /* WBP_RI */ { 1, 1, 1, 1, 1, 1, 1, 1, 0 }
};
diff --git a/lib/uniwbrk/wbrktable.h b/lib/uniwbrk/wbrktable.h
index 1b48adf..50b7823 100644
--- a/lib/uniwbrk/wbrktable.h
+++ b/lib/uniwbrk/wbrktable.h
@@ -15,4 +15,4 @@
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
-extern const unsigned char uniwbrk_table[10][8];
+extern const unsigned char uniwbrk_table[9][9];
diff --git a/tests/unigbrk/test-uc-gbrk-prop.c
b/tests/unigbrk/test-uc-gbrk-prop.c
index 1c71280..4bfbdba 100644
--- a/tests/unigbrk/test-uc-gbrk-prop.c
+++ b/tests/unigbrk/test-uc-gbrk-prop.c
@@ -50,6 +50,7 @@ graphemebreakproperty_to_string (int gbp)
CASE(T)
CASE(LV)
CASE(LVT)
+ CASE(RI)
}
abort ();
}
diff --git a/tests/unigbrk/test-uc-is-grapheme-break.c
b/tests/unigbrk/test-uc-is-grapheme-break.c
index a93f6f2..dbaf3dc 100644
--- a/tests/unigbrk/test-uc-is-grapheme-break.c
+++ b/tests/unigbrk/test-uc-is-grapheme-break.c
@@ -44,6 +44,7 @@ graphemebreakproperty_to_string (int gbp)
CASE(T)
CASE(LV)
CASE(LVT)
+ CASE(RI)
}
abort ();
}
diff --git a/tests/uniwbrk/test-uc-wordbreaks.c
b/tests/uniwbrk/test-uc-wordbreaks.c
index 736cdba..41585f7 100644
--- a/tests/uniwbrk/test-uc-wordbreaks.c
+++ b/tests/uniwbrk/test-uc-wordbreaks.c
@@ -47,6 +47,7 @@ wordbreakproperty_to_string (int wbp)
CASE(MIDNUM)
CASE(NUMERIC)
CASE(EXTENDNUMLET)
+ CASE(RI)
}
abort ();
}
--
1.9.3
- [PATCH v2 00/10] Update libunistring-related modules to Unicode 7.0.0, Daiki Ueno, 2014/10/23
- [PATCH v2 01/10] gen-uni-tables: Minor style fixes, Daiki Ueno, 2014/10/23
- [PATCH v2 04/10] uniwbrk: Ignore Extended/Format at the beginning of the line, Daiki Ueno, 2014/10/23
- [PATCH v2 02/10] gen-uni-tables: Check out-of-range values added to 3-level tables, Daiki Ueno, 2014/10/23
- [PATCH v2 05/10] uniwbrk/u32-wordbreaks-tests: Test using WordBreakTest.txt from UCD, Daiki Ueno, 2014/10/23
- [PATCH v2 06/10] uniname: Make codepoint transformation more flexible, Daiki Ueno, 2014/10/23
- [PATCH v2 03/10] unictype/joininggroup-of: Switch to 3-level table, Daiki Ueno, 2014/10/23
- [PATCH v2 07/10] Update to Unicode 6.1.0, Daiki Ueno, 2014/10/23
- [PATCH v2 08/10] Update to Unicode 6.2.0,
Daiki Ueno <=
- [PATCH v2 09/10] Update to Unicode 6.3.0, Daiki Ueno, 2014/10/23
- Re: [PATCH v2 00/10] Update libunistring-related modules to Unicode 7.0.0, Pádraig Brady, 2014/10/23