[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [RFC 18/18] Fix case-fold-search character class matching
From: |
Michal Nazarewicz |
Subject: |
bug#24603: [RFC 18/18] Fix case-fold-search character class matching |
Date: |
Tue, 4 Oct 2016 03:10:41 +0200 |
The upper and lower character classes should match any cased characters
when case-fold-search is enabled. So ‘[[:upper:]]’ should match not only
‘a’ but also ‘ł’, ‘ß’ and ‘fi’. Fix character class tests to make that happen.
* src/character.h (CHAR_BIT_TITLE): New character bit for title case
characters (such as Dz).
* src/character.c (category_char_bits): Characters in Lt category are
title case; update lookup table.
* src/regex.c (re_wctype_to_bit): When case-folding is enabled return
any-case bits pattern for RECC_LOWER and RECC_UPPER.
(regex_compile): Update re_wctype_to_bit calls (it has new argument).
(execute_charset): Simplify case-folding handling since it is now encoded
in the bits. The corig argument is no longer necessary.
(mutually_exclusive_p, re_match_2_internal): Update execute_charset calls
(it no longer has the corig argument).
* test/src/regex-tests.el (regex-tests--letter-character-classes): Fix
case-fold letter matching.
---
src/character.c | 2 +-
src/character.h | 5 +++--
src/regex.c | 53 ++++++++++++++++++++-----------------------------
test/src/regex-tests.el | 16 +++++----------
4 files changed, 30 insertions(+), 46 deletions(-)
diff --git a/src/character.c b/src/character.c
index 63f89d3..cf42f30 100644
--- a/src/character.c
+++ b/src/character.c
@@ -979,7 +979,7 @@ const unsigned char category_char_bits[] = {
[UNICODE_CATEGORY_UNKNOWN] = 0,
[UNICODE_CATEGORY_Lu] = CHAR_BIT_ALPHA_ | CHAR_BIT_UPPER,
[UNICODE_CATEGORY_Ll] = CHAR_BIT_ALPHA_ | CHAR_BIT_LOWER,
- [UNICODE_CATEGORY_Lt] = CHAR_BIT_ALPHA_,
+ [UNICODE_CATEGORY_Lt] = CHAR_BIT_ALPHA_ | CHAR_BIT_TITLE,
[UNICODE_CATEGORY_Lm] = CHAR_BIT_ALPHA_,
[UNICODE_CATEGORY_Lo] = CHAR_BIT_ALPHA_,
[UNICODE_CATEGORY_Mn] = CHAR_BIT_ALPHA_,
diff --git a/src/character.h b/src/character.h
index 6dc95ad..f2849e5 100644
--- a/src/character.h
+++ b/src/character.h
@@ -665,8 +665,9 @@ extern unicode_category_t char_unicode_category (int);
#define CHAR_BIT_ALPHA (1 << 1)
#define CHAR_BIT_UPPER (1 << 2)
#define CHAR_BIT_LOWER (1 << 3)
-#define CHAR_BIT_GRAPH (1 << 4)
-#define CHAR_BIT_PRINT (1 << 5)
+#define CHAR_BIT_TITLE (1 << 4)
+#define CHAR_BIT_GRAPH (1 << 5)
+#define CHAR_BIT_PRINT (1 << 6)
/* Map from Unicode general category to character classes the character is in.
*
diff --git a/src/regex.c b/src/regex.c
index bfd04a1..aa8c6ef 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1794,6 +1794,7 @@ struct range_table_work_area
# define BIT_ALPHA CHAR_BIT_ALPHA
# define BIT_UPPER CHAR_BIT_UPPER
# define BIT_LOWER CHAR_BIT_LOWER
+# define BIT_TITLE CHAR_BIT_TITLE
# define BIT_GRAPH CHAR_BIT_GRAPH
# define BIT_PRINT CHAR_BIT_PRINT
#else
@@ -1801,8 +1802,9 @@ struct range_table_work_area
# define BIT_ALPHA (1 << 1)
# define BIT_UPPER (1 << 2)
# define BIT_LOWER (1 << 3)
-# define BIT_GRAPH (1 << 4)
-# define BIT_PRINT (1 << 5)
+# define BIT_TITLE (1 << 4)
+# define BIT_GRAPH (1 << 5)
+# define BIT_PRINT (1 << 6)
#endif
#define BIT_WORD (BIT_PRINT << 1)
#define BIT_PUNCT (BIT_PRINT << 2)
@@ -2067,7 +2069,7 @@ re_iswctype (int ch, re_wctype_t cc)
/* Return a bit-pattern to use in the range-table bits to match multibyte
chars of class CC. */
static int
-re_wctype_to_bit (re_wctype_t cc)
+re_wctype_to_bit (re_wctype_t cc, bool case_fold)
{
switch (cc)
{
@@ -2076,8 +2078,10 @@ re_wctype_to_bit (re_wctype_t cc)
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
case RECC_WORD: return BIT_WORD;
- case RECC_LOWER: return BIT_LOWER;
- case RECC_UPPER: return BIT_UPPER;
+ case RECC_LOWER:
+ return case_fold ? BIT_LOWER | BIT_UPPER | BIT_TITLE : BIT_LOWER;
+ case RECC_UPPER:
+ return case_fold ? BIT_LOWER | BIT_UPPER | BIT_TITLE : BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE;
case RECC_GRAPH: return BIT_GRAPH;
@@ -2886,7 +2890,8 @@ regex_compile (const_re_char *pattern, size_t size,
SET_LIST_BIT (c1);
}
SET_RANGE_TABLE_WORK_AREA_BIT
- (range_table_work, re_wctype_to_bit (cc));
+ (range_table_work,
+ re_wctype_to_bit (cc, RE_TRANSLATE_P (translate)));
#endif /* emacs */
/* In most cases the matching rule for char classes only
uses the syntax table for multibyte chars, so that the
@@ -4633,11 +4638,10 @@ skip_noops (const_re_char *p, const_re_char *pend)
/* Test if C matches charset op. *PP points to the charset or charset_not
opcode. When the function finishes, *PP will be advanced past that opcode.
- C is character to test (possibly after translations) and CORIG is original
- character (i.e. without any translations). UNIBYTE denotes whether c is
- unibyte or multibyte character. */
+ C is character to test. UNIBYTE denotes whether c is unibyte or multibyte
+ character. */
static bool
-execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
+execute_charset (const_re_char **pp, unsigned c, bool unibyte)
{
re_char *p = *pp, *rtp = NULL;
bool not = (re_opcode_t) *p == charset_not;
@@ -4675,24 +4679,9 @@ execute_charset (const_re_char **pp, unsigned c,
unsigned corig, bool unibyte)
IS_REAL_ASCII (c), we can ignore that. */
bits = class_bits & (BIT_ALNUM | BIT_ALPHA | BIT_UPPER | BIT_LOWER |
- BIT_GRAPH | BIT_PRINT);
- if (bits)
- {
- int char_bits = category_char_bits[char_unicode_category (c)];
- if (bits & char_bits)
- return !not;
-
- /* Handle case folding. */
- if (corig != c)
- {
- if ((bits & BIT_UPPER) && (char_bits & BIT_LOWER) &&
- c == downcase (corig))
- return !not;
- if ((bits & BIT_LOWER) && (char_bits & BIT_UPPER) &&
- c == upcase (corig))
- return !not;
- }
- }
+ BIT_TITLE | BIT_GRAPH | BIT_PRINT);
+ if (bits && (category_char_bits[char_unicode_category (c)] & bits))
+ return !not;
if (class_bits & (BIT_SPACE | BIT_WORD | BIT_PUNCT))
{
@@ -4772,7 +4761,7 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp,
const_re_char *p1,
else if ((re_opcode_t) *p1 == charset
|| (re_opcode_t) *p1 == charset_not)
{
- if (!execute_charset (&p1, c, c, !multibyte || IS_REAL_ASCII (c)))
+ if (!execute_charset (&p1, c, !multibyte || IS_REAL_ASCII (c)))
{
DEBUG_PRINT (" No match => fast loop.\n");
return 1;
@@ -5482,7 +5471,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
const_re_char *string1,
case charset:
case charset_not:
{
- register unsigned int c, corig;
+ register unsigned int c;
int len;
/* Whether matching against a unibyte character. */
@@ -5492,7 +5481,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
const_re_char *string1,
(re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
PREFETCH ();
- corig = c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
+ c = RE_STRING_CHAR_AND_LENGTH (d, len, target_multibyte);
if (target_multibyte)
{
int c1;
@@ -5524,7 +5513,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
const_re_char *string1,
}
p -= 1;
- if (!execute_charset (&p, c, corig, unibyte_char))
+ if (!execute_charset (&p, c, unibyte_char))
goto fail;
d += len;
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index 7617823..4da9ab3 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -127,17 +127,11 @@ regex--test-cc
(?ẞ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
(?DZ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
(?a . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
- ;; FIXME: Should match upper when case-fold case
- ;; (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
- ;; (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
- ;; (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
- ;; (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
- ;; (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper
lower")
- (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
- (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+ (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+ (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
(?Dz . "Lt | alnum alpha | case-fold: alnum alpha upper lower")
(?ʰ . "Lm | alnum alpha | case-fold: alnum alpha")
(?º . "Lo | alnum alpha | case-fold: alnum alpha")))))))
--
2.8.0.rc3.226.g39d4020
- bug#24603: [RFC 07/18] Split up casify_region function., (continued)
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/03
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/04
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Michal Nazarewicz, 2016/10/06
- bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data, Eli Zaretskii, 2016/10/07
bug#24603: [RFC 18/18] Fix case-fold-search character class matching,
Michal Nazarewicz <=
bug#24603: [RFC 17/18] Optimise character class matching in regexes, Michal Nazarewicz, 2016/10/03
bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings, Michal Nazarewicz, 2016/10/03
bug#24603: [RFC 08/18] Support casing characters which map into multiple code points, Michal Nazarewicz, 2016/10/03
bug#24603: [PATCH 0/3] Case table updates, Michal Nazarewicz, 2016/10/17