[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 11/17] dfa: rewrite handling of multibyte case folding
From: |
Paolo Bonzini |
Subject: |
[PATCH 11/17] dfa: rewrite handling of multibyte case folding |
Date: |
Fri, 12 Mar 2010 18:49:12 +0100 |
Let dfacomp do the folding to lowercase of multibyte input strings,
and remove it from grep.c. Input strings to kwset.c are still folded
outside kwset.c; this is left as a future cleanup since it is not needed
to fix bugs. While doing this I also rewrote the lexing of multibyte
characters in dfa.c.
* .x-sc_cast_of_argument_to_free: Remove.
* src/dfa.c (setbit_case_fold): Assert it is only called for SBCS.
(wctok): New.
(cur_mb_index, update_mb_len_index): Remove.
(FETCH): Do not call it.
(lex): Call fetch_wc in the main loop for MB_CUR_MAX > 1. Rewrite
normal_char label.
(atom): Handle WCHAR instead of treating multibyte characters specially.
(dfacomp): Remove case_fold special casing.
(addtok): Extract part to...
(addtok_mb): ... this new function.
* src/dfa.h (WCHAR): New.
* src/grep.c (mb_icase_keys): Remove.
(main): Do not call it.
* src/search.c (kwsinit): Init transition table only for MB_CUR_MAX == 1.
(mbtolower): New.
(kwsincr_case): New.
(kwsmusts): Call it instead of kwsincr.
(check_multibyte_string): Remove.
(check_multibyte_string_no_icase): Rename to check_multibyte_string.
(GEAcompile, EGexecute, Fcompile): Use mbtolower instead of the old
check_multibyte_string.
* tests/Makefile.am (TESTS): Add case-fold-backslash-w.
* tests/foad1.sh: Enable fixed tests.
* tests/case-fold-backslash-w: New.
---
.x-sc_cast_of_argument_to_free | 1 -
src/dfa.c | 231 +++++++++++++++++-----------------------
src/dfa.h | 3 +
src/grep.c | 68 ------------
src/search.c | 190 ++++++++++++++++++++-------------
tests/Makefile.am | 1 +
tests/case-fold-backslash-w | 14 +++
tests/foad1.sh | 10 +-
8 files changed, 233 insertions(+), 285 deletions(-)
delete mode 100644 .x-sc_cast_of_argument_to_free
create mode 100755 tests/case-fold-backslash-w
diff --git a/.x-sc_cast_of_argument_to_free b/.x-sc_cast_of_argument_to_free
deleted file mode 100644
index 3f02e3d..0000000
--- a/.x-sc_cast_of_argument_to_free
+++ /dev/null
@@ -1 +0,0 @@
-^src/search\.c$
diff --git a/src/dfa.c b/src/dfa.c
index 120df76..a225312 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -246,6 +246,7 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
static void
setbit_case_fold (unsigned b, charclass c)
{
+ assert (MB_CUR_MAX == 1);
setbit (b, c);
if (case_fold)
{
@@ -273,17 +274,10 @@ static int hard_LC_COLLATE; /* Nonzero if
LC_COLLATE is hard. */
#ifdef MBS_SUPPORT
/* These variables are used only if (MB_CUR_MAX > 1). */
static mbstate_t mbs; /* Mbstate for mbrlen(). */
-static int cur_mb_len; /* Byte length of the current scanning
+static int cur_mb_len; /* Length of the multibyte representation of
+ wctok. */
+static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
-static int cur_mb_index; /* Byte index of the current scanning multibyte
- character.
-
- single byte character : cur_mb_index = 0
- multibyte character
- 1st byte : cur_mb_index = 1
- 2nd byte : cur_mb_index = 2
- ...
- nth byte : cur_mb_index = n */
static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
Each element store the amount of remain
byte of corresponding multibyte character
@@ -305,38 +299,6 @@ static unsigned char const *buf_end; /* reference to
end in dfaexec(). */
#endif /* MBS_SUPPORT */
#ifdef MBS_SUPPORT
-/* This function update cur_mb_len, and cur_mb_index.
- p points current lexptr, len is the remaining buffer length. */
-static void
-update_mb_len_index (char const *p, int len)
-{
- /* If last character is a part of a multibyte character,
- we update cur_mb_index. */
- if (cur_mb_index)
- cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
- : cur_mb_index + 1;
-
- /* If last character is a single byte character, or the
- last portion of a multibyte character, we check whether
- next character is a multibyte character or not. */
- if (! cur_mb_index)
- {
- cur_mb_len = mbrlen(p, len, &mbs);
- if (cur_mb_len > 1)
- /* It is a multibyte character.
- cur_mb_len was already set by mbrlen(). */
- cur_mb_index = 1;
- else if (cur_mb_len < 1)
- /* Invalid sequence. We treat it as a single byte character.
- cur_mb_index is aleady 0. */
- cur_mb_len = 1;
- /* Otherwise, cur_mb_len == 1, it is a single byte character.
- cur_mb_index is aleady 0. */
- }
-}
-#endif /* MBS_SUPPORT */
-
-#ifdef MBS_SUPPORT
/* Note that characters become unsigned here. */
# define FETCH(c, eoferr) \
do { \
@@ -347,8 +309,6 @@ update_mb_len_index (char const *p, int len)
else \
return lasttok = END; \
} \
- if (MB_CUR_MAX > 1) \
- update_mb_len_index(lexptr, lexleft); \
(c) = (unsigned char) *lexptr++; \
--lexleft; \
} while(0)
@@ -371,7 +331,7 @@ fetch_wc (char const *eoferr)
if (cur_mb_len <= 0)
{
cur_mb_len = 1;
- wc = *lexptr;
+ wc = (unsigned char) *lexptr;
}
lexptr += cur_mb_len;
lexleft -= cur_mb_len;
@@ -582,24 +542,20 @@ parse_bracket_exp_mb (void)
}
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
range_sts_al, work_mbc->nranges + 1);
- work_mbc->range_sts[work_mbc->nranges] = (wchar_t)wc;
+ work_mbc->range_sts[work_mbc->nranges] =
+ (wchar_t) (case_fold ? towlower(wc) : wc);
REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
range_ends_al, work_mbc->nranges + 1);
- work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+ work_mbc->range_ends[work_mbc->nranges++] =
+ (wchar_t) (case_fold ? towlower(wc2) : wc2);
}
else if (wc != WEOF)
/* build normal characters. */
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
- if (case_fold && (iswlower(wc) || iswupper(wc)))
- {
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] =
- (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
- }
+ work_mbc->chars[work_mbc->nchars++] =
+ (wchar_t) (case_fold ? towlower(wc) : wc);
}
}
while ((wc = wc1) != L']');
@@ -683,15 +639,20 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH(c, 0);
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && cur_mb_index)
- /* If this is a part of a multi-byte character, we must treat
- this byte data as a normal character.
- e.g. In case of SJIS encoding, some character contains '\',
- but they must not be backslash. */
- goto normal_char;
+ if (MB_CUR_MAX > 1)
+ {
+ wint_t wi = fetch_wc (NULL);
+ if (wi == WEOF)
+ return lasttok = EOF;
+ wctok = wi, c = wctob (wi);
+ if ((int)c == EOF)
+ goto normal_char;
+ }
+ else
#endif /* MBS_SUPPORT */
+ FETCH(c, NULL);
+
switch (c)
{
case '\\':
@@ -1063,12 +1024,32 @@ lex (void)
default:
normal_char:
laststart = 0;
- if (case_fold && ISALPHA(c))
- {
- zeroset(ccl);
- setbit_case_fold (c, ccl);
- return lasttok = CSET + charclass_index(ccl);
- }
+ if (case_fold)
+ {
+#ifdef MBS_SUPPORT
+ /* For multibyte character sets, folding is done before dfaexec
+ here so we do not need to make a CSET. */
+ if (MB_CUR_MAX > 1)
+ {
+ wctok = towlower (wctok);
+ c = wctob (wctok);
+ }
+ else
+#endif
+ {
+ if (ISALPHA(c))
+ {
+ zeroset(ccl);
+ setbit_case_fold (c, ccl);
+ return lasttok = CSET + charclass_index(ccl);
+ }
+ }
+ }
+
+#ifdef MBS_SUPPORT
+ if ((int)c == EOF)
+ return lasttok = WCHAR;
+#endif
return lasttok = c;
}
}
@@ -1088,29 +1069,18 @@ static int depth; /* Current depth of a
hypothetical stack
required of the real stack later on in
dfaanalyze(). */
-/* Add the given token to the parse tree, maintaining the depth count and
- updating the maximum depth if necessary. */
static void
-addtok (token t)
+addtok_mb (token t, int mbprop)
{
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
dfa->tindex);
- /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
- if (t == MBCSET)
- dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
- else if (t < NOTCHAR)
- dfa->multibyte_prop[dfa->tindex]
- = (cur_mb_len == 1)? 3 /* single-byte char */
- : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
- + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
- else
- /* It may be unnecessary, but it is safer to treat other
- symbols as single byte characters. */
- dfa->multibyte_prop[dfa->tindex] = 3;
+ dfa->multibyte_prop[dfa->tindex] = mbprop;
}
+#else
+ (void) mbprop;
#endif
REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
@@ -1139,6 +1109,19 @@ addtok (token t)
dfa->depth = depth;
}
+/* Add the given token to the parse tree, maintaining the depth count and
+ updating the maximum depth if necessary. */
+static void
+addtok (token t)
+{
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && t == MBCSET)
+ addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ else
+#endif
+ addtok_mb (t, 3);
+}
+
/* The grammar understood by the parser is as follows.
regexp:
@@ -1177,6 +1160,32 @@ addtok (token t)
static void
atom (void)
{
+#ifdef MBS_SUPPORT
+ /* We treat a multibyte character as a single atom, so that DFA
+ can treat a multibyte character as a single expression.
+
+ e.g. We construct following tree from "<mb1><mb2>".
+ <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
+ <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
+ */
+ if (tok == WCHAR)
+ {
+ unsigned char buf[16];
+ mbstate_t s;
+ int i;
+ memset (&s, 0, sizeof(s));
+ cur_mb_len = wcrtomb ((char *) buf, wctok, &s);
+ addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < cur_mb_len; i++)
+ {
+ addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
+ addtok(CAT);
+ }
+ tok = lex();
+ return;
+ }
+#endif /* MBS_SUPPORT */
+
if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
#ifdef MBS_SUPPORT
@@ -1186,24 +1195,6 @@ atom (void)
{
addtok(tok);
tok = lex();
-#ifdef MBS_SUPPORT
- /* We treat a multibyte character as a single atom, so that DFA
- can treat a multibyte character as a single expression.
-
- e.g. We construct following tree from "<mb1><mb2>".
- <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
- <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
- */
- if (MB_CUR_MAX > 1)
- {
- while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
- {
- addtok(tok);
- addtok(CAT);
- tok = lex();
- }
- }
-#endif /* MBS_SUPPORT */
}
else if (tok == LPAREN)
{
@@ -1335,7 +1326,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
- cur_mb_index = 0;
cur_mb_len = 0;
memset(&mbs, 0, sizeof(mbstate_t));
}
@@ -2930,39 +2920,10 @@ dfainit (struct dfa *d)
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
- if (case_fold && len) /* dummy folding in service of dfamust() */
- {
- char *lcopy;
- int i;
-
- lcopy = malloc(len);
- if (!lcopy)
- dfaerror(_("memory exhausted"));
-
- /* This is a kludge. */
- case_fold = 0;
- for (i = 0; i < len; ++i)
- if (ISUPPER ((unsigned char) s[i]))
- lcopy[i] = tolower ((unsigned char) s[i]);
- else
- lcopy[i] = s[i];
-
- dfainit(d);
- dfaparse(lcopy, len, d);
- free(lcopy);
- dfamust(d);
- d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
- case_fold = 1;
- dfaparse(s, len, d);
- dfaanalyze(d, searchflag);
- }
- else
- {
- dfainit(d);
- dfaparse(s, len, d);
- dfamust(d);
- dfaanalyze(d, searchflag);
- }
+ dfainit(d);
+ dfaparse(s, len, d);
+ dfamust(d);
+ dfaanalyze(d, searchflag);
}
/* Free the storage held by the components of a dfa. */
diff --git a/src/dfa.h b/src/dfa.h
index 685ce94..4ca55f0 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -129,6 +129,9 @@ typedef enum
MBCSET, /* MBCSET is similar to CSET, but for
multibyte characters. */
+
+ WCHAR, /* Only returned by lex. wctok contains
+ the wide character representation. */
#endif /* MBS_SUPPORT */
CSET /* CSET and (and any value greater) is a
diff --git a/src/grep.c b/src/grep.c
index c1c6152..f1d341a 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1781,69 +1781,6 @@ parse_grep_colors (void)
"at remaining substring \"%s\"."), p, q);
}
-/* mb_icase_keys() is called by main() to convert its "keys" string with
- strlen() "len" to lowercase if match_icase is true. Pointers are used
- to implement in-out call-by-reference parameters. */
-#ifdef MBS_SUPPORT
-static void
-mb_icase_keys (char **keys, size_t *len)
-{
- wchar_t wc;
- mbstate_t sti, stj; /* i for input/old, j for output/new. */
- size_t i, j, li, lj; /* l for total string length (minus '\0'). */
- char *ki, *kj; /* k for keys. */
- int mcm;
-
- if ((mcm = MB_CUR_MAX) == 1)
- return;
-
- li = *len;
- ki = *keys;
- /* We use a new buffer because some multi-octet characters change
- length through a lower-case conversion. For example:
- len(U+0049)=1 --> len(U+0131)=2 under tr_TR.UTF-8
- len(U+0130)=2 --> len(U+0069)=1 under en_US.UTF-8
- len(U+2126)=3 --> len(U+03C9)=2 under en_US.UTF-8
- len(U+212A)=3 --> len(U+006B)=1 under en_US.UTF-8
- len(U+212B)=3 --> len(U+00E5)=2 under en_US.UTF-8 */
- lj = li + mcm;
- kj = xmalloc(lj + 1);
-
- memset(&sti, 0, sizeof(mbstate_t));
- memset(&stj, 0, sizeof(mbstate_t));
- for (i = j = 0; i < li ;)
- {
- size_t mbclen;
- mbclen = mbrtowc(&wc, ki + i, li - i, &sti);
- if (lj < j + mcm)
- {
- lj += mcm;
- kj = xrealloc(kj, lj + 1);
- }
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multi-octet character.
- We treat it as a single-octet character. */
- kj[j++] = ki[i++];
- }
- else
- {
- /* Doing towupper() before towlower() helps a few hairy cases and is
- not too costly since this is the PATTERN and is done only once. */
- wc = towupper((wint_t)wc);
- wc = towlower((wint_t)wc);
- j += wcrtomb(kj + j, wc, &stj);
- i += mbclen;
- }
- }
- kj[j] = '\0';
-
- free(ki);
- *keys = kj;
- *len = j;
-}
-#endif /* MBS_SUPPORT */
-
int
main (int argc, char **argv)
{
@@ -2261,11 +2198,6 @@ There is NO WARRANTY, to the extent permitted by
law.\n"),
set_limits();
-#ifdef MBS_SUPPORT
- if (match_icase)
- mb_icase_keys (&keys, &keycc);
-#endif /* MBS_SUPPORT */
-
compile(keys, keycc);
free (keys);
diff --git a/src/search.c b/src/search.c
index d9b4462..56f7068 100644
--- a/src/search.c
+++ b/src/search.c
@@ -59,14 +59,82 @@ kwsinit (void)
static char trans[NCHAR];
int i;
- if (match_icase)
- for (i = 0; i < NCHAR; ++i)
- trans[i] = TOLOWER (i);
+ if (match_icase && MB_CUR_MAX == 1)
+ {
+ for (i = 0; i < NCHAR; ++i)
+ trans[i] = TOLOWER (i);
+
+ kwset = kwsalloc (trans);
+ }
+ else
+ kwset = kwsalloc (NULL);
- if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
+ if (!kwset)
xalloc_die ();
}
+#ifdef MBS_SUPPORT
+/* Convert the string from BEG to N to lowercase. Overwrite N
+ with the length of the new string, and return a pointer to
+ the lowercase string. Successive calls to mbtolower will
+ rewrite the output buffer. */
+static char *
+mbtolower (const char *beg, size_t *n)
+{
+ static char *out;
+ static size_t outlen;
+ mbstate_t is, os;
+ const char *end;
+ char *p;
+ size_t free;
+
+ if (*n > outlen)
+ {
+ /* Get some additional room since we're resizing. */
+ outlen = *n * 2 + MB_CUR_MAX + 1;
+ out = xrealloc (out, outlen);
+ }
+
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ end = beg + *n;
+ p = out;
+ free = outlen - MB_CUR_MAX;
+ while (beg < end)
+ {
+ wchar_t wc;
+ size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
+ if (free < 0)
+ {
+ free += outlen;
+ outlen *= 2;
+ out = xrealloc (out, outlen);
+ }
+
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multi-octet character.
+ We treat it as a single-octet character. */
+ *p++ = *beg++;
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ }
+ else
+ {
+ beg += mbclen;
+ mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+ p += mbclen;
+ free -= mbclen;
+ }
+ }
+
+ *n = p - out;
+ *p++ = 0;
+ return out;
+}
+#endif
+
+
#ifndef FGREP_PROGRAM
/* DFA compiled regexp. */
static struct dfa dfa;
@@ -94,6 +162,22 @@ dfaerror (char const *mesg)
call the regexp matcher at all. */
static int kwset_exact_matches;
+static char const *
+kwsincr_case (const char *must)
+{
+ const char *buf;
+ size_t n;
+
+ n = strlen (must);
+#ifdef MBS_SUPPORT
+ if (match_icase && MB_CUR_MAX > 1)
+ buf = mbtolower (must, &n);
+ else
+#endif
+ buf = must;
+ return kwsincr (kwset, buf, n);
+}
+
/* If the DFA turns out to have some set of fixed strings one of
which must occur in the match, then we build a kwset matcher
to find those strings, and thus quickly filter out impossible
@@ -115,7 +199,7 @@ kwsmusts (void)
if (!dm->exact)
continue;
++kwset_exact_matches;
- if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
+ if ((err = kwsincr_case (dm->must)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
/* Now, we compile the substrings that will require
@@ -124,7 +208,7 @@ kwsmusts (void)
{
if (dm->exact)
continue;
- if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
+ if ((err = kwsincr_case (dm->must)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
if ((err = kwsprep (kwset)) != NULL)
@@ -134,48 +218,9 @@ kwsmusts (void)
#endif /* !FGREP_PROGRAM */
#ifdef MBS_SUPPORT
-/* This function allocate the array which correspond to "buf".
- Then this check multibyte string and mark on the positions which
- are not single byte character nor the first byte of a multibyte
- character. Caller must free the array. */
-static char*
-check_multibyte_string(char *buf, size_t size)
-{
- char *mb_properties = xcalloc(size, 1);
- mbstate_t cur_state;
- wchar_t wc;
- int i;
-
- memset(&cur_state, 0, sizeof(mbstate_t));
-
- for (i = 0; i < size ;)
- {
- size_t mbclen;
- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
-
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multibyte character.
- We treat it as a single byte character. */
- mbclen = 1;
- }
- else if (match_icase)
- {
- if (iswupper((wint_t)wc))
- {
- wc = towlower((wint_t)wc);
- ignore_value (wcrtomb(buf + i, wc, &cur_state));
- }
- }
- mb_properties[i] = mbclen;
- i += mbclen;
- }
-
- return mb_properties;
-}
static char*
-check_multibyte_string_no_icase(const char *buf, size_t size)
+check_multibyte_string(const char *buf, size_t size)
{
char *mb_properties = xcalloc(size, 1);
mbstate_t cur_state;
@@ -219,10 +264,8 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t
syntax_bits)
size_t total = size;
char *motif;
-#if 0
if (match_icase)
syntax_bits |= RE_ICASE;
-#endif
re_set_syntax (syntax_bits);
dfasyntax (syntax_bits, match_icase, eolbyte);
@@ -334,18 +377,16 @@ EXECUTE_FCT(EGexecute)
{
if (match_icase)
{
- /* Add one for the sentinel byte dfaexec may add. */
- char *case_buf = xmalloc(size + 1);
- memcpy(case_buf, buf, size);
+ /* mbtolower adds a NUL byte at the end. That will provide
+ space for the sentinel byte dfaexec may add. */
+ char *case_buf = mbtolower (buf, &size);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
- if (kwset)
- mb_properties = check_multibyte_string(case_buf, size);
buf = case_buf;
}
- else
- if (kwset)
- mb_properties = check_multibyte_string_no_icase(buf, size);
+
+ if (kwset)
+ mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
@@ -512,11 +553,7 @@ EXECUTE_FCT(EGexecute)
out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char *) buf);
- free (mb_properties);
- }
+ free (mb_properties);
#endif /* MBS_SUPPORT */
return ret_val;
}
@@ -525,16 +562,23 @@ EXECUTE_FCT(EGexecute)
#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
COMPILE_FCT(Fcompile)
{
- char const *beg, *end, *lim, *err;
+ char const *beg, *end, *lim, *err, *pat;
+ size_t psize;
kwsinit ();
- beg = pattern;
+ psize = size;
+ if (match_icase && MB_CUR_MAX > 1)
+ pat = mbtolower (pattern, &psize);
+ else
+ pat = pattern;
+
+ beg = pat;
do
{
for (lim = beg;; ++lim)
{
end = lim;
- if (lim >= pattern + size)
+ if (lim >= pat + psize)
break;
if (*lim == '\n')
{
@@ -542,18 +586,19 @@ COMPILE_FCT(Fcompile)
break;
}
#if HAVE_DOS_FILE_CONTENTS
- if (*lim == '\r' && lim + 1 < pattern + size && lim[1] == '\n')
+ if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
{
lim += 2;
break;
}
#endif
}
+
if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
beg = lim;
}
- while (beg < pattern + size);
+ while (beg < pat + psize);
if ((err = kwsprep (kwset)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
@@ -572,14 +617,13 @@ EXECUTE_FCT(Fexecute)
{
if (match_icase)
{
- char *case_buf = xmemdup (buf, size);
+ char *case_buf = mbtolower (buf, &size);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
- mb_properties = check_multibyte_string(case_buf, size);
buf = case_buf;
}
- else
- mb_properties = check_multibyte_string_no_icase(buf, size);
+
+ mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
@@ -644,11 +688,7 @@ EXECUTE_FCT(Fexecute)
out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char *) buf);
- free (mb_properties);
- }
+ free (mb_properties);
#endif /* MBS_SUPPORT */
return ret_val;
}
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 48d92a1..f915e2a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -17,6 +17,7 @@
TESTS = \
backref.sh \
bre.sh \
+ case-fold-backslash-w \
case-fold-char-class \
case-fold-char-type \
dfaexec-multibyte \
diff --git a/tests/case-fold-backslash-w b/tests/case-fold-backslash-w
new file mode 100755
index 0000000..6ae7046
--- /dev/null
+++ b/tests/case-fold-backslash-w
@@ -0,0 +1,14 @@
+#!/bin/sh
+# test that \W works on case-insensitive matches. It used to become \w.
+# Derived from https://savannah.gnu.org/bugs/?28162
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+if echo foo bar | LANG=C.ASCII grep '^foo\W'; then
+ echo foo bar | LANG=C.ASCII grep -i '^foo\W' || fail_ ASCII insensitive
+else
+ echo foo bar | LANG=C grep '^foo\W' || fail_ LANG=C sensitive
+ echo foo bar | LANG=C grep -i '^foo\W' || fail_ LANG=C insensitive
+fi
+echo foo bar | LANG=en_US.UTF-8 grep '^foo\W' || fail_ UTF-8 sensitive
+echo foo bar | LANG=en_US.UTF-8 grep -i '^foo\W' || fail_ UTF-8 insensitive
diff --git a/tests/foad1.sh b/tests/foad1.sh
index 7c16d00..68acc77 100755
--- a/tests/foad1.sh
+++ b/tests/foad1.sh
@@ -42,9 +42,8 @@ grep_test ()
# "-o" with "-i" should output an exact copy of the matching input text.
grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "word" -o -i
-# Comment out cases that are known to fail. These should be uncommented after
the 2.5.4 release. TAA.
-#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
-#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
+grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
+grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
# Should display the line number (-n), octet offset (-b), or file name
# (-H) of every match, not just of the first match on each input line.
@@ -82,9 +81,8 @@ CE="[m[K"
# "--color" with "-i" should output an exact copy of the matching input text.
grep_test "WordA/wordb/WORDC/"
"${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "word" --color=always -i
-# Comment out cases that are known to fail. These should be uncommented after
the 2.5.4 release. TAA.
-#grep_test "WordA/wordb/WORDC/"
"${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
-#grep_test "WordA/wordb/WORDC/"
"${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
+grep_test "WordA/wordb/WORDC/"
"${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
+grep_test "WordA/wordb/WORDC/"
"${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
# End of a previous match should not match a "start of ..." expression.
grep_test "word_word/" "${CB}word_${CE}word/" "^word_*" --color=always
--
1.6.6
- [PATCH 06/17] grep: fix error-message-uppercase, (continued)
- [PATCH 06/17] grep: fix error-message-uppercase, Paolo Bonzini, 2010/03/12
- [PATCH 09/17] syntax-check: enable space-tab, Paolo Bonzini, 2010/03/12
- [PATCH 08/17] syntax-check: enable m4-quote-check, Paolo Bonzini, 2010/03/12
- [PATCH 10/17] tests: add more UTF-8 test cases, Paolo Bonzini, 2010/03/12
- [PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/12
- [PATCH 12/17] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/12
- [PATCH 11/17] dfa: rewrite handling of multibyte case folding,
Paolo Bonzini <=
- [PATCH 14/17] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/12
- [PATCH 15/17] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/12
- [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/12
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/13
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/15
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/19
[PATCH 17/17] grep: match multibyte charsets line-by-line when using -i, Paolo Bonzini, 2010/03/12
Re: [PATCH 00/16] my last hefty patch drop, Jim Meyering, 2010/03/12