[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing
From: |
Paolo Bonzini |
Subject: |
[PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing |
Date: |
Sun, 14 Mar 2010 16:35:08 +0100 |
Let dfacomp do the folding to lowercase of multibyte regexes, and remove
it from grep.c. Input strings to kwset.c are still folded outside
kwset.c.
* NEWS: Document bugfixes.
* .x-sc_cast_of_argument_to_free: Remove.
* src/dfa.c (wctok, addtok_wc): New.
(cur_mb_index, update_mb_len_index): Remove.
(FETCH): Do not call it.
(parse_bracket_exp_mb) [GREP]: Disable case-folding of ranges and
characters.
(addtok): Extract part to...
(addtok_mb): ... this new function.
(lex): Call fetch_wc in the main loop for MB_CUR_MAX > 1. Return WCHAR
for normal characters if MB_CUR_MAX > 1.
(atom): Handle WCHAR instead of treating multibyte characters specially.
Do case folding of multibyte characters here.
(dfacomp): Remove case_fold special casing.
* src/dfa.h (WCHAR): New.
* src/grep.c (mb_icase_keys): Remove.
(main): Do not call it.
* src/search.c (kwsinit): Init transition table only for MB_CUR_MAX == 1.
(mbtolower): New.
(kwsincr_case): New.
(kwsmusts): Call it instead of kwsincr.
(check_multibyte_string): Remove.
(check_multibyte_string_no_icase): Rename to check_multibyte_string.
(GEAcompile, EGexecute, Fcompile): Use mbtolower instead of the old
check_multibyte_string.
* tests/Makefile.am (TESTS): Add case-fold-backslash-w.
* tests/foad1.sh: Enable fixed tests.
* tests/case-fold-backslash-w: New.
---
.x-sc_cast_of_argument_to_free | 1 -
NEWS | 5 +-
src/dfa.c | 225 +++++++++++++++++-----------------------
src/dfa.h | 3 +
src/grep.c | 68 ------------
src/search.c | 191 +++++++++++++++++++++-------------
tests/Makefile.am | 1 +
tests/case-fold-backslash-w | 14 +++
tests/foad1.sh | 10 +-
9 files changed, 239 insertions(+), 279 deletions(-)
delete mode 100644 .x-sc_cast_of_argument_to_free
create mode 100755 tests/case-fold-backslash-w
diff --git a/.x-sc_cast_of_argument_to_free b/.x-sc_cast_of_argument_to_free
deleted file mode 100644
index 3f02e3d..0000000
--- a/.x-sc_cast_of_argument_to_free
+++ /dev/null
@@ -1 +0,0 @@
-^src/search\.c$
diff --git a/NEWS b/NEWS
index 2ade4fc..8323cd7 100644
--- a/NEWS
+++ b/NEWS
@@ -8,7 +8,10 @@ GNU grep NEWS -*- outline -*-
For example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[y]' would print nothing.
Character types would malfunction in multi-byte locales similarly; for
example, echo Y | LC_ALL=en_US.UTF-8 grep -i '[[:lower:]]' would print
- nothing.
+ nothing. Finally, grep -i would malfunction with a regex including \W.
+
+ grep -i -o would fail to report some matches; grep -i --color, while not
+ missing any line containing a match, would fail to color some matches.
Various bugs in grep -P, caused by expressions such as [^b] or \S matching
newlines, were fixed. grep -P also supports the special sequences \Z and
diff --git a/src/dfa.c b/src/dfa.c
index 3cc405a..3191c6f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -267,17 +267,10 @@ static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */
#ifdef MBS_SUPPORT
/* These variables are used only if (MB_CUR_MAX > 1). */
static mbstate_t mbs; /* Mbstate for mbrlen(). */
-static int cur_mb_len; /* Byte length of the current scanning
+static int cur_mb_len; /* Length of the multibyte representation of
+ wctok. */
+static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
-static int cur_mb_index; /* Byte index of the current scanning multibyte
- character.
-
- single byte character : cur_mb_index = 0
- multibyte character
- 1st byte : cur_mb_index = 1
- 2nd byte : cur_mb_index = 2
- ...
- nth byte : cur_mb_index = n */
static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().
Each element store the amount of remain
byte of corresponding multibyte character
@@ -299,38 +292,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec(). */
#endif /* MBS_SUPPORT */
#ifdef MBS_SUPPORT
-/* This function update cur_mb_len, and cur_mb_index.
- p points current lexptr, len is the remaining buffer length. */
-static void
-update_mb_len_index (char const *p, int len)
-{
- /* If last character is a part of a multibyte character,
- we update cur_mb_index. */
- if (cur_mb_index)
- cur_mb_index = (cur_mb_index >= cur_mb_len)? 0
- : cur_mb_index + 1;
-
- /* If last character is a single byte character, or the
- last portion of a multibyte character, we check whether
- next character is a multibyte character or not. */
- if (! cur_mb_index)
- {
- cur_mb_len = mbrlen(p, len, &mbs);
- if (cur_mb_len > 1)
- /* It is a multibyte character.
- cur_mb_len was already set by mbrlen(). */
- cur_mb_index = 1;
- else if (cur_mb_len < 1)
- /* Invalid sequence. We treat it as a single byte character.
- cur_mb_index is aleady 0. */
- cur_mb_len = 1;
- /* Otherwise, cur_mb_len == 1, it is a single byte character.
- cur_mb_index is aleady 0. */
- }
-}
-#endif /* MBS_SUPPORT */
-
-#ifdef MBS_SUPPORT
/* Note that characters become unsigned here. */
# define FETCH(c, eoferr) \
do { \
@@ -341,8 +302,6 @@ update_mb_len_index (char const *p, int len)
else \
return lasttok = END; \
} \
- if (MB_CUR_MAX > 1) \
- update_mb_len_index(lexptr, lexleft); \
(c) = (unsigned char) *lexptr++; \
--lexleft; \
} while(0)
@@ -365,7 +324,7 @@ fetch_wc (char const *eoferr)
if (cur_mb_len <= 0)
{
cur_mb_len = 1;
- wc = *lexptr;
+ wc = (unsigned char) *lexptr;
}
lexptr += cur_mb_len;
lexleft -= cur_mb_len;
@@ -583,6 +542,7 @@ parse_bracket_exp_mb (void)
work_mbc->range_ends[work_mbc->nranges++] =
case_fold ? towlower(wc2) : (wchar_t)wc2;
+#ifndef GREP
if (case_fold)
{
REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
@@ -592,21 +552,24 @@ parse_bracket_exp_mb (void)
range_ends_al, work_mbc->nranges + 1);
work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
}
+#endif
}
else if (wc != WEOF)
/* build normal characters. */
{
REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] = (wchar_t)wc;
- if (case_fold && (iswlower(wc) || iswupper(wc)))
- {
- REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
- work_mbc->nchars + 1);
- work_mbc->chars[work_mbc->nchars++] =
- (wchar_t) (iswlower(wc) ? towupper(wc) : towlower(wc));
- }
- }
+ work_mbc->chars[work_mbc->nchars++] =
+ (wchar_t) (case_fold ? towlower(wc) : wc);
+#ifndef GREP
+ if (case_fold)
+ {
+ REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+ work_mbc->nchars + 1);
+ work_mbc->chars[work_mbc->nchars++] = towupper(wc);
+ }
+#endif
+ }
}
while ((wc = wc1) != L']');
return MBCSET;
@@ -689,15 +652,20 @@ lex (void)
"if (backslash) ...". */
for (i = 0; i < 2; ++i)
{
- FETCH(c, 0);
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && cur_mb_index)
- /* If this is a part of a multi-byte character, we must treat
- this byte data as a normal character.
- e.g. In case of SJIS encoding, some character contains '\',
- but they must not be backslash. */
- goto normal_char;
+ if (MB_CUR_MAX > 1)
+ {
+ wint_t wi = fetch_wc (NULL);
+ if (wi == WEOF)
+ return lasttok = EOF;
+ wctok = wi, c = wctob (wi);
+ if ((int)c == EOF)
+ goto normal_char;
+ }
+ else
#endif /* MBS_SUPPORT */
+ FETCH(c, NULL);
+
switch (c)
{
case '\\':
@@ -1069,12 +1037,20 @@ lex (void)
default:
normal_char:
laststart = 0;
+#ifdef MBS_SUPPORT
+ /* For multibyte character sets, folding is done in atom. Always
+ return WCHAR. */
+ if (MB_CUR_MAX > 1)
+ return lasttok = WCHAR;
+#endif
+
if (case_fold && ISALPHA(c))
{
zeroset(ccl);
setbit_case_fold (c, ccl);
return lasttok = CSET + charclass_index(ccl);
}
+
return lasttok = c;
}
}
@@ -1094,29 +1070,18 @@ static int depth; /* Current depth of a hypothetical stack
required of the real stack later on in
dfaanalyze(). */
-/* Add the given token to the parse tree, maintaining the depth count and
- updating the maximum depth if necessary. */
static void
-addtok (token t)
+addtok_mb (token t, int mbprop)
{
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
dfa->tindex);
- /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */
- if (t == MBCSET)
- dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;
- else if (t < NOTCHAR)
- dfa->multibyte_prop[dfa->tindex]
- = (cur_mb_len == 1)? 3 /* single-byte char */
- : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */
- + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */
- else
- /* It may be unnecessary, but it is safer to treat other
- symbols as single byte characters. */
- dfa->multibyte_prop[dfa->tindex] = 3;
+ dfa->multibyte_prop[dfa->tindex] = mbprop;
}
+#else
+ (void) mbprop;
#endif
REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
@@ -1145,6 +1110,41 @@ addtok (token t)
dfa->depth = depth;
}
+/* Add the given token to the parse tree, maintaining the depth count and
+ updating the maximum depth if necessary. */
+static void
+addtok (token t)
+{
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && t == MBCSET)
+ addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ else
+#endif
+ addtok_mb (t, 3);
+}
+
+/* We treat a multibyte character as a single atom, so that DFA
+ can treat a multibyte character as a single expression.
+
+ e.g. We construct following tree from "<mb1><mb2>".
+ <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
+ <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> */
+static void
+addtok_wc (wint_t wc)
+{
+ unsigned char buf[16];
+ mbstate_t s;
+ int i;
+ memset (&s, 0, sizeof(s));
+ cur_mb_len = wcrtomb ((char *) buf, wc, &s);
+ addtok_mb(buf[0], cur_mb_len == 1 ? 3 : 1);
+ for (i = 1; i < cur_mb_len; i++)
+ {
+ addtok_mb(buf[i], i == cur_mb_len - 1 ? 2 : 0);
+ addtok(CAT);
+ }
+}
+
/* The grammar understood by the parser is as follows.
regexp:
@@ -1183,6 +1183,23 @@ addtok (token t)
static void
atom (void)
{
+#ifdef MBS_SUPPORT
+ if (tok == WCHAR)
+ {
+ addtok_wc (case_fold ? towlower(wctok) : wctok);
+#ifndef GREP
+ if (case_fold && iswalpha(wctok))
+ {
+ addtok_wc (towupper(wctok));
+ addtok (OR);
+ }
+#endif
+
+ tok = lex();
+ return;
+ }
+#endif /* MBS_SUPPORT */
+
if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
#ifdef MBS_SUPPORT
@@ -1192,24 +1209,6 @@ atom (void)
{
addtok(tok);
tok = lex();
-#ifdef MBS_SUPPORT
- /* We treat a multibyte character as a single atom, so that DFA
- can treat a multibyte character as a single expression.
-
- e.g. We construct following tree from "<mb1><mb2>".
- <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>
- <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>
- */
- if (MB_CUR_MAX > 1)
- {
- while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)
- {
- addtok(tok);
- addtok(CAT);
- tok = lex();
- }
- }
-#endif /* MBS_SUPPORT */
}
else if (tok == LPAREN)
{
@@ -1341,7 +1340,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
{
- cur_mb_index = 0;
cur_mb_len = 0;
memset(&mbs, 0, sizeof(mbstate_t));
}
@@ -2936,39 +2934,10 @@ dfainit (struct dfa *d)
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
- if (case_fold && len) /* dummy folding in service of dfamust() */
- {
- char *lcopy;
- int i;
-
- lcopy = malloc(len);
- if (!lcopy)
- dfaerror(_("memory exhausted"));
-
- /* This is a kludge. */
- case_fold = 0;
- for (i = 0; i < len; ++i)
- if (ISUPPER ((unsigned char) s[i]))
- lcopy[i] = tolower ((unsigned char) s[i]);
- else
- lcopy[i] = s[i];
-
- dfainit(d);
- dfaparse(lcopy, len, d);
- free(lcopy);
- dfamust(d);
- d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
- case_fold = 1;
- dfaparse(s, len, d);
- dfaanalyze(d, searchflag);
- }
- else
- {
- dfainit(d);
- dfaparse(s, len, d);
- dfamust(d);
- dfaanalyze(d, searchflag);
- }
+ dfainit(d);
+ dfaparse(s, len, d);
+ dfamust(d);
+ dfaanalyze(d, searchflag);
}
/* Free the storage held by the components of a dfa. */
diff --git a/src/dfa.h b/src/dfa.h
index 685ce94..4ca55f0 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -129,6 +129,9 @@ typedef enum
MBCSET, /* MBCSET is similar to CSET, but for
multibyte characters. */
+
+ WCHAR, /* Only returned by lex. wctok contains
+ the wide character representation. */
#endif /* MBS_SUPPORT */
CSET /* CSET and (and any value greater) is a
diff --git a/src/grep.c b/src/grep.c
index c1c6152..f1d341a 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1781,69 +1781,6 @@ parse_grep_colors (void)
"at remaining substring \"%s\"."), p, q);
}
-/* mb_icase_keys() is called by main() to convert its "keys" string with
- strlen() "len" to lowercase if match_icase is true. Pointers are used
- to implement in-out call-by-reference parameters. */
-#ifdef MBS_SUPPORT
-static void
-mb_icase_keys (char **keys, size_t *len)
-{
- wchar_t wc;
- mbstate_t sti, stj; /* i for input/old, j for output/new. */
- size_t i, j, li, lj; /* l for total string length (minus '\0'). */
- char *ki, *kj; /* k for keys. */
- int mcm;
-
- if ((mcm = MB_CUR_MAX) == 1)
- return;
-
- li = *len;
- ki = *keys;
- /* We use a new buffer because some multi-octet characters change
- length through a lower-case conversion. For example:
- len(U+0049)=1 --> len(U+0131)=2 under tr_TR.UTF-8
- len(U+0130)=2 --> len(U+0069)=1 under en_US.UTF-8
- len(U+2126)=3 --> len(U+03C9)=2 under en_US.UTF-8
- len(U+212A)=3 --> len(U+006B)=1 under en_US.UTF-8
- len(U+212B)=3 --> len(U+00E5)=2 under en_US.UTF-8 */
- lj = li + mcm;
- kj = xmalloc(lj + 1);
-
- memset(&sti, 0, sizeof(mbstate_t));
- memset(&stj, 0, sizeof(mbstate_t));
- for (i = j = 0; i < li ;)
- {
- size_t mbclen;
- mbclen = mbrtowc(&wc, ki + i, li - i, &sti);
- if (lj < j + mcm)
- {
- lj += mcm;
- kj = xrealloc(kj, lj + 1);
- }
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multi-octet character.
- We treat it as a single-octet character. */
- kj[j++] = ki[i++];
- }
- else
- {
- /* Doing towupper() before towlower() helps a few hairy cases and is
- not too costly since this is the PATTERN and is done only once. */
- wc = towupper((wint_t)wc);
- wc = towlower((wint_t)wc);
- j += wcrtomb(kj + j, wc, &stj);
- i += mbclen;
- }
- }
- kj[j] = '\0';
-
- free(ki);
- *keys = kj;
- *len = j;
-}
-#endif /* MBS_SUPPORT */
-
int
main (int argc, char **argv)
{
@@ -2261,11 +2198,6 @@ There is NO WARRANTY, to the extent permitted by law.\n"),
set_limits();
-#ifdef MBS_SUPPORT
- if (match_icase)
- mb_icase_keys (&keys, &keycc);
-#endif /* MBS_SUPPORT */
-
compile(keys, keycc);
free (keys);
diff --git a/src/search.c b/src/search.c
index d9b4462..9d73abc 100644
--- a/src/search.c
+++ b/src/search.c
@@ -59,14 +59,83 @@ kwsinit (void)
static char trans[NCHAR];
int i;
- if (match_icase)
- for (i = 0; i < NCHAR; ++i)
- trans[i] = TOLOWER (i);
+ if (match_icase && MB_CUR_MAX == 1)
+ {
+ for (i = 0; i < NCHAR; ++i)
+ trans[i] = TOLOWER (i);
+
+ kwset = kwsalloc (trans);
+ }
+ else
+ kwset = kwsalloc (NULL);
- if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
+ if (!kwset)
xalloc_die ();
}
+#ifdef MBS_SUPPORT
+/* Convert the string from BEG to N to lowercase. Overwrite N
+ with the length of the new string, and return a pointer to
+ the lowercase string. Successive calls to mbtolower will
+ rewrite the output buffer. */
+static char *
+mbtolower (const char *beg, size_t *n)
+{
+ static char *out;
+ static size_t outalloc;
+ size_t outlen, mb_cur_max;
+ mbstate_t is, os;
+ const char *end;
+ char *p;
+
+ if (*n > outalloc)
+ {
+ out = xrealloc (out, *n);
+ outalloc = *n;
+ }
+
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ end = beg + *n;
+
+ mb_cur_max = MB_CUR_MAX;
+ p = out;
+ outlen = 0;
+ while (beg < end)
+ {
+ wchar_t wc;
+ size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
+ if (outlen + mb_cur_max >= outalloc)
+ {
+ out = x2nrealloc (out, &outalloc, 1);
+ p = out + outlen;
+ }
+
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multi-octet character.
+ We treat it as a single-octet character. */
+ *p++ = *beg++;
+ outlen++;
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ }
+ else
+ {
+ beg += mbclen;
+ mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+ p += mbclen;
+ outlen += mbclen;
+ }
+ }
+
+ *n = p - out;
+ *p++ = 0;
+ return out;
+}
+#endif
+
+
#ifndef FGREP_PROGRAM
/* DFA compiled regexp. */
static struct dfa dfa;
@@ -94,6 +163,22 @@ dfaerror (char const *mesg)
call the regexp matcher at all. */
static int kwset_exact_matches;
+static char const *
+kwsincr_case (const char *must)
+{
+ const char *buf;
+ size_t n;
+
+ n = strlen (must);
+#ifdef MBS_SUPPORT
+ if (match_icase && MB_CUR_MAX > 1)
+ buf = mbtolower (must, &n);
+ else
+#endif
+ buf = must;
+ return kwsincr (kwset, buf, n);
+}
+
/* If the DFA turns out to have some set of fixed strings one of
which must occur in the match, then we build a kwset matcher
to find those strings, and thus quickly filter out impossible
@@ -115,7 +200,7 @@ kwsmusts (void)
if (!dm->exact)
continue;
++kwset_exact_matches;
- if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
+ if ((err = kwsincr_case (dm->must)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
/* Now, we compile the substrings that will require
@@ -124,7 +209,7 @@ kwsmusts (void)
{
if (dm->exact)
continue;
- if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL)
+ if ((err = kwsincr_case (dm->must)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
}
if ((err = kwsprep (kwset)) != NULL)
@@ -134,48 +219,9 @@ kwsmusts (void)
#endif /* !FGREP_PROGRAM */
#ifdef MBS_SUPPORT
-/* This function allocate the array which correspond to "buf".
- Then this check multibyte string and mark on the positions which
- are not single byte character nor the first byte of a multibyte
- character. Caller must free the array. */
-static char*
-check_multibyte_string(char *buf, size_t size)
-{
- char *mb_properties = xcalloc(size, 1);
- mbstate_t cur_state;
- wchar_t wc;
- int i;
-
- memset(&cur_state, 0, sizeof(mbstate_t));
-
- for (i = 0; i < size ;)
- {
- size_t mbclen;
- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
-
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multibyte character.
- We treat it as a single byte character. */
- mbclen = 1;
- }
- else if (match_icase)
- {
- if (iswupper((wint_t)wc))
- {
- wc = towlower((wint_t)wc);
- ignore_value (wcrtomb(buf + i, wc, &cur_state));
- }
- }
- mb_properties[i] = mbclen;
- i += mbclen;
- }
-
- return mb_properties;
-}
static char*
-check_multibyte_string_no_icase(const char *buf, size_t size)
+check_multibyte_string(const char *buf, size_t size)
{
char *mb_properties = xcalloc(size, 1);
mbstate_t cur_state;
@@ -219,10 +265,8 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
size_t total = size;
char *motif;
-#if 0
if (match_icase)
syntax_bits |= RE_ICASE;
-#endif
re_set_syntax (syntax_bits);
dfasyntax (syntax_bits, match_icase, eolbyte);
@@ -334,18 +378,16 @@ EXECUTE_FCT(EGexecute)
{
if (match_icase)
{
- /* Add one for the sentinel byte dfaexec may add. */
- char *case_buf = xmalloc(size + 1);
- memcpy(case_buf, buf, size);
+ /* mbtolower adds a NUL byte at the end. That will provide
+ space for the sentinel byte dfaexec may add. */
+ char *case_buf = mbtolower (buf, &size);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
- if (kwset)
- mb_properties = check_multibyte_string(case_buf, size);
buf = case_buf;
}
- else
- if (kwset)
- mb_properties = check_multibyte_string_no_icase(buf, size);
+
+ if (kwset)
+ mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
@@ -512,11 +554,7 @@ EXECUTE_FCT(EGexecute)
out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char *) buf);
- free (mb_properties);
- }
+ free (mb_properties);
#endif /* MBS_SUPPORT */
return ret_val;
}
@@ -525,16 +563,23 @@ EXECUTE_FCT(EGexecute)
#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
COMPILE_FCT(Fcompile)
{
- char const *beg, *end, *lim, *err;
+ char const *beg, *end, *lim, *err, *pat;
+ size_t psize;
kwsinit ();
- beg = pattern;
+ psize = size;
+ if (match_icase && MB_CUR_MAX > 1)
+ pat = mbtolower (pattern, &psize);
+ else
+ pat = pattern;
+
+ beg = pat;
do
{
for (lim = beg;; ++lim)
{
end = lim;
- if (lim >= pattern + size)
+ if (lim >= pat + psize)
break;
if (*lim == '\n')
{
@@ -542,18 +587,19 @@ COMPILE_FCT(Fcompile)
break;
}
#if HAVE_DOS_FILE_CONTENTS
- if (*lim == '\r' && lim + 1 < pattern + size && lim[1] == '\n')
+ if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
{
lim += 2;
break;
}
#endif
}
+
if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
beg = lim;
}
- while (beg < pattern + size);
+ while (beg < pat + psize);
if ((err = kwsprep (kwset)) != NULL)
error (EXIT_TROUBLE, 0, "%s", err);
@@ -572,14 +618,13 @@ EXECUTE_FCT(Fexecute)
{
if (match_icase)
{
- char *case_buf = xmemdup (buf, size);
+ char *case_buf = mbtolower (buf, &size);
if (start_ptr)
start_ptr = case_buf + (start_ptr - buf);
- mb_properties = check_multibyte_string(case_buf, size);
buf = case_buf;
}
- else
- mb_properties = check_multibyte_string_no_icase(buf, size);
+
+ mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
@@ -644,11 +689,7 @@ EXECUTE_FCT(Fexecute)
out:
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- free ((char *) buf);
- free (mb_properties);
- }
+ free (mb_properties);
#endif /* MBS_SUPPORT */
return ret_val;
}
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ab5fd4e..0561e80 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -17,6 +17,7 @@
TESTS = \
backref.sh \
bre.sh \
+ case-fold-backslash-w \
case-fold-char-class \
case-fold-char-range \
case-fold-char-type \
diff --git a/tests/case-fold-backslash-w b/tests/case-fold-backslash-w
new file mode 100755
index 0000000..6ae7046
--- /dev/null
+++ b/tests/case-fold-backslash-w
@@ -0,0 +1,14 @@
+#!/bin/sh
+# test that \W works on case-insensitive matches. It used to become \w.
+# Derived from https://savannah.gnu.org/bugs/?28162
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+if echo foo bar | LANG=C.ASCII grep '^foo\W'; then
+ echo foo bar | LANG=C.ASCII grep -i '^foo\W' || fail_ ASCII insensitive
+else
+ echo foo bar | LANG=C grep '^foo\W' || fail_ LANG=C sensitive
+ echo foo bar | LANG=C grep -i '^foo\W' || fail_ LANG=C insensitive
+fi
+echo foo bar | LANG=en_US.UTF-8 grep '^foo\W' || fail_ UTF-8 sensitive
+echo foo bar | LANG=en_US.UTF-8 grep -i '^foo\W' || fail_ UTF-8 insensitive
diff --git a/tests/foad1.sh b/tests/foad1.sh
index 7c16d00..68acc77 100755
--- a/tests/foad1.sh
+++ b/tests/foad1.sh
@@ -42,9 +42,8 @@ grep_test ()
# "-o" with "-i" should output an exact copy of the matching input text.
grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "word" -o -i
-# Comment out cases that are known to fail. These should be uncommented after the 2.5.4 release. TAA.
-#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
-#grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
+grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "Word" -o -i
+grep_test "WordA/wordB/WORDC/" "Word/word/WORD/" "WORD" -o -i
# Should display the line number (-n), octet offset (-b), or file name
# (-H) of every match, not just of the first match on each input line.
@@ -82,9 +81,8 @@ CE="[m[K"
# "--color" with "-i" should output an exact copy of the matching input text.
grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "word" --color=always -i
-# Comment out cases that are known to fail. These should be uncommented after the 2.5.4 release. TAA.
-#grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
-#grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
+grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "Word" --color=always -i
+grep_test "WordA/wordb/WORDC/" "${CB}Word${CE}A/${CB}word${CE}b/${CB}WORD${CE}C/" "WORD" --color=always -i
# End of a previous match should not match a "start of ..." expression.
grep_test "word_word/" "${CB}word_${CE}word/" "^word_*" --color=always
--
1.6.6.1
[PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing,
Paolo Bonzini <=
[PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/14
[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/14
[PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/14