From bd012ab161c017feca3f7867776aa9c80e8ae891 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 7 Apr 2014 20:28:26 -0700 Subject: [PATCH] grep: remove trival_case_ignore This optimization is no longer needed, given the other optimizations recently installed. Derived from a patch by Norihiro Tanaka; see . * bootstrap.conf (gnulib_modules): Remove assert-h. * src/dfa.c (CASE_FOLDED_BUFSIZE): Move here from dfa.h. Remove now-unnecessary static assert. (case_folded_counterparts): Now static. * src/dfa.h (CASE_FOLDED_BUFSIZE, case_folded_counterparts): Remove decls; no longer public. * src/dfasearch.c (kwsmusts): Use kwset even if fill MB_CUR_MAX > 1 and case-insensitive. * src/grep.c (MBRTOWC, WCRTOMB): Remove. (fgrep_to_grep_pattern): Use mbrtowc, not MBRTOWC. (trivial_case_ignore): Remove; this optimization is no longer needed. All uses removed. --- bootstrap.conf | 1 - src/dfa.c | 11 ++++-- src/dfa.h | 8 ---- src/dfasearch.c | 6 --- src/grep.c | 119 +------------------------------------------------------- 5 files changed, 8 insertions(+), 137 deletions(-) diff --git a/bootstrap.conf b/bootstrap.conf index 86cd81d..367427d 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -24,7 +24,6 @@ gnulib_modules=' alloca announce-gen argmatch -assert-h binary-io btowc c-ctype diff --git a/src/dfa.c b/src/dfa.c index b6c1250..d8744d7 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -933,14 +933,17 @@ static short const lonesome_lower[] = 0x03F5, 0x1E9B, 0x1FBE, }; -static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2 - == CASE_FOLDED_BUFSIZE), - "CASE_FOLDED_BUFSIZE is wrong"); +/* Maximum number of characters that can be the case-folded + counterparts of a single character, not counting the character + itself. This is 1 for towupper, 1 for towlower, and 1 for each + entry in LONESOME_LOWER. */ +enum +{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower }; /* Find the characters equal to C after case-folding, other than C itself, and store them into FOLDED. Return the number of characters stored. */ -int +static int case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) { int i; diff --git a/src/dfa.h b/src/dfa.h index 6ed2231..db29a62 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -112,11 +112,3 @@ extern void dfawarn (const char *); extern _Noreturn void dfaerror (const char *); extern int using_utf8 (void); - -/* Maximum number of characters that can be the case-folded - counterparts of a single character, not counting the character - itself. This is 1 for towupper, 1 for towlower, and 1 for each - entry in LONESOME_LOWER; see dfa.c. */ -enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 }; - -extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]); diff --git a/src/dfasearch.c b/src/dfasearch.c index 44360b6..2ae0a4a 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -81,12 +81,6 @@ dfawarn (char const *mesg) static void kwsmusts (void) { - /* With case-insensitive matching in a multi-byte locale, do not - use kwsearch, because in that case, it would be too expensive, - requiring that we case-convert all searched input. */ - if (MB_CUR_MAX > 1 && match_icase) - return; - struct dfamust const *dm = dfamusts (dfa); if (dm) { diff --git a/src/grep.c b/src/grep.c index 7033730..8bd6c49 100644 --- a/src/grep.c +++ b/src/grep.c @@ -1894,15 +1894,6 @@ parse_grep_colors (void) return; } -#define MBRTOWC(pwc, s, n, ps) \ - (MB_CUR_MAX == 1 \ - ? (*(pwc) = btowc (*(unsigned char *) (s)), 1) \ - : mbrtowc (pwc, s, n, ps)) -#define WCRTOMB(s, wc, ps) \ - (MB_CUR_MAX == 1 \ - ? (*(s) = wctob ((wint_t) (wc)), 1) \ - : wcrtomb (s, wc, ps)) - /* Change a pattern for fgrep into grep. */ static void fgrep_to_grep_pattern (size_t len, char const *keys, @@ -1915,7 +1906,7 @@ fgrep_to_grep_pattern (size_t len, char const *keys, for (; len; keys += n, len -= n) { wchar_t wc; - n = MBRTOWC (&wc, keys, len, &mb_state); + n = mbrtowc (&wc, keys, len, &mb_state); switch (n) { case (size_t) -2: @@ -1942,86 +1933,6 @@ fgrep_to_grep_pattern (size_t len, char const *keys, *new_len = p - *new_keys; } -/* If the newline-separated regular expressions, KEYS (with length, LEN - and no trailing NUL byte), are amenable to transformation into - otherwise equivalent case-ignoring ones, perform the transformation, - put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN, - and return true. Otherwise, return false. */ - -static bool -trivial_case_ignore (size_t len, char const *keys, - size_t *new_len, char **new_keys) -{ - /* FIXME: consider removing the following restriction: - Reject if KEYS contain ASCII '\\' or '['. */ - if (memchr (keys, '\\', len) || memchr (keys, '[', len)) - return false; - - /* Worst case is that each byte B of KEYS is ASCII alphabetic and - CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each - occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which - requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is - bounded above by the constant expression CASE_FOLDED_BUFSIZE * - MB_LEN_MAX + 3. */ - *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3); - char *p = *new_keys; - - mbstate_t mb_state = { 0 }; - while (len) - { - bool initial_state = mbsinit (&mb_state) != 0; - wchar_t wc; - size_t n = MBRTOWC (&wc, keys, len, &mb_state); - - /* For an invalid, incomplete or L'\0', skip this optimization. */ - if ((size_t) -2 <= n) - { - skip_case_ignore_optimization: - free (*new_keys); - return false; - } - - char const *orig = keys; - keys += n; - len -= n; - - wchar_t folded[CASE_FOLDED_BUFSIZE]; - int nfolded = case_folded_counterparts (wc, folded); - if (nfolded <= 0) - { - memcpy (p, orig, n); - p += n; - } - else if (! initial_state) - goto skip_case_ignore_optimization; - else - { - *p++ = '['; - memcpy (p, orig, n); - p += n; - - int i = 0; - do - { - size_t nbytes = WCRTOMB (p, folded[i], &mb_state); - if (nbytes == (size_t) -1) - goto skip_case_ignore_optimization; - p += nbytes; - } - while (++i < nfolded); - - if (! mbsinit (&mb_state)) - goto skip_case_ignore_optimization; - - *p++ = ']'; - } - } - - *new_len = p - *new_keys; - - return true; -} - int main (int argc, char **argv) { @@ -2432,34 +2343,6 @@ main (int argc, char **argv) execute = EGexecute; } - /* Case-insensitive matching is expensive in multibyte locales - because a few characters may change size when converted to upper - or lower case. To accommodate those, search the input one line - at a time, rather than using the much more efficient buffer search. - - Try to convert a regular expression 'foo' (ignoring case) to an - equivalent regular expression '[fF][oO][oO]' (where case matters). - Not only does this avoid the expensive requirement to read and - process a line at a time, it also allows use of the kwset engine, - a win in non-UTF-8 multibyte locales. */ - if (match_icase) - { - size_t new_keycc; - char *new_keys; - /* It is not possible with -F, not useful with -P (pcre) and there is no - point when there is no regexp. It also depends on which constructs - appear in the regexp. See trivial_case_ignore for those details. */ - if (keycc - && ! (compile == Fcompile || compile == Pcompile) - && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys)) - { - match_icase = 0; - free (keys); - keys = new_keys; - keycc = new_keycc; - } - } - if (MB_CUR_MAX > 1) build_mbclen_cache (); -- 1.9.0