From bd012ab161c017feca3f7867776aa9c80e8ae891 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Mon, 7 Apr 2014 20:28:26 -0700
Subject: [PATCH] grep: remove trival_case_ignore
This optimization is no longer needed, given the other
optimizations recently installed. Derived from a patch by
Norihiro Tanaka; see .
* bootstrap.conf (gnulib_modules): Remove assert-h.
* src/dfa.c (CASE_FOLDED_BUFSIZE): Move here from dfa.h.
Remove now-unnecessary static assert.
(case_folded_counterparts): Now static.
* src/dfa.h (CASE_FOLDED_BUFSIZE, case_folded_counterparts):
Remove decls; no longer public.
* src/dfasearch.c (kwsmusts): Use kwset even if fill MB_CUR_MAX > 1
and case-insensitive.
* src/grep.c (MBRTOWC, WCRTOMB): Remove.
(fgrep_to_grep_pattern): Use mbrtowc, not MBRTOWC.
(trivial_case_ignore): Remove; this optimization is no longer needed.
All uses removed.
---
bootstrap.conf | 1 -
src/dfa.c | 11 ++++--
src/dfa.h | 8 ----
src/dfasearch.c | 6 ---
src/grep.c | 119 +-------------------------------------------------------
5 files changed, 8 insertions(+), 137 deletions(-)
diff --git a/bootstrap.conf b/bootstrap.conf
index 86cd81d..367427d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -24,7 +24,6 @@ gnulib_modules='
alloca
announce-gen
argmatch
-assert-h
binary-io
btowc
c-ctype
diff --git a/src/dfa.c b/src/dfa.c
index b6c1250..d8744d7 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -933,14 +933,17 @@ static short const lonesome_lower[] =
0x03F5, 0x1E9B, 0x1FBE,
};
-static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
- == CASE_FOLDED_BUFSIZE),
- "CASE_FOLDED_BUFSIZE is wrong");
+/* Maximum number of characters that can be the case-folded
+ counterparts of a single character, not counting the character
+ itself. This is 1 for towupper, 1 for towlower, and 1 for each
+ entry in LONESOME_LOWER. */
+enum
+{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
/* Find the characters equal to C after case-folding, other than C
itself, and store them into FOLDED. Return the number of characters
stored. */
-int
+static int
case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
{
int i;
diff --git a/src/dfa.h b/src/dfa.h
index 6ed2231..db29a62 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -112,11 +112,3 @@ extern void dfawarn (const char *);
extern _Noreturn void dfaerror (const char *);
extern int using_utf8 (void);
-
-/* Maximum number of characters that can be the case-folded
- counterparts of a single character, not counting the character
- itself. This is 1 for towupper, 1 for towlower, and 1 for each
- entry in LONESOME_LOWER; see dfa.c. */
-enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
-
-extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 44360b6..2ae0a4a 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -81,12 +81,6 @@ dfawarn (char const *mesg)
static void
kwsmusts (void)
{
- /* With case-insensitive matching in a multi-byte locale, do not
- use kwsearch, because in that case, it would be too expensive,
- requiring that we case-convert all searched input. */
- if (MB_CUR_MAX > 1 && match_icase)
- return;
-
struct dfamust const *dm = dfamusts (dfa);
if (dm)
{
diff --git a/src/grep.c b/src/grep.c
index 7033730..8bd6c49 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1894,15 +1894,6 @@ parse_grep_colors (void)
return;
}
-#define MBRTOWC(pwc, s, n, ps) \
- (MB_CUR_MAX == 1 \
- ? (*(pwc) = btowc (*(unsigned char *) (s)), 1) \
- : mbrtowc (pwc, s, n, ps))
-#define WCRTOMB(s, wc, ps) \
- (MB_CUR_MAX == 1 \
- ? (*(s) = wctob ((wint_t) (wc)), 1) \
- : wcrtomb (s, wc, ps))
-
/* Change a pattern for fgrep into grep. */
static void
fgrep_to_grep_pattern (size_t len, char const *keys,
@@ -1915,7 +1906,7 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
for (; len; keys += n, len -= n)
{
wchar_t wc;
- n = MBRTOWC (&wc, keys, len, &mb_state);
+ n = mbrtowc (&wc, keys, len, &mb_state);
switch (n)
{
case (size_t) -2:
@@ -1942,86 +1933,6 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
*new_len = p - *new_keys;
}
-/* If the newline-separated regular expressions, KEYS (with length, LEN
- and no trailing NUL byte), are amenable to transformation into
- otherwise equivalent case-ignoring ones, perform the transformation,
- put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
- and return true. Otherwise, return false. */
-
-static bool
-trivial_case_ignore (size_t len, char const *keys,
- size_t *new_len, char **new_keys)
-{
- /* FIXME: consider removing the following restriction:
- Reject if KEYS contain ASCII '\\' or '['. */
- if (memchr (keys, '\\', len) || memchr (keys, '[', len))
- return false;
-
- /* Worst case is that each byte B of KEYS is ASCII alphabetic and
- CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each
- occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which
- requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is
- bounded above by the constant expression CASE_FOLDED_BUFSIZE *
- MB_LEN_MAX + 3. */
- *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3);
- char *p = *new_keys;
-
- mbstate_t mb_state = { 0 };
- while (len)
- {
- bool initial_state = mbsinit (&mb_state) != 0;
- wchar_t wc;
- size_t n = MBRTOWC (&wc, keys, len, &mb_state);
-
- /* For an invalid, incomplete or L'\0', skip this optimization. */
- if ((size_t) -2 <= n)
- {
- skip_case_ignore_optimization:
- free (*new_keys);
- return false;
- }
-
- char const *orig = keys;
- keys += n;
- len -= n;
-
- wchar_t folded[CASE_FOLDED_BUFSIZE];
- int nfolded = case_folded_counterparts (wc, folded);
- if (nfolded <= 0)
- {
- memcpy (p, orig, n);
- p += n;
- }
- else if (! initial_state)
- goto skip_case_ignore_optimization;
- else
- {
- *p++ = '[';
- memcpy (p, orig, n);
- p += n;
-
- int i = 0;
- do
- {
- size_t nbytes = WCRTOMB (p, folded[i], &mb_state);
- if (nbytes == (size_t) -1)
- goto skip_case_ignore_optimization;
- p += nbytes;
- }
- while (++i < nfolded);
-
- if (! mbsinit (&mb_state))
- goto skip_case_ignore_optimization;
-
- *p++ = ']';
- }
- }
-
- *new_len = p - *new_keys;
-
- return true;
-}
-
int
main (int argc, char **argv)
{
@@ -2432,34 +2343,6 @@ main (int argc, char **argv)
execute = EGexecute;
}
- /* Case-insensitive matching is expensive in multibyte locales
- because a few characters may change size when converted to upper
- or lower case. To accommodate those, search the input one line
- at a time, rather than using the much more efficient buffer search.
-
- Try to convert a regular expression 'foo' (ignoring case) to an
- equivalent regular expression '[fF][oO][oO]' (where case matters).
- Not only does this avoid the expensive requirement to read and
- process a line at a time, it also allows use of the kwset engine,
- a win in non-UTF-8 multibyte locales. */
- if (match_icase)
- {
- size_t new_keycc;
- char *new_keys;
- /* It is not possible with -F, not useful with -P (pcre) and there is no
- point when there is no regexp. It also depends on which constructs
- appear in the regexp. See trivial_case_ignore for those details. */
- if (keycc
- && ! (compile == Fcompile || compile == Pcompile)
- && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
- {
- match_icase = 0;
- free (keys);
- keys = new_keys;
- keycc = new_keycc;
- }
- }
-
if (MB_CUR_MAX > 1)
build_mbclen_cache ();
--
1.9.0