From bd012ab161c017feca3f7867776aa9c80e8ae891 Mon Sep 17 00:00:00 2001
From: Paul Eggert <address@hidden>
Date: Mon, 7 Apr 2014 20:28:26 -0700
Subject: [PATCH] grep: remove trivial_case_ignore

This optimization is no longer needed, given the other
optimizations recently installed.  Derived from a patch by
Norihiro Tanaka; see <http://bugs.gnu.org/17019>.
* bootstrap.conf (gnulib_modules): Remove assert-h.
* src/dfa.c (CASE_FOLDED_BUFSIZE): Move here from dfa.h.
Remove now-unnecessary static assert.
(case_folded_counterparts): Now static.
* src/dfa.h (CASE_FOLDED_BUFSIZE, case_folded_counterparts):
Remove decls; no longer public.
* src/dfasearch.c (kwsmusts): Use kwset even if MB_CUR_MAX > 1
and matching is case-insensitive.
* src/grep.c (MBRTOWC, WCRTOMB): Remove.
(fgrep_to_grep_pattern): Use mbrtowc, not MBRTOWC.
(trivial_case_ignore): Remove; this optimization is no longer needed.
All uses removed.
---
 bootstrap.conf  |   1 -
 src/dfa.c       |  11 ++++--
 src/dfa.h       |   8 ----
 src/dfasearch.c |   6 ---
 src/grep.c      | 119 +-------------------------------------------------------
 5 files changed, 8 insertions(+), 137 deletions(-)

diff --git a/bootstrap.conf b/bootstrap.conf
index 86cd81d..367427d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -24,7 +24,6 @@ gnulib_modules='
 alloca
 announce-gen
 argmatch
-assert-h
 binary-io
 btowc
 c-ctype
diff --git a/src/dfa.c b/src/dfa.c
index b6c1250..d8744d7 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -933,14 +933,17 @@ static short const lonesome_lower[] =
     0x03F5, 0x1E9B, 0x1FBE,
   };
 
-static_assert ((sizeof lonesome_lower / sizeof *lonesome_lower + 2
-                == CASE_FOLDED_BUFSIZE),
-               "CASE_FOLDED_BUFSIZE is wrong");
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
+   entry in LONESOME_LOWER.  */
+enum
+{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
 
 /* Find the characters equal to C after case-folding, other than C
    itself, and store them into FOLDED.  Return the number of characters
    stored.  */
-int
+static int
 case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
 {
   int i;
diff --git a/src/dfa.h b/src/dfa.h
index 6ed2231..db29a62 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -112,11 +112,3 @@ extern void dfawarn (const char *);
 extern _Noreturn void dfaerror (const char *);
 
 extern int using_utf8 (void);
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER; see dfa.c.  */
-enum { CASE_FOLDED_BUFSIZE = 1 + 1 + 19 };
-
-extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 44360b6..2ae0a4a 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -81,12 +81,6 @@ dfawarn (char const *mesg)
 static void
 kwsmusts (void)
 {
-  /* With case-insensitive matching in a multi-byte locale, do not
-     use kwsearch, because in that case, it would be too expensive,
-     requiring that we case-convert all searched input.  */
-  if (MB_CUR_MAX > 1 && match_icase)
-    return;
-
   struct dfamust const *dm = dfamusts (dfa);
   if (dm)
     {
diff --git a/src/grep.c b/src/grep.c
index 7033730..8bd6c49 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -1894,15 +1894,6 @@ parse_grep_colors (void)
       return;
 }
 
-#define MBRTOWC(pwc, s, n, ps) \
-  (MB_CUR_MAX == 1 \
-   ? (*(pwc) = btowc (*(unsigned char *) (s)), 1) \
-   : mbrtowc (pwc, s, n, ps))
-#define WCRTOMB(s, wc, ps) \
-  (MB_CUR_MAX == 1 \
-   ? (*(s) = wctob ((wint_t) (wc)), 1) \
-   : wcrtomb (s, wc, ps))
-
 /* Change a pattern for fgrep into grep.  */
 static void
 fgrep_to_grep_pattern (size_t len, char const *keys,
@@ -1915,7 +1906,7 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
   for (; len; keys += n, len -= n)
     {
       wchar_t wc;
-      n = MBRTOWC (&wc, keys, len, &mb_state);
+      n = mbrtowc (&wc, keys, len, &mb_state);
       switch (n)
         {
         case (size_t) -2:
@@ -1942,86 +1933,6 @@ fgrep_to_grep_pattern (size_t len, char const *keys,
   *new_len = p - *new_keys;
 }
 
-/* If the newline-separated regular expressions, KEYS (with length, LEN
-   and no trailing NUL byte), are amenable to transformation into
-   otherwise equivalent case-ignoring ones, perform the transformation,
-   put the result into malloc'd memory, *NEW_KEYS with length *NEW_LEN,
-   and return true.  Otherwise, return false.  */
-
-static bool
-trivial_case_ignore (size_t len, char const *keys,
-                     size_t *new_len, char **new_keys)
-{
-  /* FIXME: consider removing the following restriction:
-     Reject if KEYS contain ASCII '\\' or '['.  */
-  if (memchr (keys, '\\', len) || memchr (keys, '[', len))
-    return false;
-
-  /* Worst case is that each byte B of KEYS is ASCII alphabetic and
-     CASE_FOLDED_BUFSIZE other_case(B) characters, C through Z, each
-     occupying MB_CUR_MAX bytes, so each B maps to [BC...Z], which
-     requires CASE_FOLDED_BUFSIZE * MB_CUR_MAX + 3 bytes; this is
-     bounded above by the constant expression CASE_FOLDED_BUFSIZE *
-     MB_LEN_MAX + 3.  */
-  *new_keys = xnmalloc (len + 1, CASE_FOLDED_BUFSIZE * MB_LEN_MAX + 3);
-  char *p = *new_keys;
-
-  mbstate_t mb_state = { 0 };
-  while (len)
-    {
-      bool initial_state = mbsinit (&mb_state) != 0;
-      wchar_t wc;
-      size_t n = MBRTOWC (&wc, keys, len, &mb_state);
-
-      /* For an invalid, incomplete or L'\0', skip this optimization.  */
-      if ((size_t) -2 <= n)
-        {
-        skip_case_ignore_optimization:
-          free (*new_keys);
-          return false;
-        }
-
-      char const *orig = keys;
-      keys += n;
-      len -= n;
-
-      wchar_t folded[CASE_FOLDED_BUFSIZE];
-      int nfolded = case_folded_counterparts (wc, folded);
-      if (nfolded <= 0)
-        {
-          memcpy (p, orig, n);
-          p += n;
-        }
-      else if (! initial_state)
-        goto skip_case_ignore_optimization;
-      else
-        {
-          *p++ = '[';
-          memcpy (p, orig, n);
-          p += n;
-
-          int i = 0;
-          do
-            {
-              size_t nbytes = WCRTOMB (p, folded[i], &mb_state);
-              if (nbytes == (size_t) -1)
-                goto skip_case_ignore_optimization;
-              p += nbytes;
-            }
-          while (++i < nfolded);
-
-          if (! mbsinit (&mb_state))
-            goto skip_case_ignore_optimization;
-
-          *p++ = ']';
-        }
-    }
-
-  *new_len = p - *new_keys;
-
-  return true;
-}
-
 int
 main (int argc, char **argv)
 {
@@ -2432,34 +2343,6 @@ main (int argc, char **argv)
       execute = EGexecute;
     }
 
-  /* Case-insensitive matching is expensive in multibyte locales
-     because a few characters may change size when converted to upper
-     or lower case.  To accommodate those, search the input one line
-     at a time, rather than using the much more efficient buffer search.
-
-     Try to convert a regular expression 'foo' (ignoring case) to an
-     equivalent regular expression '[fF][oO][oO]' (where case matters).
-     Not only does this avoid the expensive requirement to read and
-     process a line at a time, it also allows use of the kwset engine,
-     a win in non-UTF-8 multibyte locales.  */
-  if (match_icase)
-    {
-      size_t new_keycc;
-      char *new_keys;
-      /* It is not possible with -F, not useful with -P (pcre) and there is no
-         point when there is no regexp.  It also depends on which constructs
-         appear in the regexp.  See trivial_case_ignore for those details.  */
-      if (keycc
-          && ! (compile == Fcompile || compile == Pcompile)
-          && trivial_case_ignore (keycc, keys, &new_keycc, &new_keys))
-        {
-          match_icase = 0;
-          free (keys);
-          keys = new_keys;
-          keycc = new_keycc;
-        }
-    }
-
   if (MB_CUR_MAX > 1)
     build_mbclen_cache ();
 
-- 
1.9.0