grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.25-67-g1e166fd


From: Paul Eggert
Subject: grep branch, master, updated. v2.25-67-g1e166fd
Date: Thu, 1 Sep 2016 15:54:09 +0000 (UTC)

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  1e166fdaa69902793057ed2a63408f51c3b4273b (commit)
       via  960ad317db21e781b04010f4128bb149273a3327 (commit)
      from  dc9d5bf33857b6c84f233e7c0d3192993b287928 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=1e166fdaa69902793057ed2a63408f51c3b4273b


commit 1e166fdaa69902793057ed2a63408f51c3b4273b
Author: Paul Eggert <address@hidden>
Date:   Thu Sep 1 08:43:48 2016 -0700

    grep: avoid code duplication with -iF
    
    This follows up on the -iF performance improvement (Bug#23752).
    * NEWS: Simplify description of -iF improvement.
    * src/dfa.c: Do not include wctype.h.
    (lonesome_lower, case_folded_counterparts): Move to localeinfo.c.
    (CASE_FOLDED_BUFSIZE): Move to localeinfo.h.
    * src/grep.c: Do not include wctype.h.
    (lonesome_lower): Remove.
    (fgrep_icase_available): Use case_folded_counterparts instead.
    Do not call it for the same character twice.
    Return false on wcrtomb failures (which should never happen).
    (fgrep_to_grep_pattern, main): Simplify.  Let fgrep_to_grep’s
    caller fiddle with the global variables.
    * src/localeinfo.c: Include <wctype.h>
    (lonesome_lower, case_folded_counterparts):
    Move here from src/dfa.c.  Return int, not unsigned int.
    Verify that CASE_FOLDED_BUFSIZE is big enough.
    * src/localeinfo.h (CASE_FOLDED_BUFSIZE): Now 32, so that
    we don’t expose lonesome_lower’s size.
    * src/searchutils.c (kwsinit): Return new kwset instead of
    storing it via a pointer.  All callers changed.  Simplify a bit.

diff --git a/NEWS b/NEWS
index 65e663d..91f1bfc 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,15 @@ GNU grep NEWS                                    -*- 
outline -*-
   as it now uses the Aho-Corasick algorithm instead of the
   Commentz-Walter algorithm in that case.
 
+  grep -iF is typically much faster in a multibyte locale, if the
+  pattern and its case counterparts contain only single byte characters.
+
   In multibyte locales that are not UTF-8, grep now handles leading
   "." in patterns more efficiently.
 
   grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
   invalid regular expression that was read from an '-f'-specified file.
 
-  grep -F now tries fgrep matcher for case insensitive matching in
-  multibyte locale.  It improves performance for a case which a pattern
-  is composed of only single byte characters and their all counterparts
-  are also single byte characters and the pattern does not have invalid
-  sequences.
-
 
 * Noteworthy changes in release 2.25 (2016-04-21) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index cc96740..0f5109e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -50,7 +50,6 @@
 #define _(str) gettext (str)
 
 #include <wchar.h>
-#include <wctype.h>
 
 /* HPUX defines these as macros in sys/param.h.  */
 #ifdef setbit
@@ -857,53 +856,6 @@ using_simple_locale (bool multibyte)
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE,
-  };
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER.  */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
-   itself, and store them into FOLDED.  Return the number of characters
-   stored.  */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
-  unsigned int i;
-  unsigned int n = 0;
-  wint_t uc = towupper (c);
-  wint_t lc = towlower (uc);
-  if (uc != c)
-    folded[n++] = uc;
-  if (lc != uc && lc != c && towupper (lc) == uc)
-    folded[n++] = lc;
-  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
-    {
-      wint_t li = lonesome_lower[i];
-      if (li != lc && li != uc && li != c && towupper (li) == uc)
-        folded[n++] = li;
-    }
-  return n;
-}
-
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 548ef08..61b1f70 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -89,7 +89,7 @@ kwsmusts (void)
   struct dfamust *dm = dfamust (dfa);
   if (!dm)
     return;
-  kwsinit (&kwset, false);
+  kwset = kwsinit (false);
   if (dm->exact)
     {
       /* Prepare a substring whose presence implies a match.
diff --git a/src/grep.c b/src/grep.c
index ae3b6e7..15c6dc6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -22,7 +22,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <wchar.h>
-#include <wctype.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdarg.h>
@@ -2265,91 +2264,53 @@ contains_encoding_error (char const *pat, size_t patlen)
   return false;
 }
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE
-  };
+/* Return true if the fgrep pattern PAT, of size PATLEN, matches only
+   single-byte characters including case folding, and so is suitable
+   to be converted to a grep pattern.  */
 
 static bool
 fgrep_icase_available (char const *pat, size_t patlen)
 {
-  for (size_t i = 0; i < patlen; ++i)
-    {
-      unsigned char c = pat[i];
-      if (localeinfo.sbclen[c] > 1)
-        return false;
-    }
+  bool used[UCHAR_MAX + 1] = { 0, };
 
-  for (size_t i = 0; i < patlen; ++i)
+  for (size_t i = 0; i < patlen; i++)
     {
       unsigned char c = pat[i];
-
-      wint_t wc = localeinfo.sbctowc[c];
-      if (wc == WEOF)
+      if (localeinfo.sbctowc[c] == WEOF)
         return false;
+      used[c] = true;
+    }
 
-      wint_t uwc = towupper (wc);
-      if (uwc != wc)
-        {
-          char s[MB_LEN_MAX];
-          mbstate_t mb_state = { 0 };
-          size_t len = wcrtomb (s, uwc, &mb_state);
-          if (len > 1 && len != (size_t) -1)
-            return false;
-        }
-
-      wint_t lwc = towlower (uwc);
-      if (lwc != uwc && lwc != wc && towupper (lwc) == uwc)
-        {
-          char s[MB_LEN_MAX];
-          mbstate_t mb_state = { 0 };
-          size_t len = wcrtomb (s, lwc, &mb_state);
-          if (len > 1 && len != (size_t) -1)
-            return false;
-        }
+  for (int c = 0; c <= UCHAR_MAX; c++)
+    if (used[c])
+      {
+        wint_t wc = localeinfo.sbctowc[c];
+        wchar_t folded[CASE_FOLDED_BUFSIZE];
+        size_t nfolded = case_folded_counterparts (wc, folded);
 
-      for (size_t j = 0; lonesome_lower[j]; j++)
-        {
-          wint_t li = lonesome_lower[j];
-          if (li != lwc && li != uwc && li != wc && towupper (li) == uwc)
-            {
-              char s[MB_LEN_MAX];
-              mbstate_t mb_state = { 0 };
-              size_t len = wcrtomb (s, li, &mb_state);
-              if (len > 1 && len != (size_t) -1)
-                return false;
-            }
-        }
-    }
+        for (size_t i = 0; i < nfolded; i++)
+          {
+            char s[MB_LEN_MAX];
+            mbstate_t mb_state = { 0 };
+            if (1 < wcrtomb (s, folded[i], &mb_state))
+              return false;
+          }
+      }
 
   return true;
 }
 
-/* Change a pattern for fgrep into grep.  */
+/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style.  */
+
 static void
 fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
 {
-  char *keys, *new_keys, *p;
+  size_t len = *len_p;
+  char *keys = *keys_p;
   mbstate_t mb_state = { 0 };
-  size_t len, n;
-
-  len = *len_p;
-  keys = *keys_p;
-
-  new_keys = xnmalloc (len + 1, 2);
-  p = new_keys;
+  char *new_keys = xnmalloc (len + 1, 2);
+  char *p = new_keys;
+  size_t n;
 
   for (; len; keys += n, len -= n)
     {
@@ -2365,14 +2326,15 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
 
         case (size_t) -1:
           memset (&mb_state, 0, sizeof mb_state);
+          n = 1;
           /* Fall through.  */
         case 1:
-          *p = '\\';
-          p += strchr ("$*.[\\^", *keys) != NULL;
-          /* Fall through.  */
-        case 0:
+          switch (*keys)
+            {
+            case '$': case '*': case '.': case '[': case '\\': case '^':
+              *p++ = '\\'; break;
+            }
           *p++ = *keys;
-          n = 1;
           break;
         }
     }
@@ -2380,10 +2342,6 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
   free (*keys_p);
   *keys_p = new_keys;
   *len_p = p - new_keys;
-
-  matcher = "grep";
-  compile = Gcompile;
-  execute = EGexecute;
 }
 
 int
@@ -2814,19 +2772,18 @@ main (int argc, char **argv)
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
      In a multibyte locale, switch from fgrep to grep if either
-     (1) case is ignored (where grep is typically faster), or
-     (2) the pattern has an encoding error (where fgrep might not work).  */
-  if (compile == Fcompile)
+     (1) the pattern has an encoding error (where fgrep might not work), or
+     (2) case is ignored and a fast fgrep ignore-case is not available.  */
+  if (compile == Fcompile
+      && (MB_CUR_MAX <= 1
+          ? match_words
+          : (contains_encoding_error (keys, keycc)
+             || (match_icase && !fgrep_icase_available (keys, keycc)))))
     {
-      if (MB_CUR_MAX > 1)
-        {
-          if (contains_encoding_error (keys, keycc))
-            fgrep_to_grep_pattern (&keys, &keycc);
-          else if (match_icase && !fgrep_icase_available (keys, keycc))
-            fgrep_to_grep_pattern (&keys, &keycc);
-        }
-      else if (match_words)
-        fgrep_to_grep_pattern (&keys, &keycc);
+      fgrep_to_grep_pattern (&keys, &keycc);
+      matcher = "grep";
+      compile = Gcompile;
+      execute = EGexecute;
     }
 
   compile (keys, keycc);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7fe08aa..29d140c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size)
 {
   size_t total = size;
 
-  kwsinit (&kwset, true);
+  kwset = kwsinit (true);
 
   char const *p = pattern;
   do
diff --git a/src/localeinfo.c b/src/localeinfo.c
index 329d431..ca96afc 100644
--- a/src/localeinfo.c
+++ b/src/localeinfo.c
@@ -29,6 +29,7 @@
 #include <locale.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wctype.h>
 
 /* The sbclen implementation relies on this.  */
 verify (MB_LEN_MAX <= SCHAR_MAX);
@@ -64,3 +65,49 @@ init_localeinfo (struct localeinfo *localeinfo)
       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
     }
 }
+
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+/* Verify that the worst case fits.  This is 1 for towupper, 1 for
+   towlower, and 1 for each entry in LONESOME_LOWER.  */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+        <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
index 70b55a8..cf2f9a6 100644
--- a/src/localeinfo.h
+++ b/src/localeinfo.h
@@ -45,3 +45,10 @@ struct localeinfo
 };
 
 extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is a generous upper bound.  */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/search.h b/src/search.h
index 534a49e..fb4e5c8 100644
--- a/src/search.h
+++ b/src/search.h
@@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN
 typedef signed char mb_len_map_t;
 
 /* searchutils.c */
-extern void kwsinit (kwset_t *, bool);
+extern kwset_t kwsinit (bool);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
 extern wint_t mb_next_wc (char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 87f51a4..73d6c1c 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,42 +24,36 @@
 
 #define NCHAR (UCHAR_MAX + 1)
 
-void
-kwsinit (kwset_t *kwset, bool mb_trans)
+kwset_t
+kwsinit (bool mb_trans)
 {
   static char trans[NCHAR];
-  int i;
+  char *transptr = NULL;
 
   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
     {
       if (MB_CUR_MAX == 1)
-        for (i = 0; i < NCHAR; ++i)
+        for (int i = 0; i < NCHAR; i++)
           trans[i] = toupper (i);
       else
-        for (i = 0; i < NCHAR; ++i)
+        for (int i = 0; i < NCHAR; i++)
           {
             wint_t wc = localeinfo.sbctowc[i];
             wint_t uwc = towupper (wc);
             if (uwc != wc)
               {
-                char s[MB_LEN_MAX];
                 mbstate_t mbs = { 0 };
-                size_t len = wcrtomb (s, uwc, &mbs);
-                if (len > 1)
+                size_t len = wcrtomb (&trans[i], uwc, &mbs);
+                if (len != 1)
                   abort ();
-                trans[i] = s[0];
               }
             else
               trans[i] = i;
           }
-
-      *kwset = kwsalloc (trans, false);
+      transptr = trans;
     }
-  else
-    *kwset = kwsalloc (NULL, false);
 
-  if (!*kwset)
-    xalloc_die ();
+  return kwsalloc (transptr, false);
 }
 
 /* In the buffer *MB_START, return the number of bytes needed to go

http://git.savannah.gnu.org/cgit/grep.git/commit/?id=960ad317db21e781b04010f4128bb149273a3327


commit 1e166fdaa69902793057ed2a63408f51c3b4273b
Author: Paul Eggert <address@hidden>
Date:   Thu Sep 1 08:43:48 2016 -0700

    grep: avoid code duplication with -iF
    
    This follows up on the -iF performance improvement (Bug#23752).
    * NEWS: Simplify description of -iF improvement.
    * src/dfa.c: Do not include wctype.h.
    (lonesome_lower, case_folded_counterparts): Move to localeinfo.c.
    (CASE_FOLDED_BUFSIZE): Move to localeinfo.h.
    * src/grep.c: Do not include wctype.h.
    (lonesome_lower): Remove.
    (fgrep_icase_available): Use case_folded_counterparts instead.
    Do not call it for the same character twice.
    Return false on wcrtomb failures (which should never happen).
    (fgrep_to_grep_pattern, main): Simplify.  Let fgrep_to_grep’s
    caller fiddle with the global variables.
    * src/localeinfo.c: Include <wctype.h>
    (lonesome_lower, case_folded_counterparts):
    Move here from src/dfa.c.  Return int, not unsigned int.
    Verify that CASE_FOLDED_BUFSIZE is big enough.
    * src/localeinfo.h (CASE_FOLDED_BUFSIZE): Now 32, so that
    we don’t expose lonesome_lower’s size.
    * src/searchutils.c (kwsinit): Return new kwset instead of
    storing it via a pointer.  All callers changed.  Simplify a bit.

diff --git a/NEWS b/NEWS
index 65e663d..91f1bfc 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,15 @@ GNU grep NEWS                                    -*- 
outline -*-
   as it now uses the Aho-Corasick algorithm instead of the
   Commentz-Walter algorithm in that case.
 
+  grep -iF is typically much faster in a multibyte locale, if the
+  pattern and its case counterparts contain only single byte characters.
+
   In multibyte locales that are not UTF-8, grep now handles leading
   "." in patterns more efficiently.
 
   grep now prints a "FILENAME:LINENO: " prefix when diagnosing an
   invalid regular expression that was read from an '-f'-specified file.
 
-  grep -F now tries fgrep matcher for case insensitive matching in
-  multibyte locale.  It improves performance for a case which a pattern
-  is composed of only single byte characters and their all counterparts
-  are also single byte characters and the pattern does not have invalid
-  sequences.
-
 
 * Noteworthy changes in release 2.25 (2016-04-21) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index cc96740..0f5109e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -50,7 +50,6 @@
 #define _(str) gettext (str)
 
 #include <wchar.h>
-#include <wctype.h>
 
 /* HPUX defines these as macros in sys/param.h.  */
 #ifdef setbit
@@ -857,53 +856,6 @@ using_simple_locale (bool multibyte)
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE,
-  };
-
-/* Maximum number of characters that can be the case-folded
-   counterparts of a single character, not counting the character
-   itself.  This is 1 for towupper, 1 for towlower, and 1 for each
-   entry in LONESOME_LOWER.  */
-enum
-{ CASE_FOLDED_BUFSIZE = 2 + sizeof lonesome_lower / sizeof *lonesome_lower };
-
-/* Find the characters equal to C after case-folding, other than C
-   itself, and store them into FOLDED.  Return the number of characters
-   stored.  */
-static unsigned int
-case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
-{
-  unsigned int i;
-  unsigned int n = 0;
-  wint_t uc = towupper (c);
-  wint_t lc = towlower (uc);
-  if (uc != c)
-    folded[n++] = uc;
-  if (lc != uc && lc != c && towupper (lc) == uc)
-    folded[n++] = lc;
-  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
-    {
-      wint_t li = lonesome_lower[i];
-      if (li != lc && li != uc && li != c && towupper (li) == uc)
-        folded[n++] = li;
-    }
-  return n;
-}
-
 typedef int predicate (int);
 
 /* The following list maps the names of the Posix named character classes
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 548ef08..61b1f70 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -89,7 +89,7 @@ kwsmusts (void)
   struct dfamust *dm = dfamust (dfa);
   if (!dm)
     return;
-  kwsinit (&kwset, false);
+  kwset = kwsinit (false);
   if (dm->exact)
     {
       /* Prepare a substring whose presence implies a match.
diff --git a/src/grep.c b/src/grep.c
index ae3b6e7..15c6dc6 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -22,7 +22,6 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <wchar.h>
-#include <wctype.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdarg.h>
@@ -2265,91 +2264,53 @@ contains_encoding_error (char const *pat, size_t patlen)
   return false;
 }
 
-/* The set of wchar_t values C such that there's a useful locale
-   somewhere where C != towupper (C) && C != towlower (towupper (C)).
-   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
-   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
-   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
-static short const lonesome_lower[] =
-  {
-    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
-    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
-
-    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
-       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
-    0x03F2,
-
-    0x03F5, 0x1E9B, 0x1FBE
-  };
+/* Return true if the fgrep pattern PAT, of size PATLEN, matches only
+   single-byte characters including case folding, and so is suitable
+   to be converted to a grep pattern.  */
 
 static bool
 fgrep_icase_available (char const *pat, size_t patlen)
 {
-  for (size_t i = 0; i < patlen; ++i)
-    {
-      unsigned char c = pat[i];
-      if (localeinfo.sbclen[c] > 1)
-        return false;
-    }
+  bool used[UCHAR_MAX + 1] = { 0, };
 
-  for (size_t i = 0; i < patlen; ++i)
+  for (size_t i = 0; i < patlen; i++)
     {
       unsigned char c = pat[i];
-
-      wint_t wc = localeinfo.sbctowc[c];
-      if (wc == WEOF)
+      if (localeinfo.sbctowc[c] == WEOF)
         return false;
+      used[c] = true;
+    }
 
-      wint_t uwc = towupper (wc);
-      if (uwc != wc)
-        {
-          char s[MB_LEN_MAX];
-          mbstate_t mb_state = { 0 };
-          size_t len = wcrtomb (s, uwc, &mb_state);
-          if (len > 1 && len != (size_t) -1)
-            return false;
-        }
-
-      wint_t lwc = towlower (uwc);
-      if (lwc != uwc && lwc != wc && towupper (lwc) == uwc)
-        {
-          char s[MB_LEN_MAX];
-          mbstate_t mb_state = { 0 };
-          size_t len = wcrtomb (s, lwc, &mb_state);
-          if (len > 1 && len != (size_t) -1)
-            return false;
-        }
+  for (int c = 0; c <= UCHAR_MAX; c++)
+    if (used[c])
+      {
+        wint_t wc = localeinfo.sbctowc[c];
+        wchar_t folded[CASE_FOLDED_BUFSIZE];
+        size_t nfolded = case_folded_counterparts (wc, folded);
 
-      for (size_t j = 0; lonesome_lower[j]; j++)
-        {
-          wint_t li = lonesome_lower[j];
-          if (li != lwc && li != uwc && li != wc && towupper (li) == uwc)
-            {
-              char s[MB_LEN_MAX];
-              mbstate_t mb_state = { 0 };
-              size_t len = wcrtomb (s, li, &mb_state);
-              if (len > 1 && len != (size_t) -1)
-                return false;
-            }
-        }
-    }
+        for (size_t i = 0; i < nfolded; i++)
+          {
+            char s[MB_LEN_MAX];
+            mbstate_t mb_state = { 0 };
+            if (1 < wcrtomb (s, folded[i], &mb_state))
+              return false;
+          }
+      }
 
   return true;
 }
 
-/* Change a pattern for fgrep into grep.  */
+/* Change the pattern *KEYS_P, of size *LEN_P, from fgrep to grep style.  */
+
 static void
 fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
 {
-  char *keys, *new_keys, *p;
+  size_t len = *len_p;
+  char *keys = *keys_p;
   mbstate_t mb_state = { 0 };
-  size_t len, n;
-
-  len = *len_p;
-  keys = *keys_p;
-
-  new_keys = xnmalloc (len + 1, 2);
-  p = new_keys;
+  char *new_keys = xnmalloc (len + 1, 2);
+  char *p = new_keys;
+  size_t n;
 
   for (; len; keys += n, len -= n)
     {
@@ -2365,14 +2326,15 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
 
         case (size_t) -1:
           memset (&mb_state, 0, sizeof mb_state);
+          n = 1;
           /* Fall through.  */
         case 1:
-          *p = '\\';
-          p += strchr ("$*.[\\^", *keys) != NULL;
-          /* Fall through.  */
-        case 0:
+          switch (*keys)
+            {
+            case '$': case '*': case '.': case '[': case '\\': case '^':
+              *p++ = '\\'; break;
+            }
           *p++ = *keys;
-          n = 1;
           break;
         }
     }
@@ -2380,10 +2342,6 @@ fgrep_to_grep_pattern (char **keys_p, size_t *len_p)
   free (*keys_p);
   *keys_p = new_keys;
   *len_p = p - new_keys;
-
-  matcher = "grep";
-  compile = Gcompile;
-  execute = EGexecute;
 }
 
 int
@@ -2814,19 +2772,18 @@ main (int argc, char **argv)
   /* In a unibyte locale, switch from fgrep to grep if
      the pattern matches words (where grep is typically faster).
      In a multibyte locale, switch from fgrep to grep if either
-     (1) case is ignored (where grep is typically faster), or
-     (2) the pattern has an encoding error (where fgrep might not work).  */
-  if (compile == Fcompile)
+     (1) the pattern has an encoding error (where fgrep might not work), or
+     (2) case is ignored and a fast fgrep ignore-case is not available.  */
+  if (compile == Fcompile
+      && (MB_CUR_MAX <= 1
+          ? match_words
+          : (contains_encoding_error (keys, keycc)
+             || (match_icase && !fgrep_icase_available (keys, keycc)))))
     {
-      if (MB_CUR_MAX > 1)
-        {
-          if (contains_encoding_error (keys, keycc))
-            fgrep_to_grep_pattern (&keys, &keycc);
-          else if (match_icase && !fgrep_icase_available (keys, keycc))
-            fgrep_to_grep_pattern (&keys, &keycc);
-        }
-      else if (match_words)
-        fgrep_to_grep_pattern (&keys, &keycc);
+      fgrep_to_grep_pattern (&keys, &keycc);
+      matcher = "grep";
+      compile = Gcompile;
+      execute = EGexecute;
     }
 
   compile (keys, keycc);
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 7fe08aa..29d140c 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -38,7 +38,7 @@ Fcompile (char const *pattern, size_t size)
 {
   size_t total = size;
 
-  kwsinit (&kwset, true);
+  kwset = kwsinit (true);
 
   char const *p = pattern;
   do
diff --git a/src/localeinfo.c b/src/localeinfo.c
index 329d431..ca96afc 100644
--- a/src/localeinfo.c
+++ b/src/localeinfo.c
@@ -29,6 +29,7 @@
 #include <locale.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wctype.h>
 
 /* The sbclen implementation relies on this.  */
 verify (MB_LEN_MAX <= SCHAR_MAX);
@@ -64,3 +65,49 @@ init_localeinfo (struct localeinfo *localeinfo)
       localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
     }
 }
+
+/* The set of wchar_t values C such that there's a useful locale
+   somewhere where C != towupper (C) && C != towlower (towupper (C)).
+   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
+   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
+   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
+static short const lonesome_lower[] =
+  {
+    0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
+    0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
+
+    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
+       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
+    0x03F2,
+
+    0x03F5, 0x1E9B, 0x1FBE,
+  };
+
+/* Verify that the worst case fits.  This is 1 for towupper, 1 for
+   towlower, and 1 for each entry in LONESOME_LOWER.  */
+verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
+        <= CASE_FOLDED_BUFSIZE);
+
+/* Find the characters equal to C after case-folding, other than C
+   itself, and store them into FOLDED.  Return the number of characters
+   stored.  */
+
+int
+case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
+{
+  int i;
+  int n = 0;
+  wint_t uc = towupper (c);
+  wint_t lc = towlower (uc);
+  if (uc != c)
+    folded[n++] = uc;
+  if (lc != uc && lc != c && towupper (lc) == uc)
+    folded[n++] = lc;
+  for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
+    {
+      wint_t li = lonesome_lower[i];
+      if (li != lc && li != uc && li != c && towupper (li) == uc)
+        folded[n++] = li;
+    }
+  return n;
+}
diff --git a/src/localeinfo.h b/src/localeinfo.h
index 70b55a8..cf2f9a6 100644
--- a/src/localeinfo.h
+++ b/src/localeinfo.h
@@ -45,3 +45,10 @@ struct localeinfo
 };
 
 extern void init_localeinfo (struct localeinfo *);
+
+/* Maximum number of characters that can be the case-folded
+   counterparts of a single character, not counting the character
+   itself.  This is a generous upper bound.  */
+enum { CASE_FOLDED_BUFSIZE = 32 };
+
+extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);
diff --git a/src/search.h b/src/search.h
index 534a49e..fb4e5c8 100644
--- a/src/search.h
+++ b/src/search.h
@@ -47,7 +47,7 @@ _GL_INLINE_HEADER_BEGIN
 typedef signed char mb_len_map_t;
 
 /* searchutils.c */
-extern void kwsinit (kwset_t *, bool);
+extern kwset_t kwsinit (bool);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
 extern wint_t mb_prev_wc (char const *, char const *, char const *);
 extern wint_t mb_next_wc (char const *, char const *);
diff --git a/src/searchutils.c b/src/searchutils.c
index 87f51a4..73d6c1c 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -24,42 +24,36 @@
 
 #define NCHAR (UCHAR_MAX + 1)
 
-void
-kwsinit (kwset_t *kwset, bool mb_trans)
+kwset_t
+kwsinit (bool mb_trans)
 {
   static char trans[NCHAR];
-  int i;
+  char *transptr = NULL;
 
   if (match_icase && (MB_CUR_MAX == 1 || mb_trans))
     {
       if (MB_CUR_MAX == 1)
-        for (i = 0; i < NCHAR; ++i)
+        for (int i = 0; i < NCHAR; i++)
           trans[i] = toupper (i);
       else
-        for (i = 0; i < NCHAR; ++i)
+        for (int i = 0; i < NCHAR; i++)
           {
             wint_t wc = localeinfo.sbctowc[i];
             wint_t uwc = towupper (wc);
             if (uwc != wc)
               {
-                char s[MB_LEN_MAX];
                 mbstate_t mbs = { 0 };
-                size_t len = wcrtomb (s, uwc, &mbs);
-                if (len > 1)
+                size_t len = wcrtomb (&trans[i], uwc, &mbs);
+                if (len != 1)
                   abort ();
-                trans[i] = s[0];
               }
             else
               trans[i] = i;
           }
-
-      *kwset = kwsalloc (trans, false);
+      transptr = trans;
     }
-  else
-    *kwset = kwsalloc (NULL, false);
 
-  if (!*kwset)
-    xalloc_die ();
+  return kwsalloc (transptr, false);
 }
 
 /* In the buffer *MB_START, return the number of bytes needed to go

-----------------------------------------------------------------------

Summary of changes:
 NEWS              |    3 +++
 src/dfa.c         |   48 ---------------------------------
 src/dfasearch.c   |    2 +-
 src/grep.c        |   77 +++++++++++++++++++++++++++++++++++++++--------------
 src/kwsearch.c    |    2 +-
 src/localeinfo.c  |   47 ++++++++++++++++++++++++++++++++
 src/localeinfo.h  |    7 +++++
 src/search.h      |    2 +-
 src/searchutils.c |   36 ++++++++++++++++---------
 9 files changed, 141 insertions(+), 83 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]