grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.8-14-gc4bf293


From: Paolo Bonzini
Subject: grep branch, master, updated. v2.8-14-gc4bf293
Date: Tue, 07 Jun 2011 11:25:56 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master, has been updated
       via  c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968 (commit)
      from  b5cd4788533da574c202219de76f82c5cda9cb20 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968


commit c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968
Author: Paolo Bonzini <address@hidden>
Date:   Tue Jun 7 10:33:59 2011 +0200

    dfa: correct handling of single-byte character ranges
    
    This provides a better fix for the unibyte-bracket-expr and high-bit-range
    testcases, and fixes the latent bug tested by bogus-wctob.
    
    * src/dfa.c (setbit_case_fold): Remove, replace with...
    (setbit_wc, setbit_c, setbit_case_fold_c): ... these.
    (parse_bracket_exp): Use setbit_case_fold_c when iterating over
    single-byte sequences.  Use setbit_wc for multi-byte character sets,
    and setbit_case_fold_c for single-byte character sets.
    (lex): Use setbit_case_fold_c for single-byte character sets.

diff --git a/src/dfa.c b/src/dfa.c
index 83386aa..ba3a27c 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -536,55 +536,65 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
   eolbyte = eol;
 }
 
-/* Like setbit, but if case is folded, set both cases of a letter.
-   For MB_CUR_MAX > 1, one or both of the two cases may not be set,
-   so the resulting charset may only be used as an optimization.  */
-static void
-setbit_case_fold (
+/* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
+   is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
+   this may happen when folding case in weird Turkish locales where
+   dotless i/dotted I are not included in the chosen character set.
+   Return whether a bit was set in the charclass.  */
 #if MBS_SUPPORT
-                  wint_t b,
+static bool
+setbit_wc (wint_t wc, charclass c)
+{
+  int b = wctob (wc);
+  if (b == EOF)
+    return false;
+
+  setbit (b, c);
+  return true;
+}
+
+/* Set a bit in the charclass for the given single byte character,
+   if it is valid in the current character set.  */
+static void
+setbit_c (int b, charclass c)
+{
+  /* Do nothing if b is invalid in this character set.  */
+  if (MB_CUR_MAX > 1 && btowc (b) == EOF)
+    return;
+  setbit (b, c);
+}
 #else
-                  unsigned int b,
+#define setbit_c setbit
 #endif
-                  charclass c)
+
+/* Like setbit_c, but if case is folded, set both cases of a letter.  For
+   MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
+   and the caller takes care of setting the appropriate field of struct
+   mb_char_classes.  */
+static void
+setbit_case_fold_c (int b, charclass c)
 {
-  if (case_fold)
-    {
 #if MBS_SUPPORT
-      if (MB_CUR_MAX > 1)
-        {
-          wint_t b1 = iswupper(b) ? towlower(b) : b;
-          wint_t b2 = iswlower(b) ? towupper(b) : b;
-          if (wctob ((unsigned char)b1) == b1)
-            setbit (b1, c);
-          if (b2 != b1 && wctob ((unsigned char)b2) == b2)
-            setbit (b2, c);
-        }
-      else
-#endif
-        {
-          unsigned char b1 = isupper(b) ? tolower(b) : b;
-          unsigned char b2 = islower(b) ? toupper(b) : b;
-          setbit (b1, c);
-          if (b2 != b1)
-            setbit (b2, c);
-        }
+  if (MB_CUR_MAX > 1)
+    {
+      wint_t wc = btowc (b);
+      if (wc == EOF)
+        return;
+      setbit (b, c);
+      if (case_fold && iswalpha (wc))
+        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
     }
   else
-    {
-#if MBS_SUPPORT
-      /* Below, note how when b2 != b and we have a uni-byte locale
-         (MB_CUR_MAX == 1), we set b = b2.  I.e., in a uni-byte locale,
-         we can safely call setbit with a non-EOF value returned by wctob.  */
-      int b2 = wctob (b);
-      if (b2 == EOF || b2 == b || (MB_CUR_MAX == 1 ? (b=b2), 1 : 0))
 #endif
-        if (b < NOTCHAR)
-          setbit (b, c);
+    {
+      setbit (b, c);
+      if (case_fold && isalpha (b))
+        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
     }
 }
 
 
+
 /* UTF-8 encoding allows some optimizations that we can't otherwise
    assume in a multibyte encoding. */
 static inline int
@@ -863,7 +873,7 @@ parse_bracket_exp (void)
 
                   for (c2 = 0; c2 < NOTCHAR; ++c2)
                     if (pred->func(c2))
-                      setbit_case_fold (c2, ccl);
+                      setbit_case_fold_c (c2, ccl);
                 }
 
 #if MBS_SUPPORT
@@ -974,7 +984,7 @@ parse_bracket_exp (void)
                 }
               if (!hard_LC_COLLATE)
                 for (c = c1; c <= c2; c++)
-                  setbit_case_fold (c, ccl);
+                  setbit_case_fold_c (c, ccl);
               else
                 {
                   /* Defer to the system regex library about the meaning
@@ -988,7 +998,7 @@ parse_bracket_exp (void)
                       subject[0] = c;
                       if (!(case_fold && isupper (c))
                           && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
-                        setbit_case_fold (c, ccl);
+                        setbit_case_fold_c (c, ccl);
                     }
                   regfree (&re);
                 }
@@ -1002,15 +1012,12 @@ parse_bracket_exp (void)
       colon_warning_state |= (c == ':') ? 2 : 4;
 
 #if MBS_SUPPORT
-      /* Build normal characters.  */
-      setbit_case_fold (wc, ccl);
       if (MB_CUR_MAX > 1)
         {
           if (case_fold && iswalpha(wc))
             {
               wc = towlower(wc);
-              c = wctob(wc);
-              if (c == EOF || (wint_t)c != (wint_t)wc)
+              if (!setbit_wc (wc, ccl))
                 {
                   REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                                        work_mbc->nchars + 1);
@@ -1020,19 +1027,18 @@ parse_bracket_exp (void)
               continue;
 #else
               wc = towupper(wc);
-              c = wctob(wc);
 #endif
             }
-          if (c == EOF || (wint_t)c != (wint_t)wc)
+          if (!setbit_wc (wc, ccl))
             {
               REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
                                    work_mbc->nchars + 1);
               work_mbc->chars[work_mbc->nchars++] = wc;
             }
         }
-#else
-      setbit_case_fold (c, ccl);
+      else
 #endif
+        setbit_case_fold_c (c, ccl);
     }
   while ((
 #if MBS_SUPPORT
@@ -1386,7 +1392,7 @@ lex (void)
           if (case_fold && isalpha(c))
             {
               zeroset(ccl);
-              setbit_case_fold (c, ccl);
+              setbit_case_fold_c (c, ccl);
               return lasttok = CSET + charclass_index(ccl);
             }
 

-----------------------------------------------------------------------

Summary of changes:
 src/dfa.c |  104 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 55 insertions(+), 49 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]