grep branch, master, updated. v2.18-4-g8b16e90

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.18-4-g8b16e90

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.18-4-g8b16e90
Date:	Thu, 27 Feb 2014 17:29:12 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  8b16e90a248d70aef687931370f8ef7c1f2bc4ef (commit)
      from  0b5003dd7c485bceba81cf8ffa901f3646c2417d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=8b16e90a248d70aef687931370f8ef7c1f2bc4ef


commit 8b16e90a248d70aef687931370f8ef7c1f2bc4ef
Author: Paul Eggert <address@hidden>
Date:   Thu Feb 27 09:26:23 2014 -0800

    grep: fix multiple bugs with bracket expressions
    
    * NEWS: Document this.
    * src/dfa.c (using_simple_locale): New function.
    (parse_bracket_exp): Handle bracket expressions like [a-[.z.]]
    correctly.  Don't assume that dfaexec handles expressions like
    [^a-z] correctly, as they can match multiple characters in some
    locales.
    * tests/posix-bracket: New file.
    * tests/Makefile.am (TESTS): Add it.

diff --git a/NEWS b/NEWS
index 657f3d1..6cfcaba 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep no longer mishandles patterns like [a-[.z.]], and no longer
+  mishandles patterns like [^a] in locales that have multicharacter
+  collating sequences so that [^a] can match a string of two characters.
+
   grep -P now works with -w and -x and backreferences. Before,
   echo aa|grep -Pw '(.)\1' would fail to match, yet
   echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/dfa.c b/src/dfa.c
index 8906ed3..65ab5d6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -182,7 +182,8 @@ enum
   EMPTY = NOTCHAR,              /* EMPTY is a terminal symbol that matches
                                    the empty string.  */
 
-  BACKREF,                      /* BACKREF is generated by \<digit>; it
+  BACKREF,                      /* BACKREF is generated by \<digit>
+                                   or by any other construct that
                                    is not completely handled.  If the scanner
                                    detects a transition on backref, it returns
                                    a kind of "semi-success" indicating that
@@ -769,6 +770,45 @@ using_utf8 (void)
   return utf8;
 }
 
+/* Return true if the current locale is known to be a unibyte locale
+   without multicharacter collating sequences and where range
+   comparisons simply use the native encoding.  These locales can be
+   processed more efficiently.  */
+
+static bool
+using_simple_locale (void)
+{
+  /* True if the native character set is known to be compatible with
+     the C locale.  The following test isn't perfect, but it's good
+     enough in practice, as only ASCII and EBCDIC are in common use
+     and this test correctly accepts ASCII and rejects EBCDIC.  */
+  enum { native_c_charset =
+    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+     && '}' == 125 && '~' == 126)
+  };
+
+  if (! native_c_charset || MB_CUR_MAX > 1)
+    return false;
+  else
+    {
+      static int unibyte_c = -1;
+      if (unibyte_c < 0)
+        {
+          char *locale = setlocale (LC_ALL, 0);
+          unibyte_c = (locale && (STREQ (locale, "C")
+                                  || STREQ (locale, "POSIX")));
+        }
+      return unibyte_c;
+    }
+}
+
 /* Lexical analyzer.  All the dross that deals with the obnoxious
    GNU Regex syntax bits is located here.  The poor, suffering
    reader is referred to the GNU Regex documentation for the
@@ -917,6 +957,10 @@ parse_bracket_exp (void)
   int c, c1, c2;
   charclass ccl;
 
+  /* True if this is a bracket expression that dfaexec is known to
+     process correctly.  */
+  bool known_bracket_exp = true;
+
   /* Used to warn about [:space:].
      Bit 0 = first character is a colon.
      Bit 1 = last character is a colon.
@@ -958,6 +1002,7 @@ parse_bracket_exp (void)
     {
       FETCH_WC (c, wc, _("unbalanced ["));
       invert = 1;
+      known_bracket_exp = using_simple_locale ();
     }
   else
     invert = 0;
@@ -972,16 +1017,14 @@ parse_bracket_exp (void)
          we just treat it as a bunch of ordinary characters.  We can do
          this because we assume regex has checked for syntax errors before
          dfa is ever called.  */
-      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+      if (c == '[')
         {
 #define MAX_BRACKET_STRING_LEN 32
           char str[MAX_BRACKET_STRING_LEN + 1];
           FETCH_WC (c1, wc1, _("unbalanced ["));
 
-          /* If pattern contains '[[:', '[[.', or '[[='.  */
-          if (c1 == ':'
-              /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1.  */
-              || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+          if ((c1 == ':' && syntax_bits & RE_CHAR_CLASSES)
+              || c1 == '.' || c1 == '=')
             {
               size_t len = 0;
               for (;;)
@@ -1000,7 +1043,10 @@ parse_bracket_exp (void)
               /* Fetch bracket.  */
               FETCH_WC (c, wc, _("unbalanced ["));
               if (c1 == ':')
-                /* build character class.  */
+                /* Build character class.  POSIX allows character
+                   classes to match multicharacter collating elements,
+                   but the regex code does not support that, so do not
+                   worry about that possibility.  */
                 {
                   char const *class
                     = (case_fold && (STREQ (str, "upper")
@@ -1024,28 +1070,9 @@ parse_bracket_exp (void)
                     if (pred->func (c2))
                       setbit_case_fold_c (c2, ccl);
                 }
+              else
+                known_bracket_exp = false;
 
-              else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
-                {
-                  char *elem = xmemdup (str, len + 1);
-
-                  if (c1 == '=')
-                    /* build equivalence class.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->equivs,
-                                            equivs_al, work_mbc->nequivs + 1);
-                      work_mbc->equivs[work_mbc->nequivs++] = elem;
-                    }
-
-                  if (c1 == '.')
-                    /* build collating element.  */
-                    {
-                      REALLOC_IF_NECESSARY (work_mbc->coll_elems,
-                                            coll_elems_al,
-                                            work_mbc->ncoll_elems + 1);
-                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
-                    }
-                }
               colon_warning_state |= 8;
 
               /* Fetch new lookahead character.  */
@@ -1067,6 +1094,16 @@ parse_bracket_exp (void)
         /* build range characters.  */
         {
           FETCH_WC (c2, wc2, _("unbalanced ["));
+
+          /* A bracket expression like [a-[.aa.]] matches an unknown set.
+             Treat it like [-a[.aa.]] while parsing it, and
+             remember that the set is unknown.  */
+          if (c2 == '[' && *lexptr == '.')
+            {
+              known_bracket_exp = false;
+              c2 = ']';
+            }
+
           if (c2 == ']')
             {
               /* In the case [x-], the - is an ordinary hyphen,
@@ -1104,36 +1141,11 @@ parse_bracket_exp (void)
                   work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
                 }
             }
+          else if (using_simple_locale ())
+            for (; c <= c2; c++)
+              setbit_case_fold_c (c, ccl);
           else
-            {
-              /* Defer to the system regex library about the meaning
-                 of range expressions.  */
-              struct re_pattern_buffer re = { 0 };
-              char const *compile_msg;
-#if 199901 <= __STDC_VERSION__
-              char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
-#else
-              char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
-              pattern[2] = c;
-              pattern[5] = c2;
-#endif
-              re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
-              compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
-              if (compile_msg)
-                dfaerror (compile_msg);
-              for (c = 0; c < NOTCHAR; c++)
-                {
-                  char subject = c;
-                  switch (re_match (&re, &subject, 1, 0, NULL))
-                    {
-                    case 1: setbit (c, ccl); break;
-                    case -1: break;
-                    default: xalloc_die ();
-                    }
-                }
-              regfree (&re);
-              re_set_syntax (syntax_bits);
-            }
+            known_bracket_exp = false;
 
           colon_warning_state |= 8;
           FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -1171,6 +1183,9 @@ parse_bracket_exp (void)
   if (colon_warning_state == 7)
     dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
 
+  if (! known_bracket_exp)
+    return BACKREF;
+
   if (MB_CUR_MAX > 1)
     {
       static charclass zeroclass;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 742a580..972ffc5 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -86,6 +86,7 @@ TESTS =                                               \
   pcre-w                                       \
   pcre-wx-backref                              \
   pcre-z                                       \
+  posix-bracket                                        \
   prefix-of-multibyte                          \
   r-dot                                                \
   repetition-overflow                          \
diff --git a/tests/posix-bracket b/tests/posix-bracket
new file mode 100755
index 0000000..d9d1d84
--- /dev/null
+++ b/tests/posix-bracket
@@ -0,0 +1,33 @@
+#!/bin/sh
+# Check various bracket expressions in the POSIX locale.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+LC_ALL=C
+export LC_ALL
+
+fail=0
+
+echo a >in || framework_failure_
+for bracketed in '[.a.]' '[.a.]-a' 'a-[.a.]' '[.a.]-[.a.]' \
+    '[=a=]' '[:alpha:]'; do
+  grep "[$bracketed]" in >out || fail=1
+  compare in out || fail=1
+  grep "[^$bracketed]" in >out && fail=1
+  compare /dev/null out || fail=1
+done
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                               |    4 +
 src/dfa.c                                          |  129 +++++++++++---------
 tests/Makefile.am                                  |    1 +
 .../{unibyte-negated-circumflex => posix-bracket}  |   14 ++-
 4 files changed, 87 insertions(+), 61 deletions(-)
 copy tests/{unibyte-negated-circumflex => posix-bracket} (71%)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.18-4-g8b16e90, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.18-3-g0b5003d
Next by Date: grep branch, master, updated. v2.18-6-gfd56955
Previous by thread: grep branch, master, updated. v2.18-3-g0b5003d
Next by thread: grep branch, master, updated. v2.18-6-gfd56955
Index(es):
- Date
- Thread