grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.8-16-g126cc2e


From: Paolo Bonzini
Subject: grep branch, master, updated. v2.8-16-g126cc2e
Date: Tue, 07 Jun 2011 11:32:12 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  126cc2ea4d8579f97a8d1071b37ff638c22c2b36 (commit)
       via  4b29aa3ca3498fa26e8646a6cebb5f50c396ab6c (commit)
      from  c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=126cc2ea4d8579f97a8d1071b37ff638c22c2b36


commit 126cc2ea4d8579f97a8d1071b37ff638c22c2b36
Author: Paolo Bonzini <address@hidden>
Date:   Tue Jun 7 12:24:38 2011 +0200

    dfa: optimize wide characters in a bracket expression
    
    * src/dfa.c (addtok): Compile characters to an alternation.  Handle the
    case when nothing else remains in the MBCSET.

diff --git a/NEWS b/NEWS
index 67b3fad..d026448 100644
--- a/NEWS
+++ b/NEWS
@@ -4,9 +4,13 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep is faster on regular expressions that match multibyte characters
+  in brackets (such as '[áéíóú]').
+
   echo c|grep '[c]' would fail for any c in 0x80..0xff, with a uni-byte
   encoding for which the byte-to-wide-char mapping is nontrivial.  For
   example, the ISO-88591 locales are not affected, but ru_RU.KOI8-R is.
+  [bug introduced in grep-2.6]
 
   grep -P no longer aborts when PCRE's backtracking limit is exceeded
   Before, echo aaaaaaaaaaaaaab |grep -P '((a+)*)+$' would abort.  Now,
diff --git a/src/dfa.c b/src/dfa.c
index 28f8daf..873530f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1447,6 +1447,8 @@ addtok_mb (token t, int mbprop)
     dfa->depth = depth;
 }
 
+static void addtok_wc (wint_t wc);
+
 /* Add the given token to the parse tree, maintaining the depth count and
    updating the maximum depth if necessary. */
 static void
@@ -1455,8 +1457,24 @@ addtok (token t)
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1 && t == MBCSET)
     {
+      bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
 
+      /* Extract wide characters into alternations for better performance.
+         This does not require UTF-8.  */
+      if (!work_mbc->invert)
+        {
+          int i;
+          for (i = 0; i < work_mbc->nchars; i++)
+            {
+              addtok_wc (work_mbc->chars[i]);
+              if (need_or)
+                addtok (OR);
+              need_or = true;
+            }
+          work_mbc->nchars = 0;
+        }
+
       /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
       if (work_mbc->invert
           || (!using_utf8() && work_mbc->cset != -1)
@@ -1465,13 +1483,22 @@ addtok (token t)
           || work_mbc->nranges != 0
           || work_mbc->nequivs != 0
           || work_mbc->ncoll_elems != 0)
-        addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+        {
+          addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+          if (need_or)
+            addtok (OR);
+        }
       else
         {
-          /* The single-byte character set must be non-empty, or due to the
-             test above the entire MBCSET would be empty (which is invalid).  */
-          assert (using_utf8() && work_mbc->cset != -1);
-          addtok (CSET + work_mbc->cset);
+          /* Characters have been handled above, so it is possible
+             that the mbcset is empty now.  Do nothing in that case.  */
+          if (work_mbc->cset != -1)
+            {
+              assert (using_utf8 ());
+              addtok (CSET + work_mbc->cset);
+              if (need_or)
+                addtok (OR);
+            }
         }
     }
   else

http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4b29aa3ca3498fa26e8646a6cebb5f50c396ab6c


commit 126cc2ea4d8579f97a8d1071b37ff638c22c2b36
Author: Paolo Bonzini <address@hidden>
Date:   Tue Jun 7 12:24:38 2011 +0200

    dfa: optimize wide characters in a bracket expression
    
    * src/dfa.c (addtok): Compile characters to an alternation.  Handle the
    case when nothing else remains in the MBCSET.

diff --git a/NEWS b/NEWS
index 67b3fad..d026448 100644
--- a/NEWS
+++ b/NEWS
@@ -4,9 +4,13 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  grep is faster on regular expressions that match multibyte characters
+  in brackets (such as '[áéíóú]').
+
   echo c|grep '[c]' would fail for any c in 0x80..0xff, with a uni-byte
   encoding for which the byte-to-wide-char mapping is nontrivial.  For
   example, the ISO-88591 locales are not affected, but ru_RU.KOI8-R is.
+  [bug introduced in grep-2.6]
 
   grep -P no longer aborts when PCRE's backtracking limit is exceeded
   Before, echo aaaaaaaaaaaaaab |grep -P '((a+)*)+$' would abort.  Now,
diff --git a/src/dfa.c b/src/dfa.c
index 28f8daf..873530f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1447,6 +1447,8 @@ addtok_mb (token t, int mbprop)
     dfa->depth = depth;
 }
 
+static void addtok_wc (wint_t wc);
+
 /* Add the given token to the parse tree, maintaining the depth count and
    updating the maximum depth if necessary. */
 static void
@@ -1455,8 +1457,24 @@ addtok (token t)
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1 && t == MBCSET)
     {
+      bool need_or = false;
       struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
 
+      /* Extract wide characters into alternations for better performance.
+         This does not require UTF-8.  */
+      if (!work_mbc->invert)
+        {
+          int i;
+          for (i = 0; i < work_mbc->nchars; i++)
+            {
+              addtok_wc (work_mbc->chars[i]);
+              if (need_or)
+                addtok (OR);
+              need_or = true;
+            }
+          work_mbc->nchars = 0;
+        }
+
       /* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET.  */
       if (work_mbc->invert
           || (!using_utf8() && work_mbc->cset != -1)
@@ -1465,13 +1483,22 @@ addtok (token t)
           || work_mbc->nranges != 0
           || work_mbc->nequivs != 0
           || work_mbc->ncoll_elems != 0)
-        addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+        {
+          addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+          if (need_or)
+            addtok (OR);
+        }
       else
         {
-          /* The single-byte character set must be non-empty, or due to the
-             test above the entire MBCSET would be empty (which is invalid).  */
-          assert (using_utf8() && work_mbc->cset != -1);
-          addtok (CSET + work_mbc->cset);
+          /* Characters have been handled above, so it is possible
+             that the mbcset is empty now.  Do nothing in that case.  */
+          if (work_mbc->cset != -1)
+            {
+              assert (using_utf8 ());
+              addtok (CSET + work_mbc->cset);
+              if (need_or)
+                addtok (OR);
+            }
         }
     }
   else

-----------------------------------------------------------------------

Summary of changes:
 NEWS      |    4 ++++
 src/dfa.c |   57 ++++++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 52 insertions(+), 9 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]