grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.18-108-g4c57448


From: Paul Eggert
Subject: grep branch, master, updated. v2.18-108-g4c57448
Date: Sun, 27 Apr 2014 03:58:53 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  4c57448135ebd424c15c774ffba8a187e32bc0a3 (commit)
       via  7600fde5bcf50872a89d427fea4add1f590ada36 (commit)
      from  d3d99505cb704563cedf4643301d54ecc717ae57 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4c57448135ebd424c15c774ffba8a187e32bc0a3


commit 4c57448135ebd424c15c774ffba8a187e32bc0a3
Author: Paul Eggert <address@hidden>
Date:   Sat Apr 26 18:18:59 2014 -0700

    kwset: speed up by using memchr2
    
    Idea suggested by Eric Blake in: http://bugs.gnu.org/17229#43
    * bootstrap.conf (gnulib_modules): Add memchr2.
    * src/kwset.c: Include stdint.h, for uintptr_t.  Include memchr2.h.
    (struct kwset): New members gc1, gc2, gc1help.
    (tr): Move earlier, so it can be used earlier.
    (kwsprep): Initialize struct kwset's new members.
    (memchr_kwset): Rename from memchr_trans.  Combine C and TRANS args into
    new arg KWSET.  All uses changed.  Use memchr2 when appropriate.
    (bmexec): Use new members instead of recomputing their values.
    Increase advance_heuristic; it's just a guess, but memchr2 probably
    makes it reasonable to increase it.

diff --git a/bootstrap.conf b/bootstrap.conf
index 367427d..9f3457d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -57,6 +57,7 @@ manywarnings
 mbrlen
 mbrtowc
 memchr
+memchr2
 mempcpy
 minmax
 obstack
diff --git a/src/kwset.c b/src/kwset.c
index 406fa0e..8270f05 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -36,8 +36,10 @@
 #include "kwset.h"
 
 #include <stdbool.h>
+#include <stdint.h>
 #include <sys/types.h>
 #include "system.h"
+#include "memchr2.h"
 #include "obstack.h"
 #include "xalloc.h"
 
@@ -91,8 +93,33 @@ struct kwset
   char *target;                        /* Target string if there's only one. */
   int *shift;                  /* Used in Boyer-Moore search for one string. */
   char const *trans;           /* Character translation table. */
+
+  /* If there's only one string, this is the string's last byte,
+     translated via TRANS if TRANS is nonnull.  */
+  char gc1;
+
+  /* Likewise for the string's penultimate byte, if it has two or more
+     bytes.  */
+  char gc2;
+
+  /* If there's only one string, this helps to match the string's last byte.
+     If GC1HELP is negative, only GC1 matches the string's last byte;
+     otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
+     If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
+     such matches, and GC1HELP is the other match after conversion to
+     unsigned char.  If GC1HELP is at least NCHAR, there are three or
+     more such matches; e.g., Greek has three sigma characters that
+     all match when case-folding.  */
+  int gc1help;
 };
 
+/* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
+static char
+tr (char const *trans, char c)
+{
+  return trans ? trans[U(c)] : c;
+}
+
 /* Allocate and initialize a keyword set object, returning an opaque
    pointer to it.  */
 kwset_t
@@ -456,6 +483,27 @@ kwsprep (kwset_t kwset)
               curr = curr->next;
             }
         }
+
+      char gc1 = tr (trans, kwset->target[kwset->mind - 1]);
+
+      /* Set GC1HELP according to whether exactly one, exactly two, or
+         three-or-more characters match GC1.  */
+      int gc1help = -1;
+      if (trans)
+        {
+          char const *equiv1 = memchr (trans, gc1, NCHAR);
+          char const *equiv2 = memchr (equiv1 + 1, gc1,
+                                       trans + NCHAR - (equiv1 + 1));
+          if (equiv2)
+            gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1))
+                       ? NCHAR
+                       : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans));
+        }
+
+      kwset->gc1 = gc1;
+      kwset->gc1help = gc1help;
+      if (kwset->mind > 1)
+        kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
     }
 
   /* Fix things up for any translation table. */
@@ -464,13 +512,6 @@ kwsprep (kwset_t kwset)
       kwset->delta[i] = delta[U(trans[i])];
 }
 
-/* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
-static char
-tr (char const *trans, char c)
-{
-  return trans ? trans[U(c)] : c;
-}
-
 /* Delta2 portion of a Boyer-Moore search.  *TP is the string text
    pointer; it is updated in place.  EP is the end of the string text,
    and SP the end of the pattern.  LEN is the pattern length; it must
@@ -524,19 +565,23 @@ bm_delta2_search (char const **tpp, char const *ep, char 
const *sp, int len,
   return false;
 }
 
-/* Return the address of the first byte in the buffer S that equals C.
-   S contains N bytes.  If TRANS is nonnull, use it to transliterate
-   S's bytes before comparing them.  */
+/* Return the address of the first byte in the buffer S (of size N)
+   that matches the last byte specified by KWSET, a singleton.  */
 static char const *
-memchr_trans (char const *s, char c, size_t n, char const *trans)
+memchr_kwset (char const *s, size_t n, kwset_t kwset)
 {
-  if (! trans)
-    return memchr (s, c, n);
-  char const *slim = s + n;
+  if (kwset->gc1help < 0)
+    return memchr (s, kwset->gc1, n);
+  int small_heuristic = 2;
+  int small = (- (uintptr_t) s % sizeof (long)
+               + small_heuristic * sizeof (long));
+  size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n;
+  char const *slim = s + ntrans;
   for (; s < slim; s++)
-    if (trans[U(*s)] == c)
+    if (kwset->trans[U(*s)] == kwset->gc1)
       return s;
-  return NULL;
+  n -= ntrans;
+  return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n);
 }
 
 /* Fast boyer-moore search. */
@@ -555,15 +600,15 @@ bmexec (kwset_t kwset, char const *text, size_t size)
     return -1;
   if (len == 1)
     {
-      tp = memchr_trans (text, kwset->target[0], size, trans);
+      tp = memchr_kwset (text, size, kwset);
       return tp ? tp - text : -1;
     }
 
   d1 = kwset->delta;
   sp = kwset->target + len;
   tp = text + len;
-  char gc1 = tr (trans, sp[-1]);
-  char gc2 = tr (trans, sp[-2]);
+  char gc1 = kwset->gc1;
+  char gc2 = kwset->gc2;
 
   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
   if (size > 12 * len)
@@ -590,11 +635,11 @@ bmexec (kwset_t kwset, char const *text, size_t size)
 
                     /* As a heuristic, prefer memchr to seeking by
                        delta1 when the latter doesn't advance much.  */
-                    int advance_heuristic = 4 * sizeof (long);
+                    int advance_heuristic = 16 * sizeof (long);
                     if (advance_heuristic <= tp - tp0)
                       goto big_advance;
                     tp--;
-                    tp = memchr_trans (tp, gc1, text + size - tp, trans);
+                    tp = memchr_kwset (tp, text + size - tp, kwset);
                     if (! tp)
                       return -1;
                     tp++;

http://git.savannah.gnu.org/cgit/grep.git/commit/?id=7600fde5bcf50872a89d427fea4add1f590ada36


commit 4c57448135ebd424c15c774ffba8a187e32bc0a3
Author: Paul Eggert <address@hidden>
Date:   Sat Apr 26 18:18:59 2014 -0700

    kwset: speed up by using memchr2
    
    Idea suggested by Eric Blake in: http://bugs.gnu.org/17229#43
    * bootstrap.conf (gnulib_modules): Add memchr2.
    * src/kwset.c: Include stdint.h, for uintptr_t.  Include memchr2.h.
    (struct kwset): New members gc1, gc2, gc1help.
    (tr): Move earlier, so it can be used earlier.
    (kwsprep): Initialize struct kwset's new members.
    (memchr_kwset): Rename from memchr_trans.  Combine C and TRANS args into
    new arg KWSET.  All uses changed.  Use memchr2 when appropriate.
    (bmexec): Use new members instead of recomputing their values.
    Increase advance_heuristic; it's just a guess, but memchr2 probably
    makes it reasonable to increase it.

diff --git a/bootstrap.conf b/bootstrap.conf
index 367427d..9f3457d 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -57,6 +57,7 @@ manywarnings
 mbrlen
 mbrtowc
 memchr
+memchr2
 mempcpy
 minmax
 obstack
diff --git a/src/kwset.c b/src/kwset.c
index 406fa0e..8270f05 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -36,8 +36,10 @@
 #include "kwset.h"
 
 #include <stdbool.h>
+#include <stdint.h>
 #include <sys/types.h>
 #include "system.h"
+#include "memchr2.h"
 #include "obstack.h"
 #include "xalloc.h"
 
@@ -91,8 +93,33 @@ struct kwset
   char *target;                        /* Target string if there's only one. */
   int *shift;                  /* Used in Boyer-Moore search for one string. */
   char const *trans;           /* Character translation table. */
+
+  /* If there's only one string, this is the string's last byte,
+     translated via TRANS if TRANS is nonnull.  */
+  char gc1;
+
+  /* Likewise for the string's penultimate byte, if it has two or more
+     bytes.  */
+  char gc2;
+
+  /* If there's only one string, this helps to match the string's last byte.
+     If GC1HELP is negative, only GC1 matches the string's last byte;
+     otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
+     If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
+     such matches, and GC1HELP is the other match after conversion to
+     unsigned char.  If GC1HELP is at least NCHAR, there are three or
+     more such matches; e.g., Greek has three sigma characters that
+     all match when case-folding.  */
+  int gc1help;
 };
 
+/* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
+static char
+tr (char const *trans, char c)
+{
+  return trans ? trans[U(c)] : c;
+}
+
 /* Allocate and initialize a keyword set object, returning an opaque
    pointer to it.  */
 kwset_t
@@ -456,6 +483,27 @@ kwsprep (kwset_t kwset)
               curr = curr->next;
             }
         }
+
+      char gc1 = tr (trans, kwset->target[kwset->mind - 1]);
+
+      /* Set GC1HELP according to whether exactly one, exactly two, or
+         three-or-more characters match GC1.  */
+      int gc1help = -1;
+      if (trans)
+        {
+          char const *equiv1 = memchr (trans, gc1, NCHAR);
+          char const *equiv2 = memchr (equiv1 + 1, gc1,
+                                       trans + NCHAR - (equiv1 + 1));
+          if (equiv2)
+            gc1help = (memchr (equiv2 + 1, gc1, trans + NCHAR - (equiv2 + 1))
+                       ? NCHAR
+                       : U(gc1) ^ (equiv1 - trans) ^ (equiv2 - trans));
+        }
+
+      kwset->gc1 = gc1;
+      kwset->gc1help = gc1help;
+      if (kwset->mind > 1)
+        kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
     }
 
   /* Fix things up for any translation table. */
@@ -464,13 +512,6 @@ kwsprep (kwset_t kwset)
       kwset->delta[i] = delta[U(trans[i])];
 }
 
-/* Use TRANS to transliterate C.  A null TRANS does no transliteration.  */
-static char
-tr (char const *trans, char c)
-{
-  return trans ? trans[U(c)] : c;
-}
-
 /* Delta2 portion of a Boyer-Moore search.  *TP is the string text
    pointer; it is updated in place.  EP is the end of the string text,
    and SP the end of the pattern.  LEN is the pattern length; it must
@@ -524,19 +565,23 @@ bm_delta2_search (char const **tpp, char const *ep, char 
const *sp, int len,
   return false;
 }
 
-/* Return the address of the first byte in the buffer S that equals C.
-   S contains N bytes.  If TRANS is nonnull, use it to transliterate
-   S's bytes before comparing them.  */
+/* Return the address of the first byte in the buffer S (of size N)
+   that matches the last byte specified by KWSET, a singleton.  */
 static char const *
-memchr_trans (char const *s, char c, size_t n, char const *trans)
+memchr_kwset (char const *s, size_t n, kwset_t kwset)
 {
-  if (! trans)
-    return memchr (s, c, n);
-  char const *slim = s + n;
+  if (kwset->gc1help < 0)
+    return memchr (s, kwset->gc1, n);
+  int small_heuristic = 2;
+  int small = (- (uintptr_t) s % sizeof (long)
+               + small_heuristic * sizeof (long));
+  size_t ntrans = kwset->gc1help < NCHAR && small < n ? small : n;
+  char const *slim = s + ntrans;
   for (; s < slim; s++)
-    if (trans[U(*s)] == c)
+    if (kwset->trans[U(*s)] == kwset->gc1)
       return s;
-  return NULL;
+  n -= ntrans;
+  return n == 0 ? NULL : memchr2 (s, kwset->gc1, kwset->gc1help, n);
 }
 
 /* Fast boyer-moore search. */
@@ -555,15 +600,15 @@ bmexec (kwset_t kwset, char const *text, size_t size)
     return -1;
   if (len == 1)
     {
-      tp = memchr_trans (text, kwset->target[0], size, trans);
+      tp = memchr_kwset (text, size, kwset);
       return tp ? tp - text : -1;
     }
 
   d1 = kwset->delta;
   sp = kwset->target + len;
   tp = text + len;
-  char gc1 = tr (trans, sp[-1]);
-  char gc2 = tr (trans, sp[-2]);
+  char gc1 = kwset->gc1;
+  char gc2 = kwset->gc2;
 
   /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
   if (size > 12 * len)
@@ -590,11 +635,11 @@ bmexec (kwset_t kwset, char const *text, size_t size)
 
                     /* As a heuristic, prefer memchr to seeking by
                        delta1 when the latter doesn't advance much.  */
-                    int advance_heuristic = 4 * sizeof (long);
+                    int advance_heuristic = 16 * sizeof (long);
                     if (advance_heuristic <= tp - tp0)
                       goto big_advance;
                     tp--;
-                    tp = memchr_trans (tp, gc1, text + size - tp, trans);
+                    tp = memchr_kwset (tp, text + size - tp, kwset);
                     if (! tp)
                       return -1;
                     tp++;

-----------------------------------------------------------------------

Summary of changes:
 bootstrap.conf |    1 +
 src/kwset.c    |   98 ++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 76 insertions(+), 23 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]