grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.16-14-gee33308


From: Jim Meyering
Subject: grep branch, master, updated. v2.16-14-gee33308
Date: Sun, 02 Feb 2014 16:34:21 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  ee333086a5ab659f7451818dbb9c31e1a05e23a4 (commit)
      from  c5cb52ecb97af4bf052e1c1366b8eb93a54ba6a0 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=ee333086a5ab659f7451818dbb9c31e1a05e23a4


commit ee333086a5ab659f7451818dbb9c31e1a05e23a4
Author: Jim Meyering <address@hidden>
Date:   Thu Jan 30 12:56:04 2014 -0800

    speed up mb-boundary-detection after each preliminary match
    
    After each kwsexec or dfaexec match, we must determine whether
    the tentative match falls in the middle of a multi-byte character.
    That is what our is_mb_middle function does, but it was expensive,
    even when most input consisted of single-byte characters.  The main
    cost was for each call to mbrlen.  This change constructs and uses
    a cache of the lengths returned by mbrlen for unibyte values.
    The largest speed-up (3x to 7x, CPU-dependent) is when most
    lines contain a match, yet few are printed, e.g., when using
    grep -v common-pattern ... to filter out all but a few lines.
    
    * src/search.h (build_mbclen_cache): Declare it.
    * src/main.c: Include "search.h".
    [MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
    * src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
    (mbclen_cache): New global.
    (build_mbclen_cache): New function.
    (is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
    * NEWS (Improvements): Mention it.

diff --git a/NEWS b/NEWS
index a662960..2ff7272 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,9 @@ GNU grep NEWS                                    -*- outline -*-
   grep -i in a multibyte locale is now typically 10 times faster
   for patterns that do not contain \ or [.
 
+  grep (without -i) in a multibyte locale is now up to 7 times faster
+  when processing many matched lines.
+
   Range expressions in unibyte locales now ordinarily use the rational
   range interpretation, in which [a-z] matches only lower-case ASCII
   letters regardless of locale, and similarly for other ranges.  (This
diff --git a/src/main.c b/src/main.c
index 42f9ff3..54d9dfc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -46,6 +46,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "safe-read.h"
+#include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -2364,6 +2365,11 @@ main (int argc, char **argv)
         }
     }
 
+#if MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    build_mbclen_cache ();
+#endif
+
   compile (keys, keycc);
   free (keys);
 
diff --git a/src/search.h b/src/search.h
index 61dcf95..12d0822 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,7 @@ typedef signed char mb_len_map_t;
 extern void kwsinit (kwset_t *);
 
 extern char *mbtolower (const char *, size_t *, mb_len_map_t **);
+extern void build_mbclen_cache (void);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);
 
 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad..3478417 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,14 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
   return out;
 }
 
+/* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t mbs = { 0 };
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset (&cur_state, 0, sizeof cur_state);
 
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_cache[to_uchar (*p)];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
 
       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
@@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end,
           /* An invalid sequence, or a truncated multibyte character.
              We treat it as a single byte character.  */
           mbclen = 1;
-          memset(&cur_state, 0, sizeof cur_state);
+          memset (&cur_state, 0, sizeof cur_state);
         }
       p += mbclen;
     }

-----------------------------------------------------------------------

Summary of changes:
 NEWS              |    3 +++
 src/main.c        |    6 ++++++
 src/search.h      |    1 +
 src/searchutils.c |   46 ++++++++++++++++++++++++++++++++++++++++++----
 4 files changed, 52 insertions(+), 4 deletions(-)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]