grep branch, master, updated. v2.16-14-gb75ce6f

grep-commit

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.16-14-gb75ce6f

From:	Jim Meyering
Subject:	grep branch, master, updated. v2.16-14-gb75ce6f
Date:	Mon, 10 Feb 2014 05:06:17 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
  discards  ee333086a5ab659f7451818dbb9c31e1a05e23a4 (commit)
       via  b75ce6f7c611cb98549dc736947198e812b587c4 (commit)

This update added new revisions after undoing existing revisions.  That is
to say, the old revision is not a strict subset of the new revision.  This
situation occurs when you --force push a change and generate a repository
containing something like this:

 * -- * -- B -- O -- O -- O (ee333086a5ab659f7451818dbb9c31e1a05e23a4)
            \
             N -- N -- N (b75ce6f7c611cb98549dc736947198e812b587c4)

When this happens we assume that you've already had alert emails for all
of the O revisions, and so we here report only the revisions in the N
branch from the common base, B.

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=b75ce6f7c611cb98549dc736947198e812b587c4


commit b75ce6f7c611cb98549dc736947198e812b587c4
Author: Norihiro Tanaka <address@hidden>
Date:   Thu Jan 30 12:56:04 2014 -0800

    speed up mb-boundary-detection after each preliminary match
    
    After each kwsexec or dfaexec match, we must determine whether
    the tentative match falls in the middle of a multi-byte character.
    That is what our is_mb_middle function does, but it was expensive,
    even when most input consisted of single-byte characters.  The main
    cost was for each call to mbrlen.  This change constructs and uses
    a cache of the lengths returned by mbrlen for unibyte values.
    The largest speed-up (3x to 7x, CPU-dependent) is when most
    lines contain a match, yet few are printed, e.g., when using
    grep -v common-pattern ... to filter out all but a few lines.
    
    * src/search.h (build_mbclen_cache): Declare it.
    * src/main.c: Include "search.h".
    [MBS_SUPPORT] (main): Call build_mbclen_cache in a multibyte locale.
    * src/searchutils.c [HAVE_LANGINFO_CODESET]: Include <langinfo.h>.
    (mbclen_cache): New global.
    (build_mbclen_cache): New function.
    (is_mb_middle) [HAVE_LANGINFO_CODESET]: Use it.
    * NEWS (Improvements): Mention it.

diff --git a/NEWS b/NEWS
index a662960..2ff7272 100644
--- a/NEWS
+++ b/NEWS
@@ -7,6 +7,9 @@ GNU grep NEWS                                    -*- outline -*-
   grep -i in a multibyte locale is now typically 10 times faster
   for patterns that do not contain \ or [.
 
+  grep (without -i) in a multibyte locale is now up to 7 times faster
+  when processing many matched lines.
+
   Range expressions in unibyte locales now ordinarily use the rational
   range interpretation, in which [a-z] matches only lower-case ASCII
   letters regardless of locale, and similarly for other ranges.  (This
diff --git a/src/main.c b/src/main.c
index 42f9ff3..54d9dfc 100644
--- a/src/main.c
+++ b/src/main.c
@@ -46,6 +46,7 @@
 #include "propername.h"
 #include "quote.h"
 #include "safe-read.h"
+#include "search.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -2364,6 +2365,11 @@ main (int argc, char **argv)
         }
     }
 
+#if MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    build_mbclen_cache ();
+#endif
+
   compile (keys, keycc);
   free (keys);
 
diff --git a/src/search.h b/src/search.h
index 61dcf95..12d0822 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,7 @@ typedef signed char mb_len_map_t;
 extern void kwsinit (kwset_t *);
 
 extern char *mbtolower (const char *, size_t *, mb_len_map_t **);
+extern void build_mbclen_cache (void);
 extern bool is_mb_middle (const char **, const char *, const char *, size_t);
 
 /* dfasearch.c */
diff --git a/src/searchutils.c b/src/searchutils.c
index 778f4ad..3478417 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,9 +19,14 @@
 #include <config.h>
 #include <assert.h>
 #include "search.h"
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
 
 #define NCHAR (UCHAR_MAX + 1)
 
+static size_t mbclen_cache[NCHAR];
+
 void
 kwsinit (kwset_t *kwset)
 {
@@ -207,6 +212,20 @@ mbtolower (const char *beg, size_t *n, mb_len_map_t **len_map_p)
   return out;
 }
 
+/* Initialize a cache of mbrlen values for each of its 1-byte inputs.  */
+void
+build_mbclen_cache (void)
+{
+  int i;
+
+  for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
+    {
+      char c = i;
+      unsigned char uc = i;
+      mbstate_t mbs = { 0 };
+      mbclen_cache[uc] = mbrlen (&c, 1, &mbs);
+    }
+}
 
 bool
 is_mb_middle (const char **good, const char *buf, const char *end,
@@ -215,12 +234,31 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   const char *p = *good;
   const char *prev = p;
   mbstate_t cur_state;
+#if HAVE_LANGINFO_CODESET
+  static int is_utf8 = -1;
+
+  if (is_utf8 == -1)
+    is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
+
+  if (is_utf8 && buf - p > MB_CUR_MAX)
+    {
+      for (p = buf; buf - p > MB_CUR_MAX; p--)
+        if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
+          break;
+
+      if (buf - p == MB_CUR_MAX)
+        p = buf;
+    }
+#endif
+
+  memset (&cur_state, 0, sizeof cur_state);
 
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
   while (p < buf)
     {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
+      size_t mbclen = mbclen_cache[to_uchar (*p)];
+
+      if (mbclen == (size_t) -2)
+        mbclen = mbrlen (p, end - p, &cur_state);
 
       /* Store the beginning of the previous complete multibyte character.  */
       if (mbclen != (size_t) -2)
@@ -231,7 +269,7 @@ is_mb_middle (const char **good, const char *buf, const char *end,
           /* An invalid sequence, or a truncated multibyte character.
              We treat it as a single byte character.  */
           mbclen = 1;
-          memset(&cur_state, 0, sizeof cur_state);
+          memset (&cur_state, 0, sizeof cur_state);
         }
       p += mbclen;
     }

-----------------------------------------------------------------------

Summary of changes:


hooks/post-receive
-- 
grep

[Prev in Thread]

Current Thread

[Next in Thread]

grep branch, master, updated. v2.16-14-gb75ce6f, Jim Meyering <=

Prev by Date: grep branch, master, updated. v2.16-14-gee33308
Next by Date: grep branch, master, updated. v2.16-15-gde190ee
Previous by thread: grep branch, master, updated. v2.16-14-gee33308
Next by thread: grep branch, master, updated. v2.16-15-gde190ee
Index(es):
- Date
- Thread