bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets


From: Paolo Bonzini
Subject: [PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets
Date: Fri, 12 Mar 2010 18:49:14 +0100

When possible, use only a bitset, without involving an MBCSET.  Testcase:
   yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
     time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]

Before: 51ms (best of three runs); after: 16ms (best of three runs).

* src/dfa.c (check_utf8, using_utf8): New.
(parse_bracket_exp): For simple bracket expressions under UTF-8,
use a CSET.
(dfacomp): Call check_utf8.
---
 src/dfa.c |   33 ++++++++++++++++++++++++++++++++-
 1 files changed, 32 insertions(+), 1 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index add6ebd..f17f550 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -84,6 +84,7 @@
 /* We can handle multibyte strings. */
 # include <wchar.h>
 # include <wctype.h>
+# include <langinfo.h>
 #endif
 
 #include "regex.h"
@@ -296,8 +297,27 @@ static wchar_t *inputwcs;  /* Wide character representation of input
                                   And inputwcs[i] is the codepoint.  */
 static unsigned char const *buf_begin; /* reference to begin in dfaexec().  */
 static unsigned char const *buf_end;   /* reference to end in dfaexec().  */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding. */
+static int using_utf8;
+
+void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+    using_utf8 = 1;
+#endif
+}
+#else
+void
+check_utf8 (void)
+{
+}
 #endif /* MBS_SUPPORT  */
 
+
 #ifdef MBS_SUPPORT
 /* Note that characters become unsigned here. */
 # define FETCH_WC(c, wc, eoferr)               \
@@ -688,7 +708,14 @@ parse_bracket_exp (void)
   while ((wc = wc1, (c = c1) != L']'));
 
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
+  if (MB_CUR_MAX > 1
+      && (!using_utf8
+         || invert
+          || work_mbc->nchars != 0
+          || work_mbc->nch_classes != 0
+          || work_mbc->nranges != 0
+          || work_mbc->nequivs != 0
+          || work_mbc->ncoll_elems != 0))
     {
       static charclass zeroclass;
       work_mbc->invert = invert;
@@ -699,6 +726,9 @@ parse_bracket_exp (void)
 
   if (invert)
     {
+#ifdef MBS_SUPPORT
+      assert(MB_CUR_MAX == 1);
+#endif
       notset(ccl);
       if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit(eolbyte, ccl);
@@ -2916,6 +2946,7 @@ dfainit (struct dfa *d)
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
+  check_utf8();
   dfainit(d);
   dfaparse(s, len, d);
   dfamust(d);
-- 
1.6.6






reply via email to

[Prev in Thread] Current Thread [Next in Thread]