[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets

bug-grep

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets

From:	Paolo Bonzini
Subject:	[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets
Date:	Sun, 14 Mar 2010 16:35:10 +0100

Use a bitset whenever it is possible to avoid involving MBCSET.  Testcase:
   yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
     time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]

Before: 51ms (best of three runs); after: 16ms (best of three runs).

* src/dfa.c (check_utf8, using_utf8): New.
(parse_bracket_exp): For simple bracket expressions under UTF-8,
use a CSET.
(dfacomp): Call check_utf8.
---
 src/dfa.c |   34 +++++++++++++++++++++++++++++++++-
 1 files changed, 33 insertions(+), 1 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index ed4e1ae..da70aa1 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -21,6 +21,7 @@
    Modified July, 1988 by Arthur David Olson to assist BMG speedups  */
 
 #include <config.h>
+#include <assert.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <sys/types.h>
@@ -78,6 +79,7 @@
 /* We can handle multibyte strings. */
 # include <wchar.h>
 # include <wctype.h>
+# include <langinfo.h>
 #endif
 
 #include "regex.h"
@@ -312,8 +314,27 @@ static wchar_t *inputwcs;  /* Wide character representation of input
                                   And inputwcs[i] is the codepoint.  */
 static unsigned char const *buf_begin; /* reference to begin in dfaexec().  */
 static unsigned char const *buf_end;   /* reference to end in dfaexec().  */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+   assume in a multibyte encoding. */
+static int using_utf8;
+
+static void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+  if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+    using_utf8 = 1;
+#endif
+}
+#else
+static void
+check_utf8 (void)
+{
+}
 #endif /* MBS_SUPPORT  */
 
+
 #ifdef MBS_SUPPORT
 /* Note that characters become unsigned here. */
 # define FETCH_WC(c, wc, eoferr)               \
@@ -711,7 +732,14 @@ parse_bracket_exp (void)
   while ((wc = wc1, (c = c1) != L']'));
 
 #ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
+  if (MB_CUR_MAX > 1
+      && (!using_utf8
+         || invert
+          || work_mbc->nchars != 0
+          || work_mbc->nch_classes != 0
+          || work_mbc->nranges != 0
+          || work_mbc->nequivs != 0
+          || work_mbc->ncoll_elems != 0))
     {
       static charclass zeroclass;
       work_mbc->invert = invert;
@@ -722,6 +750,9 @@ parse_bracket_exp (void)
 
   if (invert)
     {
+#ifdef MBS_SUPPORT
+      assert(MB_CUR_MAX == 1);
+#endif
       notset(ccl);
       if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
         clrbit(eolbyte, ccl);
@@ -2940,6 +2971,7 @@ dfainit (struct dfa *d)
 void
 dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
 {
+  check_utf8();
   dfainit(d);
   dfaparse(s, len, d);
   dfamust(d);
-- 
1.6.6.1

[Prev in Thread]

Current Thread

[Next in Thread]

Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, (continued)
- [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Jim Meyering, 2010/03/16
    - Re: [PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Paolo Bonzini, 2010/03/17
- [PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 4/9] dfa: speed up handling of brackets, Jim Meyering, 2010/03/17
    - Re: [PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/17
    - Re: [PATCH 4/9] dfa: speed up handling of brackets, Jim Meyering, 2010/03/17
- [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini <=
  - Re: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Jim Meyering, 2010/03/17
    - Re: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/17
- [PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Jim Meyering, 2010/03/15
- [PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Jim Meyering, 2010/03/17
    - Re: [PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/17
- [PATCH 8/9] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/14
  - Re: [PATCH 8/9] grep: remove check_multibyte_string, fix non-UTF8 missed match, Jim Meyering, 2010/03/17
- [PATCH 9/9] grep: match multibyte charsets line-by-line when using -i, Paolo Bonzini, 2010/03/14

Prev by Date: [PATCH 4/9] dfa: speed up handling of brackets
Next by Date: [PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set
Previous by thread: Re: [PATCH 4/9] dfa: speed up handling of brackets
Next by thread: Re: [PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets
Index(es):
- Date
- Thread