[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets
From: |
Paolo Bonzini |
Subject: |
[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets |
Date: |
Sun, 14 Mar 2010 16:35:10 +0100 |
Use a bitset (CSET) instead of an MBCSET whenever possible. Testcase:
yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
time grep -c [ABCDEFGHIJKLMNOPQRSTUVWXYZ,]
Before: 51ms (best of three runs); after: 16ms (best of three runs).
* src/dfa.c (check_utf8, using_utf8): New.
(parse_bracket_exp): For simple bracket expressions under UTF-8,
use a CSET.
(dfacomp): Call check_utf8.
---
src/dfa.c | 34 +++++++++++++++++++++++++++++++++-
1 files changed, 33 insertions(+), 1 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index ed4e1ae..da70aa1 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -21,6 +21,7 @@
Modified July, 1988 by Arthur David Olson to assist BMG speedups */
#include <config.h>
+#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
@@ -78,6 +79,7 @@
/* We can handle multibyte strings. */
# include <wchar.h>
# include <wctype.h>
+# include <langinfo.h>
#endif
#include "regex.h"
@@ -312,8 +314,27 @@ static wchar_t *inputwcs; /* Wide character
representation of input
And inputwcs[i] is the codepoint. */
static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
static unsigned char const *buf_end; /* reference to end in dfaexec(). */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+ assume in a multibyte encoding. */
+static int using_utf8;
+
+static void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+ using_utf8 = 1;
+#endif
+}
+#else
+static void
+check_utf8 (void)
+{
+}
#endif /* MBS_SUPPORT */
+
#ifdef MBS_SUPPORT
/* Note that characters become unsigned here. */
# define FETCH_WC(c, wc, eoferr) \
@@ -711,7 +732,14 @@ parse_bracket_exp (void)
while ((wc = wc1, (c = c1) != L']'));
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
+ if (MB_CUR_MAX > 1
+ && (!using_utf8
+ || invert
+ || work_mbc->nchars != 0
+ || work_mbc->nch_classes != 0
+ || work_mbc->nranges != 0
+ || work_mbc->nequivs != 0
+ || work_mbc->ncoll_elems != 0))
{
static charclass zeroclass;
work_mbc->invert = invert;
@@ -722,6 +750,9 @@ parse_bracket_exp (void)
if (invert)
{
+#ifdef MBS_SUPPORT
+ assert(MB_CUR_MAX == 1);
+#endif
notset(ccl);
if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit(eolbyte, ccl);
@@ -2940,6 +2971,7 @@ dfainit (struct dfa *d)
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
+ check_utf8();
dfainit(d);
dfaparse(s, len, d);
dfamust(d);
--
1.6.6.1
- Re: [PATCH 2/9] dfa: fix handling of ranges in multibyte character sets, (continued)
[PATCH 3/9] dfa: rewrite handling of multibyte case_fold lexing, Paolo Bonzini, 2010/03/14
[PATCH 4/9] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/14
[PATCH 5/9] dfa: optimize simple character sets under UTF-8 charsets,
Paolo Bonzini <=
[PATCH 7/9] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/14
[PATCH 6/9] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/14
[PATCH 8/9] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/14
[PATCH 9/9] grep: match multibyte charsets line-by-line when using -i, Paolo Bonzini, 2010/03/14