[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 3/3] dfa: optimize UTF-8 period
From: |
Paolo Bonzini |
Subject: |
[PATCH 3/3] dfa: optimize UTF-8 period |
Date: |
Sat, 17 Apr 2010 03:34:54 +0200 |
* NEWS: Document improvement.
* src/dfa.c (struct dfa): Add utf8_anychar_classes.
(add_utf8_anychar): New.
(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR
in UTF-8 locales.
(dfaoptimize): Abort on ANYCHAR.
---
NEWS | 6 ++++++
src/dfa.c | 46 +++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 49 insertions(+), 3 deletions(-)
diff --git a/NEWS b/NEWS
index bcca373..80074a7 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,12 @@ GNU grep NEWS -*- outline
-*-
X{0,0} is implemented correctly. It used to be a synonym of X{0,1}.
[bug present since "the beginning"]
+
+ In UTF-8 locales, regular expressions including "." can be orders
+ of magnitude faster. For example, "grep ." is now twice as fast
+ as "grep -v ^$", instead of being immensely slower. It remains
+ slow in other multibyte locales. [bug present since multi-byte
+ character set support was introduced in 2.5.2]
* Noteworthy changes in release 2.6.3 (2010-04-02) [stable]
diff --git a/src/dfa.c b/src/dfa.c
index 9b67240..6e57fb0 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -311,6 +311,7 @@ struct dfa
with dfaparse(). */
#if MBS_SUPPORT
unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */
+ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */
/* The following are used only if MB_CUR_MAX > 1. */
@@ -1475,6 +1476,35 @@ addtok_wc (wint_t wc)
}
#endif
+static void
+add_utf8_anychar (void)
+{
+ static charclass utf8_classes[5] = {
+ { 0, 0, 0, 0, ~0, ~0, 0, 0 },
+ { ~0, ~0, ~0, ~0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0xfffffffcU, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0xffff },
+ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 }
+ };
+ const int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
+ int i;
+
+ if (dfa->utf8_anychar_classes[0] == 0)
+ for (i = 0; i < n; i++)
+ dfa->utf8_anychar_classes[i] = CSET + charclass_index(utf8_classes[i]);
+
+ /* (B|CA|DAA|EAAA) = (B|(C|(D|EA)A)A) =
+ B (C (D (E A CAT) OR A CAT) OR A CAT) OR */
+ for (i = 1; i < n; i++)
+ addtok (dfa->utf8_anychar_classes[i]);
+ while (--i > 1)
+ {
+ addtok (dfa->utf8_anychar_classes[0]);
+ addtok (CAT);
+ addtok (OR);
+ }
+}
+
/* The grammar understood by the parser is as follows.
regexp:
@@ -1513,8 +1543,11 @@ addtok_wc (wint_t wc)
static void
atom (void)
{
+ if (0)
+ ;
+
#if MBS_SUPPORT
- if (tok == WCHAR)
+ else if (tok == WCHAR)
{
addtok_wc (case_fold ? towlower(wctok) : wctok);
#ifndef GREP
@@ -1526,11 +1559,16 @@ atom (void)
#endif
tok = lex();
- return;
+ }
+
+ else if (tok == ANYCHAR && using_utf8())
+ {
+ add_utf8_anychar();
+ tok = lex();
}
#endif /* MBS_SUPPORT */
- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
+ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
#if MBS_SUPPORT
|| tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */
@@ -3307,6 +3345,8 @@ dfaoptimize (struct dfa *d)
switch(d->tokens[i])
{
case ANYCHAR:
+ /* Lowered. */
+ abort ();
case MBCSET:
/* Requires multi-byte algorithm. */
return;
--
1.6.6.1
Re: [PATCH 1/3] dfa: simplify dfainit, Jim Meyering, 2010/04/17