[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets
From: |
Paolo Bonzini |
Subject: |
[PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets |
Date: |
Fri, 12 Mar 2010 18:49:14 +0100 |
Only use a bitset when possible without involving MBCSET. Testcase:
yes 'the quick brown fox jumps over the lazy dog' | sed 100000q | \
time grep -c '[ABCDEFGHIJKLMNOPQRSTUVWXYZ,]'
Before: 51ms (best of three runs); after: 16ms (best of three runs).
* src/dfa.c (check_utf8, using_utf8): New.
(parse_bracket_exp): For simple bracket expressions under UTF-8,
use a CSET.
(dfacomp): Call check_utf8.
---
src/dfa.c | 33 ++++++++++++++++++++++++++++++++-
1 files changed, 32 insertions(+), 1 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index add6ebd..f17f550 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -84,6 +84,7 @@
/* We can handle multibyte strings. */
# include <wchar.h>
# include <wctype.h>
+# include <langinfo.h>
#endif
#include "regex.h"
@@ -296,8 +297,27 @@ static wchar_t *inputwcs; /* Wide character representation of input
And inputwcs[i] is the codepoint. */
static unsigned char const *buf_begin; /* reference to begin in dfaexec(). */
static unsigned char const *buf_end; /* reference to end in dfaexec(). */
+
+/* UTF-8 encoding allows some optimizations that we can't otherwise
+ assume in a multibyte encoding. */
+static int using_utf8;
+
+void
+check_utf8 (void)
+{
+#ifdef HAVE_LANGINFO_CODESET
+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0)
+ using_utf8 = 1;
+#endif
+}
+#else
+void
+check_utf8 (void)
+{
+}
#endif /* MBS_SUPPORT */
+
#ifdef MBS_SUPPORT
/* Note that characters become unsigned here. */
# define FETCH_WC(c, wc, eoferr) \
@@ -688,7 +708,14 @@ parse_bracket_exp (void)
while ((wc = wc1, (c = c1) != L']'));
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
+ if (MB_CUR_MAX > 1
+ && (!using_utf8
+ || invert
+ || work_mbc->nchars != 0
+ || work_mbc->nch_classes != 0
+ || work_mbc->nranges != 0
+ || work_mbc->nequivs != 0
+ || work_mbc->ncoll_elems != 0))
{
static charclass zeroclass;
work_mbc->invert = invert;
@@ -699,6 +726,9 @@ parse_bracket_exp (void)
if (invert)
{
+#ifdef MBS_SUPPORT
+ assert(MB_CUR_MAX == 1);
+#endif
notset(ccl);
if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
clrbit(eolbyte, ccl);
@@ -2916,6 +2946,7 @@ dfainit (struct dfa *d)
void
dfacomp (char const *s, size_t len, struct dfa *d, int searchflag)
{
+ check_utf8();
dfainit(d);
dfaparse(s, len, d);
dfamust(d);
--
1.6.6
- [PATCH 07/17] syntax-check: enable makefile-TAB-only-indentation, (continued)
- [PATCH 07/17] syntax-check: enable makefile-TAB-only-indentation, Paolo Bonzini, 2010/03/12
- [PATCH 06/17] grep: fix error-message-uppercase, Paolo Bonzini, 2010/03/12
- [PATCH 09/17] syntax-check: enable space-tab, Paolo Bonzini, 2010/03/12
- [PATCH 08/17] syntax-check: enable m4-quote-check, Paolo Bonzini, 2010/03/12
- [PATCH 10/17] tests: add more UTF-8 test cases, Paolo Bonzini, 2010/03/12
- [PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets,
Paolo Bonzini <=
- [PATCH 12/17] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/12
- [PATCH 11/17] dfa: rewrite handling of multibyte case folding, Paolo Bonzini, 2010/03/12
- [PATCH 14/17] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/12
- [PATCH 15/17] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/12
- [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/12
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/13
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/15
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/19