>From a905358bfb2ed0d323d77c18008d62f49f0ab2c0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Apr 2010 17:13:32 +0200 Subject: [PATCH 1/2] dfa: fall back to glibc matcher if a MBCSET is found This patch works around some of the performance problems of multibyte grep. For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z]. For other character sets all brackets and `.` as well will trigger it. * NEWS: Document this. * src/dfa.c (dfaexec): Fall back to glibc for multibyte matches, if possible. --- NEWS | 3 +++ src/dfa.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 0 deletions(-) diff --git a/NEWS b/NEWS index 18289ba..922f63c 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,9 @@ GNU grep NEWS -*- outline -*- [[:digit:]]. This new behavior is disabled when the POSIXLY_CORRECT environment variable is set. + On systems using glibc, grep can support equivalence classes. To + ensure this is the case, however, you should configure grep with the + command-line option --without-included-regex. * Noteworthy changes in release 2.6.3 (2010-04-02) [stable] diff --git a/src/dfa.c b/src/dfa.c index 91124b6..10e2ca4 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3237,6 +3237,19 @@ dfaexec (struct dfa *d, char const *begin, char *end, continue; } + /* Falling back to the glibc matcher in this case gives + better performance (up to 25% better on [a-z], for + example) and enables support for collating symbols and + equivalence classes. */ + if (backref) + { + *backref = 1; + free(mblen_buf); + free(inputwcs); + *end = saved_end; + return (char *) p; + } + /* Can match with a multibyte character (and multi character collating element). Transition table might be updated. */ s = transit_state(d, s, &p); -- 1.7.1 >From 9436cd39545c24a5155bbeafa775b54f22fd83a4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Sep 2010 11:39:56 +0200 Subject: [PATCH 2/2] tests: add equiv-classes * configure.ac (USE_INCLUDED_REGEX): Add Automake conditional. * tests/equiv-classes: New test. * tests/Makefile.am (TESTS): Add it. (XFAIL_TESTS) [USE_INCLUDED_REGEX]: Mark it as expected failure. --- configure.ac | 1 + tests/Makefile.am | 7 +++++++ tests/equiv-classes | 12 ++++++++++++ 3 files changed, 20 insertions(+), 0 deletions(-) create mode 100644 tests/equiv-classes diff --git a/configure.ac b/configure.ac index 01ae0f2..481ce86 100644 --- a/configure.ac +++ b/configure.ac @@ -151,6 +151,7 @@ dnl Some installers want to be informed if we do not use our regex. dnl For example, if the host platform uses dynamic linking and the installer dnl knows that the grep may be invoked on other hosts with buggy libraries, dnl then the installer should configure --with-included-regex. +AM_CONDITIONAL([USE_INCLUDED_REGEX], [test "$ac_use_included_regex" = yes]) if test "$ac_use_included_regex" = no; then AC_MSG_WARN([Included lib/regex.c not used]) fi diff --git a/tests/Makefile.am b/tests/Makefile.am index 13e549d..f66543f 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -27,6 +27,12 @@ XFAIL_TESTS = \ word-delim-multibyte \ grep-dir +# Equivalence classes are only supported when using the system +# matcher (which means only with glibc). +if USE_INCLUDED_REGEX +XFAIL_TESTS += equiv-classes +endif + TESTS = \ backref \ backref-word \ @@ -41,6 +47,7 @@ TESTS = \ dfaexec-multibyte \ empty \ ere \ + equiv-classes \ euc-mb \ fedora \ fgrep-infloop \ diff --git a/tests/equiv-classes b/tests/equiv-classes new file mode 100644 index 0000000..de38d95 --- /dev/null +++ b/tests/equiv-classes @@ -0,0 +1,12 @@ +#!/bin/sh +# Test that equivalence classes work. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ + +LC_ALL=en_US.UTF-8 +export LC_ALL + +echo à | grep '[[=a=]]' > /dev/null +Exit $? -- 1.7.1