From cbf32a5096bf394c3031067fc174ea526d41876d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Sep 2010 09:26:23 +0200 Subject: [PATCH 1/3] gnulib: update to latest This is done to include commit "regex: Pass the system regex if its only problem is 32-bit regoff_t". * gnulib: Update to e2b0e1a. --- gnulib | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/gnulib b/gnulib index 8895249..e2b0e1a 160000 --- a/gnulib +++ b/gnulib @@ -1 +1 @@ -Subproject commit 88952495193d0d183a50de8c75b58e7c5bda7b91 +Subproject commit e2b0e1a20b071f1516829ffe5bbddbc41007fd63 -- 1.7.1 From d37022ce1838d55b94dfb7b0d0b26992129ca952 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 29 Apr 2010 17:13:32 +0200 Subject: [PATCH 2/3] dfa: fall back to glibc matcher if a MBCSET is found This patch enables full support of equivalence classes and multicharacter collation symbols. It can also alleviate performance problems in some cases for multibyte grep. Both of these changes, however, depend on the glibc version installed in the system. For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z]. For other character sets all bracket expressions, as well as `.`, will trigger it. * NEWS: Document this. * src/dfa.c (dfaexec): Fall back to glibc for multibyte matches, if possible. --- NEWS | 2 ++ src/dfa.c | 13 +++++++++++++ 2 files changed, 15 insertions(+), 0 deletions(-) diff --git a/NEWS b/NEWS index 18289ba..b669bc0 100644 --- a/NEWS +++ b/NEWS @@ -36,6 +36,8 @@ GNU grep NEWS -*- outline -*- [[:digit:]]. This new behavior is disabled when the POSIXLY_CORRECT environment variable is set. + On systems using glibc, grep can support equivalence classes. However, + whether they actually work depends on glibc's locale definitions.
* Noteworthy changes in release 2.6.3 (2010-04-02) [stable] diff --git a/src/dfa.c b/src/dfa.c index 91124b6..63a63a3 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3237,6 +3237,19 @@ dfaexec (struct dfa *d, char const *begin, char *end, continue; } + /* Falling back to the glibc matcher in this case gives + better performance (up to 25% better on [a-z], for + example) and enables support for collating symbols and + equivalence classes. */ + if (backref) + { + *backref = 1; + free(mblen_buf); + free(inputwcs); + *end = saved_end; + return (char *) p; + } + /* Can match with a multibyte character (and multi character collating element). Transition table might be updated. */ s = transit_state(d, s, &p); -- 1.7.1 From db810985f3b61b54e4e69e5968d16b2205748ff8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 8 Sep 2010 11:39:56 +0200 Subject: [PATCH 3/3] tests: add equiv-classes * configure.ac (USE_INCLUDED_REGEX): Add Automake conditional. * tests/equiv-classes: New test. * tests/Makefile.am (TESTS): Add it. (XFAIL_TESTS) [USE_INCLUDED_REGEX]: Mark it as expected failure. --- configure.ac | 1 + tests/Makefile.am | 7 +++++++ tests/equiv-classes | 12 ++++++++++++ 3 files changed, 20 insertions(+), 0 deletions(-) create mode 100644 tests/equiv-classes diff --git a/configure.ac b/configure.ac index 01ae0f2..481ce86 100644 --- a/configure.ac +++ b/configure.ac @@ -151,6 +151,7 @@ dnl Some installers want to be informed if we do not use our regex. dnl For example, if the host platform uses dynamic linking and the installer dnl knows that the grep may be invoked on other hosts with buggy libraries, dnl then the installer should configure --with-included-regex. 
+AM_CONDITIONAL([USE_INCLUDED_REGEX], [test "$ac_use_included_regex" = yes]) if test "$ac_use_included_regex" = no; then AC_MSG_WARN([Included lib/regex.c not used]) fi diff --git a/tests/Makefile.am b/tests/Makefile.am index 13e549d..f66543f 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -27,6 +27,12 @@ XFAIL_TESTS = \ word-delim-multibyte \ grep-dir +# Equivalence classes are only supported when using the system +# matcher (which means only with glibc). +if USE_INCLUDED_REGEX +XFAIL_TESTS += equiv-classes +endif + TESTS = \ backref \ backref-word \ @@ -41,6 +47,7 @@ TESTS = \ dfaexec-multibyte \ empty \ ere \ + equiv-classes \ euc-mb \ fedora \ fgrep-infloop \ diff --git a/tests/equiv-classes b/tests/equiv-classes new file mode 100644 index 0000000..de38d95 --- /dev/null +++ b/tests/equiv-classes @@ -0,0 +1,12 @@ +#!/bin/sh +# Test that equivalence classes work. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src + +require_en_utf8_locale_ + +LC_ALL=en_US.UTF-8 +export LC_ALL + +echo à | grep '[[=a=]]' > /dev/null +Exit $? -- 1.7.1