From 9a5c6c856892fde5df07666d4bb6641a05f33712 Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 19 Feb 2014 19:22:24 -0800 Subject: [PATCH 1/2] maint: give dfa.c's using_utf8 function external scope * src/dfa.c (using_utf8): Remove "static inline". * src/dfa.h (using_utf8): Declare it. * src/searchutils.c (is_mb_middle): Use using_utf8 rather than rolling our own. --- src/dfa.c | 2 +- src/dfa.h | 2 ++ src/searchutils.c | 9 ++------- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index a133e03..ba9e7a2 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c) /* UTF-8 encoding allows some optimizations that we can't otherwise assume in a multibyte encoding. */ -static inline int +int using_utf8 (void) { static int utf8 = -1; diff --git a/src/dfa.h b/src/dfa.h index bacd489..7e0674f 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -99,3 +99,5 @@ extern void dfawarn (const char *); takes a single argument, a NUL-terminated string describing the error. The user must supply a dfaerror. 
*/ extern _Noreturn void dfaerror (const char *); + +extern int using_utf8 (void); diff --git a/src/searchutils.c b/src/searchutils.c index 3478417..7363701 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -19,6 +19,7 @@ #include <config.h> #include <assert.h> #include "search.h" +#include "dfa.h" #if HAVE_LANGINFO_CODESET # include <langinfo.h> #endif @@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const char *end, const char *p = *good; const char *prev = p; mbstate_t cur_state; -#if HAVE_LANGINFO_CODESET - static int is_utf8 = -1; - - if (is_utf8 == -1) - is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8"); - if (is_utf8 && buf - p > MB_CUR_MAX) + if (using_utf8 () && buf - p > MB_CUR_MAX) { for (p = buf; buf - p > MB_CUR_MAX; p--) if (mbclen_cache[to_uchar (*p)] != (size_t) -1) @@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const char *end, if (buf - p == MB_CUR_MAX) p = buf; } -#endif memset (&cur_state, 0, sizeof cur_state); -- 1.9.0 From 5295d1d528afabba15ed0710211ba24854c0c7ab Mon Sep 17 00:00:00 2001 From: Jim Meyering Date: Wed, 19 Feb 2014 19:31:43 -0800 Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte non-UTF8 locales * src/main.c: Include dfa.h. (trivial_case_ignore): Perform this optimization only for UTF8 locales. This rectifies a 200x performance regression in multi-byte non-UTF8 locales like ja_JP.eucJP. The regression was introduced by the 10x UTF8/grep-i speedup, commit v2.16-4-g97318f5. * NEWS (Bug fixes): Mention it. Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50 --- NEWS | 5 +++++ src/main.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/NEWS b/NEWS index 6785a96..49a17b0 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Bug fixes + + grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower + than in 2.16. 
[bug introduced in grep-2.17] + * Noteworthy changes in release 2.17 (2014-02-17) [stable] diff --git a/src/main.c b/src/main.c index bd20297..56ec6b3 100644 --- a/src/main.c +++ b/src/main.c @@ -34,6 +34,7 @@ #include "c-ctype.h" #include "closeout.h" #include "colorize.h" +#include "dfa.h" #include "error.h" #include "exclude.h" #include "exitfail.h" @@ -1883,6 +1884,11 @@ static bool trivial_case_ignore (size_t len, char const *keys, size_t *new_len, char **new_keys) { + /* Perform this translation only for UTF-8. Otherwise, this would induce + a 100-200x performance penalty for non-UTF8 multibyte locales. */ + if ( ! using_utf8 ()) + return false; + /* FIXME: consider removing the following restriction: Reject if KEYS contain ASCII '\\' or '['. */ if (memchr (keys, '\\', len) || memchr (keys, '[', len)) -- 1.9.0