From 9a5c6c856892fde5df07666d4bb6641a05f33712 Mon Sep 17 00:00:00 2001
From: Jim Meyering
Date: Wed, 19 Feb 2014 19:22:24 -0800
Subject: [PATCH 1/2] maint: give dfa.c's using_utf8 function external scope
* src/dfa.c (using_utf8): Remove "static inline".
* src/dfa.h (using_utf8): Declare it.
* src/searchutils.c (is_mb_middle): Use using_utf8 rather than
rolling our own.
---
src/dfa.c | 2 +-
src/dfa.h | 2 ++
src/searchutils.c | 9 ++-------
3 files changed, 5 insertions(+), 8 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index a133e03..ba9e7a2 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -753,7 +753,7 @@ setbit_case_fold_c (int b, charclass c)
/* UTF-8 encoding allows some optimizations that we can't otherwise
assume in a multibyte encoding. */
-static inline int
+int
using_utf8 (void)
{
static int utf8 = -1;
diff --git a/src/dfa.h b/src/dfa.h
index bacd489..7e0674f 100644
--- a/src/dfa.h
+++ b/src/dfa.h
@@ -99,3 +99,5 @@ extern void dfawarn (const char *);
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);
+
+extern int using_utf8 (void);
diff --git a/src/searchutils.c b/src/searchutils.c
index 3478417..7363701 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -19,6 +19,7 @@
#include
#include
#include "search.h"
+#include "dfa.h"
#if HAVE_LANGINFO_CODESET
# include
#endif
@@ -234,13 +235,8 @@ is_mb_middle (const char **good, const char *buf, const char *end,
const char *p = *good;
const char *prev = p;
mbstate_t cur_state;
-#if HAVE_LANGINFO_CODESET
- static int is_utf8 = -1;
-
- if (is_utf8 == -1)
- is_utf8 = STREQ (nl_langinfo (CODESET), "UTF-8");
- if (is_utf8 && buf - p > MB_CUR_MAX)
+ if (using_utf8 () && buf - p > MB_CUR_MAX)
{
for (p = buf; buf - p > MB_CUR_MAX; p--)
if (mbclen_cache[to_uchar (*p)] != (size_t) -1)
@@ -249,7 +245,6 @@ is_mb_middle (const char **good, const char *buf, const char *end,
if (buf - p == MB_CUR_MAX)
p = buf;
}
-#endif
memset (&cur_state, 0, sizeof cur_state);
--
1.9.0
From 5295d1d528afabba15ed0710211ba24854c0c7ab Mon Sep 17 00:00:00 2001
From: Jim Meyering
Date: Wed, 19 Feb 2014 19:31:43 -0800
Subject: [PATCH 2/2] grep -i: avoid 200x perf. regression in multibyte
non-UTF8 locales
* src/main.c: Include dfa.h.
(trivial_case_ignore): Perform this optimization only for UTF8 locales.
This rectifies a 200x performance regression in multi-byte non-UTF8
locales like ja_JP.eucJP. The regression was introduced by the 10x
UTF8/grep-i speedup, commit v2.16-4-g97318f5.
* NEWS (Bug fixes): Mention it.
Reported by Norihiro Tanaka in http://debbugs.gnu.org/16232#50
---
NEWS | 5 +++++
src/main.c | 6 ++++++
2 files changed, 11 insertions(+)
diff --git a/NEWS b/NEWS
index 6785a96..49a17b0 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Bug fixes
+
+ grep -i in a multibyte, non-UTF8 locale could be up to 200 times slower
+ than in 2.16. [bug introduced in grep-2.17]
+
* Noteworthy changes in release 2.17 (2014-02-17) [stable]
diff --git a/src/main.c b/src/main.c
index bd20297..56ec6b3 100644
--- a/src/main.c
+++ b/src/main.c
@@ -34,6 +34,7 @@
#include "c-ctype.h"
#include "closeout.h"
#include "colorize.h"
+#include "dfa.h"
#include "error.h"
#include "exclude.h"
#include "exitfail.h"
@@ -1883,6 +1884,11 @@ static bool
trivial_case_ignore (size_t len, char const *keys,
size_t *new_len, char **new_keys)
{
+ /* Perform this translation only for UTF-8. Otherwise, this would induce
+ a 100-200x performance penalty for non-UTF8 multibyte locales. */
+ if ( ! using_utf8 ())
+ return false;
+
/* FIXME: consider removing the following restriction:
Reject if KEYS contain ASCII '\\' or '['. */
if (memchr (keys, '\\', len) || memchr (keys, '[', len))
--
1.9.0