[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match
From: |
Paolo Bonzini |
Subject: |
[PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match |
Date: |
Fri, 12 Mar 2010 18:49:17 +0100 |
Avoid computing multibyte character boundaries for the whole buffer in advance when they can be computed lazily just as efficiently
(or even more efficiently in the case of UTF-8, though that optimization is left as a TODO).
At the same time, "soften" the rejection condition for matching in the
middle of a multibyte sequence to fix bug 23814.
Multibyte "grep -i" is still very slow.
* src/search.c (check_multibyte_string): Rewrite as...
(is_mb_middle): ... this.
(EGexecute, Fexecute): Adjust.
* tests/Makefile.am (TESTS): Add euc-mb.
* tests/euc-mb: New testcase.
---
src/search.c | 67 +++++++++++++++++++++++++---------------------------
tests/Makefile.am | 1 +
tests/euc-mb | 23 ++++++++++++++++++
3 files changed, 56 insertions(+), 35 deletions(-)
create mode 100644 tests/euc-mb
diff --git a/src/search.c b/src/search.c
index 56f7068..a87e33a 100644
--- a/src/search.c
+++ b/src/search.c
@@ -219,20 +219,24 @@ kwsmusts (void)
#ifdef MBS_SUPPORT
-static char*
-check_multibyte_string(const char *buf, size_t size)
+static bool
+is_mb_middle(const char **good, const char *buf, const char *end)
{
- char *mb_properties = xcalloc(size, 1);
+ const char *p = *good, *prev = p;
mbstate_t cur_state;
wchar_t wc;
int i;
+ /* TODO: can be optimized for UTF-8. */
memset(&cur_state, 0, sizeof(mbstate_t));
-
- for (i = 0; i < size ;)
+ while (p < buf)
{
size_t mbclen;
- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
+ mbclen = mbrlen(p, end - p, &cur_state);
+
+ /* Store the beginning of the previous complete multibyte character. */
+ if (mbclen != (size_t) -2)
+ prev = p;
if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
{
@@ -240,11 +244,11 @@ check_multibyte_string(const char *buf, size_t size)
We treat it as a single byte character. */
mbclen = 1;
}
- mb_properties[i] = mbclen;
- i += mbclen;
+ p += mbclen;
}
- return mb_properties;
+ *good = prev;
+ return p > buf;
}
#endif /* MBS_SUPPORT */
@@ -366,13 +370,12 @@ COMPILE_FCT(Ecompile)
EXECUTE_FCT(EGexecute)
{
- register char const *buflim, *beg, *end, *match, *best_match;
+ char const *buflim, *beg, *end, *match, *best_match, *mb_start;
char eol = eolbyte;
int backref, start, len, best_len;
struct kwsmatch kwsm;
size_t i, ret_val;
#ifdef MBS_SUPPORT
- char *mb_properties = NULL;
if (MB_CUR_MAX > 1)
{
if (match_icase)
@@ -384,12 +387,10 @@ EXECUTE_FCT(EGexecute)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
}
-
- if (kwset)
- mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
+ mb_start = buf;
buflim = buf + size;
for (beg = end = buf; end < buflim; beg = end)
@@ -410,14 +411,18 @@ EXECUTE_FCT(EGexecute)
end++;
else
end = buflim;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
- continue;
-#endif
+ match = beg;
while (beg > buf && beg[-1] != eol)
--beg;
if (kwsm.index < kwset_exact_matches)
- goto success;
+ {
+#ifdef MBS_SUPPORT
+ if (mb_start < beg)
+ mb_start = beg;
+ if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match, buflim))
+#endif
+ goto success;
+ }
if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
continue;
}
@@ -551,10 +556,6 @@ EXECUTE_FCT(EGexecute)
*match_size = len;
ret_val = beg - buf;
out:
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
-#endif /* MBS_SUPPORT */
return ret_val;
}
#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
@@ -606,13 +607,12 @@ COMPILE_FCT(Fcompile)
EXECUTE_FCT(Fexecute)
{
- register char const *beg, *try, *end;
- register size_t len;
+ char const *beg, *try, *end, *mb_start;
+ size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
size_t ret_val;
#ifdef MBS_SUPPORT
- char *mb_properties = NULL;
if (MB_CUR_MAX > 1)
{
if (match_icase)
@@ -622,19 +622,20 @@ EXECUTE_FCT(Fexecute)
start_ptr = case_buf + (start_ptr - buf);
buf = case_buf;
}
-
- mb_properties = check_multibyte_string(buf, size);
}
#endif /* MBS_SUPPORT */
- for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+ for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
{
size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
if (offset == (size_t) -1)
goto failure;
#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
- continue; /* It is a part of multibyte character. */
+ if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
+ {
+ beg = mb_start - 1;
+ continue; /* It is a part of multibyte character. */
+ }
#endif /* MBS_SUPPORT */
beg += offset;
len = kwsmatch.size[0];
@@ -686,10 +687,6 @@ EXECUTE_FCT(Fexecute)
*match_size = len;
ret_val = beg - buf;
out:
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- free (mb_properties);
-#endif /* MBS_SUPPORT */
return ret_val;
}
#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
diff --git a/tests/Makefile.am b/tests/Makefile.am
index f915e2a..d6602bf 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -23,6 +23,7 @@ TESTS = \
dfaexec-multibyte \
empty.sh \
ere.sh \
+ euc-mb \
file.sh \
fmbtest.sh \
foad1.sh \
diff --git a/tests/euc-mb b/tests/euc-mb
new file mode 100644
index 0000000..a59c295
--- /dev/null
+++ b/tests/euc-mb
@@ -0,0 +1,23 @@
+#!/bin/sh
+# test that matches starting in the middle of a multibyte char aren't rejected
+# too greedily.
+# Derived from https://savannah.gnu.org/bugs/?23814
+: ${srcdir=.}
+. "$srcdir/init.sh"; path_prepend_ ../src
+
+make_input () {
+ echo "$1" | tr AB '\244\263'
+}
+
+euc_grep () {
+ LC_ALL=ja_JP.EUC-JP grep `make_input "$1"`
+}
+
+if make_input BABA |euc_grep AB ; then
+ skip_ 'EUC-JP locale seems not to work'
+fi
+
+make_input BABAAB |euc_grep AB || \
+ fail_ 'whole line rejected after matching in the middle of a multibyte char'
+
+exit 0
--
1.6.6
- Re: [PATCH 09/17] syntax-check: enable space-tab, (continued)
- [PATCH 08/17] syntax-check: enable m4-quote-check, Paolo Bonzini, 2010/03/12
- [PATCH 10/17] tests: add more UTF-8 test cases, Paolo Bonzini, 2010/03/12
- [PATCH 13/17] dfa: optimize simple character sets under UTF-8 charsets, Paolo Bonzini, 2010/03/12
- [PATCH 12/17] dfa: speed up handling of brackets, Paolo Bonzini, 2010/03/12
- [PATCH 11/17] dfa: rewrite handling of multibyte case folding, Paolo Bonzini, 2010/03/12
- [PATCH 14/17] dfa: cache MB_CUR_MAX for dfaexec, Paolo Bonzini, 2010/03/12
- [PATCH 15/17] dfa: run simple UTF-8 regexps as a single-byte character set, Paolo Bonzini, 2010/03/12
- [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match,
Paolo Bonzini <=
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/13
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/14
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Paolo Bonzini, 2010/03/15
- Re: [PATCH 16/17] grep: remove check_multibyte_string, fix non-UTF8 missed match, Norihiro Tanaka, 2010/03/19
[PATCH 17/17] grep: match multibyte charsets line-by-line when using -i, Paolo Bonzini, 2010/03/12
Re: [PATCH 00/16] my last hefty patch drop, Jim Meyering, 2010/03/12