[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.8-16-g126cc2e
From: |
Paolo Bonzini |
Subject: |
grep branch, master, updated. v2.8-16-g126cc2e |
Date: |
Tue, 07 Jun 2011 11:32:12 +0000 |
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".
The branch, master has been updated
via 126cc2ea4d8579f97a8d1071b37ff638c22c2b36 (commit)
via 4b29aa3ca3498fa26e8646a6cebb5f50c396ab6c (commit)
from c4bf2934edfa8eea03e0a8c5028e5d2cdaa63968 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=126cc2ea4d8579f97a8d1071b37ff638c22c2b36
commit 126cc2ea4d8579f97a8d1071b37ff638c22c2b36
Author: Paolo Bonzini <address@hidden>
Date: Tue Jun 7 12:24:38 2011 +0200
dfa: optimize wide characters in a bracket expression
* src/dfa.c (addtok): Compile characters to an alternation. Handle the
case when nothing else remains in the MBCSET.
diff --git a/NEWS b/NEWS
index 67b3fad..d026448 100644
--- a/NEWS
+++ b/NEWS
@@ -4,9 +4,13 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep is faster on regular expressions that match multibyte characters
+ in brackets (such as '[áéíóú]').
+
echo c|grep '[c]' would fail for any c in 0x80..0xff, with a uni-byte
encoding for which the byte-to-wide-char mapping is nontrivial. For
example, the ISO-88591 locales are not affected, but ru_RU.KOI8-R is.
+ [bug introduced in grep-2.6]
grep -P no longer aborts when PCRE's backtracking limit is exceeded
Before, echo aaaaaaaaaaaaaab |grep -P '((a+)*)+$' would abort. Now,
diff --git a/src/dfa.c b/src/dfa.c
index 28f8daf..873530f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1447,6 +1447,8 @@ addtok_mb (token t, int mbprop)
dfa->depth = depth;
}
+static void addtok_wc (wint_t wc);
+
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
@@ -1455,8 +1457,24 @@ addtok (token t)
#if MBS_SUPPORT
if (MB_CUR_MAX > 1 && t == MBCSET)
{
+ bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
+ /* Extract wide characters into alternations for better performance.
+ This does not require UTF-8. */
+ if (!work_mbc->invert)
+ {
+ int i;
+ for (i = 0; i < work_mbc->nchars; i++)
+ {
+ addtok_wc (work_mbc->chars[i]);
+ if (need_or)
+ addtok (OR);
+ need_or = true;
+ }
+ work_mbc->nchars = 0;
+ }
+
/* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
if (work_mbc->invert
|| (!using_utf8() && work_mbc->cset != -1)
@@ -1465,13 +1483,22 @@ addtok (token t)
|| work_mbc->nranges != 0
|| work_mbc->nequivs != 0
|| work_mbc->ncoll_elems != 0)
- addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ {
+ addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ if (need_or)
+ addtok (OR);
+ }
else
{
- /* The single-byte character set must be non-empty, or due to the
- test above the entire MBCSET would be empty (which is invalid). */
- assert (using_utf8() && work_mbc->cset != -1);
- addtok (CSET + work_mbc->cset);
+ /* Characters have been handled above, so it is possible
+ that the mbcset is empty now. Do nothing in that case. */
+ if (work_mbc->cset != -1)
+ {
+ assert (using_utf8 ());
+ addtok (CSET + work_mbc->cset);
+ if (need_or)
+ addtok (OR);
+ }
}
}
else
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=4b29aa3ca3498fa26e8646a6cebb5f50c396ab6c
commit 4b29aa3ca3498fa26e8646a6cebb5f50c396ab6c
Author: Paolo Bonzini <address@hidden>
Date: Tue Jun 7 12:24:38 2011 +0200
dfa: optimize wide characters in a bracket expression
* src/dfa.c (addtok): Compile characters to an alternation. Handle the
case when nothing else remains in the MBCSET.
diff --git a/NEWS b/NEWS
index 67b3fad..d026448 100644
--- a/NEWS
+++ b/NEWS
@@ -4,9 +4,13 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep is faster on regular expressions that match multibyte characters
+ in brackets (such as '[áéíóú]').
+
echo c|grep '[c]' would fail for any c in 0x80..0xff, with a uni-byte
encoding for which the byte-to-wide-char mapping is nontrivial. For
example, the ISO-88591 locales are not affected, but ru_RU.KOI8-R is.
+ [bug introduced in grep-2.6]
grep -P no longer aborts when PCRE's backtracking limit is exceeded
Before, echo aaaaaaaaaaaaaab |grep -P '((a+)*)+$' would abort. Now,
diff --git a/src/dfa.c b/src/dfa.c
index 28f8daf..873530f 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1447,6 +1447,8 @@ addtok_mb (token t, int mbprop)
dfa->depth = depth;
}
+static void addtok_wc (wint_t wc);
+
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
@@ -1455,8 +1457,24 @@ addtok (token t)
#if MBS_SUPPORT
if (MB_CUR_MAX > 1 && t == MBCSET)
{
+ bool need_or = false;
struct mb_char_classes *work_mbc = &dfa->mbcsets[dfa->nmbcsets - 1];
+ /* Extract wide characters into alternations for better performance.
+ This does not require UTF-8. */
+ if (!work_mbc->invert)
+ {
+ int i;
+ for (i = 0; i < work_mbc->nchars; i++)
+ {
+ addtok_wc (work_mbc->chars[i]);
+ if (need_or)
+ addtok (OR);
+ need_or = true;
+ }
+ work_mbc->nchars = 0;
+ }
+
/* UTF-8 allows treating a simple, non-inverted MBCSET like a CSET. */
if (work_mbc->invert
|| (!using_utf8() && work_mbc->cset != -1)
@@ -1465,13 +1483,22 @@ addtok (token t)
|| work_mbc->nranges != 0
|| work_mbc->nequivs != 0
|| work_mbc->ncoll_elems != 0)
- addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ {
+ addtok_mb (MBCSET, ((dfa->nmbcsets - 1) << 2) + 3);
+ if (need_or)
+ addtok (OR);
+ }
else
{
- /* The single-byte character set must be non-empty, or due to the
- test above the entire MBCSET would be empty (which is invalid). */
- assert (using_utf8() && work_mbc->cset != -1);
- addtok (CSET + work_mbc->cset);
+ /* Characters have been handled above, so it is possible
+ that the mbcset is empty now. Do nothing in that case. */
+ if (work_mbc->cset != -1)
+ {
+ assert (using_utf8 ());
+ addtok (CSET + work_mbc->cset);
+ if (need_or)
+ addtok (OR);
+ }
}
}
else
-----------------------------------------------------------------------
Summary of changes:
NEWS | 4 ++++
src/dfa.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++---------
2 files changed, 52 insertions(+), 9 deletions(-)
hooks/post-receive
--
grep
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- grep branch, master, updated. v2.8-16-g126cc2e,
Paolo Bonzini <=