[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/2] dfa: convert to wide character line-by-line
From: |
Paolo Bonzini |
Subject: |
[PATCH 1/2] dfa: convert to wide character line-by-line |
Date: |
Tue, 4 May 2010 17:37:43 +0200 |
This provides a nice speedup for -m in general, but especially
it avoids quadratic complexity in case we have to go to glibc.
* NEWS: Document change.
* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert
only up to the next newline.
(dfaexec): Exit multibyte processing loop if past buf_end.
Call prepare_wc_buf again after processing a newline.
---
NEWS | 5 +++
src/dfa.c | 97 ++++++++++++++++++++++++++++++++++++------------------------
2 files changed, 63 insertions(+), 39 deletions(-)
diff --git a/NEWS b/NEWS
index 1f3d00a..e64ec40 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,11 @@ GNU grep NEWS -*- outline
-*-
X{0,0} is implemented correctly. It used to be a synonym of X{0,1}.
[bug present since "the beginning"]
+ In multibyte locales, regular expressions including backreferences
+ no longer exhibit quadratic complexity (i.e., they are orders
+ of magnitude faster). [bug present since multi-byte character set
+ support was introduced in 2.5.2]
+
In UTF-8 locales, regular expressions including "." can be orders
of magnitude faster. For example, "grep ." is now twice as fast
as "grep -v ^$", instead of being immensely slower. It remains
diff --git a/src/dfa.c b/src/dfa.c
index 340a4c6..44efc02 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -3105,6 +3105,51 @@ transit_state (struct dfa *d, int s, unsigned char const
**pp)
#endif /* MBS_SUPPORT */
+/* Initialize mblen_buf and inputwcs with data from the next line. */
+
+static void
+prepare_wc_buf (const char *begin, const char *end)
+{
+ unsigned char eol = eolbyte;
+ size_t remain_bytes, i;
+
+ buf_begin = (unsigned char *) begin;
+
+ remain_bytes = 0;
+ for (i = 0; i < end - begin + 1; i++)
+ {
+ if (remain_bytes == 0)
+ {
+ remain_bytes
+ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
+ if (remain_bytes < 1
+ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
+ {
+ remain_bytes = 0;
+ inputwcs[i] = (wchar_t)begin[i];
+ mblen_buf[i] = 0;
+ if (begin[i] == eol)
+ break;
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes;
+ remain_bytes--;
+ }
+ }
+ else
+ {
+ mblen_buf[i] = remain_bytes;
+ inputwcs[i] = 0;
+ remain_bytes--;
+ }
+ }
+
+ buf_end = (unsigned char *) (begin + i);
+ mblen_buf[i] = 0;
+ inputwcs[i] = 0; /* sentinel */
+}
+
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3151,44 +3196,10 @@ dfaexec (struct dfa *d, char const *begin, char *end,
#if MBS_SUPPORT
if (d->mb_cur_max > 1)
{
- unsigned int i;
- int remain_bytes;
- buf_begin = (unsigned char *) begin;
- buf_end = (unsigned char *) end;
-
- /* initialize mblen_buf, and inputwcs. */
MALLOC(mblen_buf, unsigned char, end - begin + 2);
MALLOC(inputwcs, wchar_t, end - begin + 2);
- memset(&mbs, 0, sizeof mbs);
- remain_bytes = 0;
- for (i = 0; i < end - begin + 1; i++)
- {
- if (remain_bytes == 0)
- {
- remain_bytes
- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs);
- if (remain_bytes < 1
- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i]))
- {
- remain_bytes = 0;
- inputwcs[i] = (wchar_t)begin[i];
- mblen_buf[i] = 0;
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- remain_bytes--;
- }
- }
- else
- {
- mblen_buf[i] = remain_bytes;
- inputwcs[i] = 0;
- remain_bytes--;
- }
- }
- mblen_buf[i] = 0;
- inputwcs[i] = 0; /* sentinel */
+ memset(&mbs, 0, sizeof(mbstate_t));
+ prepare_wc_buf (p, end);
}
#endif /* MBS_SUPPORT */
@@ -3198,7 +3209,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
while ((t = trans[s]))
{
- if ((char *) p > end)
+ if (p > buf_end)
break;
s1 = s;
SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p);
@@ -3258,8 +3269,16 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
/* If the previous character was a newline, count it. */
- if (count && (char *) p <= end && p[-1] == eol)
- ++*count;
+ if ((char *) p <= end && p[-1] == eol)
+ {
+ if (count)
+ ++*count;
+
+#if MBS_SUPPORT
+ if (d->mb_cur_max > 1)
+ prepare_wc_buf (p, end);
+#endif
+ }
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
--
1.6.6.1
- [PATCH 1/2] dfa: convert to wide character line-by-line,
Paolo Bonzini <=
Re: [PATCH 1/2] dfa: convert to wide character line-by-line, Jim Meyering, 2010/05/05