From 74c8e3d18ca14ea502da63f34bd7f97fcd6ffd65 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Sat, 29 Mar 2014 20:59:53 +0900
Subject: [PATCH] grep: speed-up of DFA by checking multibyte characters on
demand
If dfaexec() runs in non-UTF8 locales, length and wide character
representation are checked for all characters of a line in a input
string. However, if matched early in the line, results for remaining
characters are wasted.
This patch checks multibyte characters on demand. It enables to
accomplish to speed-up for matched early and reduce required memories.
* src/dfa.c (struct dfa): Remove members.
(buf_begin, buf_end): No longer they are used. Remove them.
(SKIP_REMAINS_MB_IF_INITIAL_STATE): Now, the content of the macro is
extended in dfaexec().
(transit_state_singlebyte): sem check is removed.
(match_anychar): Doesn't use `mblen_buf and' `inputwcs'.
(match_mb_charset): Doesn't use `mblen_buf and' `inputwcs'.
(check_matching_with_multibyte_ops): Modify arguments.
(transit_state_consume_1char): Modify arguments.
(transit_state): Doesn't use `mblen_buf and' `inputwcs'.
(prepare_wc_buf): Remove it.
(dfaexec): Doesn't use `mblen_buf and' `inputwcs'.
---
src/dfa.c | 209 +++++++++++++++++++-------------------------------------------
1 file changed, 62 insertions(+), 147 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index ef5c8a9..ea955bf 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -420,24 +420,6 @@ struct dfa
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
- unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec.
- Each element stores the number of remaining
- bytes of the corresponding multibyte
- character in the input string. A element's
- value is 0 if the corresponding character is
- single-byte.
- e.g., input : 'a', , ,
- mblen_buf : 0, 3, 2, 1
- */
- size_t nmblen_buf; /* Allocated size of mblen_buf. */
- wchar_t *inputwcs; /* Wide character representation of the input
- string in dfaexec.
- The length of this array is the same as
- the length of input string (char array).
- inputstring[i] is a single-byte char,
- or the first byte of a multibyte char;
- inputwcs[i] is the codepoint. */
- size_t ninputwcs; /* Allocated number of inputwcs elements. */
position_set *mb_follows; /* Follow set added by ANYCHAR and/or MBCSET
on demand. */
int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or
@@ -874,8 +856,6 @@ static int cur_mb_len = 1; /* Length of the multibyte representation of
static mbstate_t mbs; /* mbstate for mbrtowc. */
static wchar_t wctok; /* Wide character representation of the current
multibyte character. */
-static unsigned char const *buf_begin; /* reference to begin in dfaexec. */
-static unsigned char const *buf_end; /* reference to end in dfaexec. */
/* Note that characters become unsigned here. */
@@ -2898,27 +2878,6 @@ build_state_zero (struct dfa *d)
/* Multibyte character handling sub-routines for dfaexec. */
-/* The initial state may encounter a byte which is not a single byte character
- nor the first byte of a multibyte character. But it is incorrect for the
- initial state to accept such a byte. For example, in Shift JIS the regular
- expression "\\" accepts the codepoint 0x5c, but should not accept the second
- byte of the codepoint 0x815c. Then the initial state must skip the bytes
- that are not a single byte character nor the first byte of a multibyte
- character. */
-#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \
- if (s == 0) \
- { \
- while (d->inputwcs[p - buf_begin] == 0 \
- && d->mblen_buf[p - buf_begin] != 0 \
- && (unsigned char const *) p < buf_end) \
- ++p; \
- if ((char *) p >= end) \
- { \
- *end = saved_end; \
- return NULL; \
- } \
- }
-
static void
realloc_trans_if_necessary (struct dfa *d, state_num new_state)
{
@@ -2975,14 +2934,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
works = 0;
}
else if (works < 0)
- {
- if (p == buf_end)
- {
- /* At the moment, it must not happen. */
- abort ();
- }
- works = 0;
- }
+ works = 0;
else if (d->fails[works])
{
works = d->fails[works][*p];
@@ -2997,18 +2949,13 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p,
return rval;
}
-/* Match a "." against the current context. buf_begin[IDX] is the
- current position. Return the length of the match, in bytes.
- POS is the position of the ".". */
+/* Match a "." against the current context. Return the length of the
+ match, in bytes. POS is the position of the ".". */
static int
-match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
+match_anychar (struct dfa *d, state_num s, position pos,
+ wchar_t wc, size_t mbclen)
{
int context;
- wchar_t wc;
- int mbclen;
-
- wc = d->inputwcs[idx];
- mbclen = MAX (1, d->mblen_buf[idx]);
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3030,16 +2977,14 @@ match_anychar (struct dfa *d, state_num s, position pos, size_t idx)
}
/* Match a bracket expression against the current context.
- buf_begin[IDX] is the current position.
Return the length of the match, in bytes.
POS is the position of the bracket expression. */
static int
-match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
+match_mb_charset (struct dfa *d, state_num s, position pos,
+ char const *p, wchar_t wc, size_t match_len)
{
size_t i;
int match; /* Matching succeeded. */
- int match_len; /* Length of the character (or collating element)
- with which this operator matches. */
int op_len; /* Length of the operator. */
char buffer[128];
@@ -3047,9 +2992,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
struct mb_char_classes *work_mbc;
int context;
- wchar_t wc; /* Current referring character. */
-
- wc = d->inputwcs[idx];
/* Check syntax bits. */
if (wc == (wchar_t) eolbyte)
@@ -3070,7 +3012,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
/* Assign the current referring operator to work_mbc. */
work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
match = !work_mbc->invert;
- match_len = MAX (1, d->mblen_buf[idx]);
/* Match in range 0-255? */
if (wc < NOTCHAR && work_mbc->cset != -1
@@ -3084,14 +3025,14 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
goto charset_matched;
}
- strncpy (buffer, (char const *) buf_begin + idx, match_len);
+ strncpy (buffer, p, match_len);
buffer[match_len] = '\0';
/* match with an equivalence class? */
for (i = 0; i < work_mbc->nequivs; i++)
{
op_len = strlen (work_mbc->equivs[i]);
- strncpy (buffer, (char const *) buf_begin + idx, op_len);
+ strncpy (buffer, p, op_len);
buffer[op_len] = '\0';
if (strcoll (work_mbc->equivs[i], buffer) == 0)
{
@@ -3104,7 +3045,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx)
for (i = 0; i < work_mbc->ncoll_elems; i++)
{
op_len = strlen (work_mbc->coll_elems[i]);
- strncpy (buffer, (char const *) buf_begin + idx, op_len);
+ strncpy (buffer, p, op_len);
buffer[op_len] = '\0';
if (strcoll (work_mbc->coll_elems[i], buffer) == 0)
@@ -3138,12 +3079,10 @@ charset_matched:
array which corresponds to 'd->states[s].mbps.elem'; each element of the
array contains the number of bytes with which the element can match.
- 'idx' is the index from buf_begin, and it is the current position
- in the buffer.
-
The caller MUST free the array which this function return. */
static int *
-check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
+check_matching_with_multibyte_ops (struct dfa *d, state_num s,
+ char const *p, wchar_t wc, size_t mbclen)
{
size_t i;
int *rarray;
@@ -3155,10 +3094,10 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
switch (d->tokens[pos.index])
{
case ANYCHAR:
- rarray[i] = match_anychar (d, s, pos, idx);
+ rarray[i] = match_anychar (d, s, pos, wc, mbclen);
break;
case MBCSET:
- rarray[i] = match_mb_charset (d, s, pos, idx);
+ rarray[i] = match_mb_charset (d, s, pos, p, wc, mbclen);
break;
default:
break; /* cannot happen. */
@@ -3178,22 +3117,22 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx)
static status_transit_state
transit_state_consume_1char (struct dfa *d, state_num s,
unsigned char const **pp,
- int *match_lens, int *mbclen)
+ wchar_t wc, size_t mbclen,
+ int *match_lens)
{
size_t i, j;
int k;
state_num s1, s2;
- int *work_mbls;
status_transit_state rs = TRANSIT_STATE_DONE;
- /* Calculate the length of the (single/multi byte) character
- to which p points. */
- *mbclen = MAX (1, d->mblen_buf[*pp - buf_begin]);
+ /* Check (input) match_lens, and initialize if it is NULL. */
+ if (match_lens == NULL && d->states[s].mbps.nelem != 0)
+ match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, wc, mbclen);
/* Calculate the state which can be reached from the state 's' by
- consuming '*mbclen' single bytes from the buffer. */
+ consuming 'mbclen' single bytes from the buffer. */
s1 = s;
- for (k = 0; k < *mbclen; k++)
+ for (k = 0; k < mbclen; k++)
{
s2 = s1;
rs = transit_state_singlebyte (d, s2, (*pp)++, &s1);
@@ -3201,17 +3140,11 @@ transit_state_consume_1char (struct dfa *d, state_num s,
/* Copy the positions contained by 's1' to the set 'd->mb_follows'. */
copy (&(d->states[s1].elems), d->mb_follows);
- /* Check (input) match_lens, and initialize if it is NULL. */
- if (match_lens == NULL && d->states[s].mbps.nelem != 0)
- work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
- else
- work_mbls = match_lens;
-
/* Add all of the positions which can be reached from 's' by consuming
a single character. */
for (i = 0; i < d->states[s].mbps.nelem; i++)
{
- if (work_mbls[i] == *mbclen)
+ if (match_lens[i] == mbclen)
for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem;
j++)
insert (d->follows[d->states[s].mbps.elems[i].index].elems[j],
@@ -3226,7 +3159,8 @@ transit_state_consume_1char (struct dfa *d, state_num s,
buffer. This function is for some operator which can match with a multi-
byte character or a collating element (which may be multi characters). */
static state_num
-transit_state (struct dfa *d, state_num s, unsigned char const **pp)
+transit_state (struct dfa *d, state_num s, unsigned char const **pp,
+ unsigned char const *end)
{
state_num s1;
int mbclen; /* The length of current input multibyte character. */
@@ -3242,7 +3176,8 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
We check whether each of them can match or not. */
{
/* Note: caller must free the return value of this function. */
- match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin);
+ mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
+ match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, wc, mbclen);
for (i = 0; i < nelem; i++)
/* Search the operator which match the longest string,
@@ -3274,15 +3209,15 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
not be a character but a (multi character) collating element.
We enumerate all of the positions which 's' can reach by consuming
'maxlen' bytes. */
- transit_state_consume_1char (d, s, pp, match_lens, &mbclen);
+ transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens);
- wc = d->inputwcs[*pp - mbclen - buf_begin];
s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
while (*pp - p1 < maxlen)
{
- transit_state_consume_1char (d, s1, pp, NULL, &mbclen);
+ mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs);
+ transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL);
for (i = 0; i < nelem; i++)
{
@@ -3293,45 +3228,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
d->mb_follows);
}
- wc = d->inputwcs[*pp - mbclen - buf_begin];
s1 = state_index (d, d->mb_follows, wchar_context (wc));
realloc_trans_if_necessary (d, s1);
}
return s1;
}
-
-/* Initialize mblen_buf and inputwcs with data from the next line. */
-
-static void
-prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
-{
- unsigned char eol = eolbyte;
- size_t i;
- size_t ilim = end - begin + 1;
-
- buf_begin = (unsigned char *) begin;
-
- for (i = 0; i < ilim; i++)
- {
- size_t nbytes = mbs_to_wchar (d, d->inputwcs + i, begin + i, ilim - i,
- &mbs);
- d->mblen_buf[i] = nbytes - (nbytes == 1);
- if (begin[i] == eol)
- break;
- while (--nbytes != 0)
- {
- i++;
- d->mblen_buf[i] = nbytes;
- d->inputwcs[i] = 0;
- }
- }
-
- buf_end = (unsigned char *) (begin + i);
- d->mblen_buf[i] = 0;
- d->inputwcs[i] = 0; /* sentinel */
-}
-
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
@@ -3349,7 +3251,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
int allow_nl, size_t *count, int *backref)
{
state_num s, s1; /* Current state. */
- unsigned char const *p; /* Current input character. */
+ unsigned char const *p, *mbp; /* Current input character. */
state_num **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
unsigned char eol = eolbyte; /* Likewise for eolbyte. */
@@ -3359,7 +3261,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
build_state_zero (d);
s = s1 = 0;
- p = (unsigned char const *) begin;
+ p = mbp = (unsigned char const *) begin;
trans = d->trans;
saved_end = *(unsigned char *) end;
*end = eol;
@@ -3367,10 +3269,7 @@ dfaexec (struct dfa *d, char const *begin, char *end,
if (d->mb_cur_max > 1)
{
static bool mb_alloc = false;
- REALLOC_IF_NECESSARY (d->mblen_buf, d->nmblen_buf, end - begin + 2);
- REALLOC_IF_NECESSARY (d->inputwcs, d->ninputwcs, end - begin + 2);
memset (&mbs, 0, sizeof (mbstate_t));
- prepare_wc_buf (d, (const char *) p, end);
if (!mb_alloc)
{
MALLOC (d->mb_match_lens, d->nleaves);
@@ -3386,10 +3285,32 @@ dfaexec (struct dfa *d, char const *begin, char *end,
{
while ((t = trans[s]) != NULL)
{
- if (p > buf_end)
- break;
s1 = s;
- SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p);
+
+ if (s == 0)
+ {
+ /* The initial state may encounter a byte which is not
+ a single byte character nor the first byte of a
+ multibyte character. But it is incorrect for the
+ initial state to accept such a byte. For example,
+ in Shift JIS the regular expression "\\" accepts
+ the codepoint 0x5c, but should not accept the second
+ byte of the codepoint 0x815c. Then the initial
+ state must skip the bytes that are not a single
+ byte character nor the first byte of a multibyte
+ character. */
+ wchar_t wc;
+ while (mbp < p)
+ mbp += mbs_to_wchar (d, &wc, (char const *) mbp,
+ end - (char const *) mbp, &mbs);
+ p = mbp;
+
+ if ((char *) p >= end)
+ {
+ *end = saved_end;
+ return NULL;
+ }
+ }
if (d->states[s].mbps.nelem == 0)
{
@@ -3410,7 +3331,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
/* Can match with a multibyte character (and multi character
collating element). Transition table might be updated. */
- s = transit_state (d, s, &p);
+ s = transit_state (d, s, &p, (unsigned char *) end);
+ mbp = p;
trans = d->trans;
}
}
@@ -3445,7 +3367,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
{
/* Can match with a multibyte character (and multicharacter
collating element). Transition table might be updated. */
- s = transit_state (d, s, &p);
+ s = transit_state (d, s, &p, (unsigned char *) end);
+ mbp = p;
trans = d->trans;
}
else
@@ -3454,14 +3377,8 @@ dfaexec (struct dfa *d, char const *begin, char *end,
}
/* If the previous character was a newline, count it. */
- if ((char *) p <= end && p[-1] == eol)
- {
- if (count)
- ++*count;
-
- if (d->mb_cur_max > 1)
- prepare_wc_buf (d, (const char *) p, end);
- }
+ if ((char *) p <= end && p[-1] == eol && count)
+ ++*count;
/* Check if we've run off the end of the buffer. */
if ((char *) p > end)
@@ -3518,8 +3435,6 @@ free_mbdata (struct dfa *d)
d->mbcsets = NULL;
d->nmbcsets = 0;
- free (d->mblen_buf);
- free (d->inputwcs);
if (d->mb_follows)
{
free (d->mb_follows->elems);
--
1.9.1