From 74c8e3d18ca14ea502da63f34bd7f97fcd6ffd65 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 29 Mar 2014 20:59:53 +0900 Subject: [PATCH] grep: speed-up of DFA by checking multibyte characters on demand If dfaexec() runs in non-UTF8 locales, length and wide character representation are checked for all characters of a line in a input string. However, if matched early in the line, results for remaining characters are wasted. This patch checks multibyte characters on demand. It enables to accomplish to speed-up for matched early and reduce required memories. * src/dfa.c (struct dfa): Remove members. (buf_begin, buf_end): No longer they are used. Remove them. (SKIP_REMAINS_MB_IF_INITIAL_STATE): Now, the content of the macro is extended in dfaexec(). (transit_state_singlebyte): sem check is removed. (match_anychar): Doesn't use `mblen_buf and' `inputwcs'. (match_mb_charset): Doesn't use `mblen_buf and' `inputwcs'. (check_matching_with_multibyte_ops): Modify arguments. (transit_state_consume_1char): Modify arguments. (transit_state): Doesn't use `mblen_buf and' `inputwcs'. (prepare_wc_buf): Remove it. (dfaexec): Doesn't use `mblen_buf and' `inputwcs'. --- src/dfa.c | 209 +++++++++++++++++++------------------------------------------- 1 file changed, 62 insertions(+), 147 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index ef5c8a9..ea955bf 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -420,24 +420,6 @@ struct dfa struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ - unsigned char *mblen_buf; /* Correspond to the input buffer in dfaexec. - Each element stores the number of remaining - bytes of the corresponding multibyte - character in the input string. A element's - value is 0 if the corresponding character is - single-byte. - e.g., input : 'a', , , - mblen_buf : 0, 3, 2, 1 - */ - size_t nmblen_buf; /* Allocated size of mblen_buf. */ - wchar_t *inputwcs; /* Wide character representation of the input - string in dfaexec. - The length of this array is the same as - the length of input string (char array). - inputstring[i] is a single-byte char, - or the first byte of a multibyte char; - inputwcs[i] is the codepoint. */ - size_t ninputwcs; /* Allocated number of inputwcs elements. */ position_set *mb_follows; /* Follow set added by ANYCHAR and/or MBCSET on demand. */ int *mb_match_lens; /* Array of length reduced by ANYCHAR and/or @@ -874,8 +856,6 @@ static int cur_mb_len = 1; /* Length of the multibyte representation of static mbstate_t mbs; /* mbstate for mbrtowc. */ static wchar_t wctok; /* Wide character representation of the current multibyte character. */ -static unsigned char const *buf_begin; /* reference to begin in dfaexec. */ -static unsigned char const *buf_end; /* reference to end in dfaexec. */ /* Note that characters become unsigned here. */ @@ -2898,27 +2878,6 @@ build_state_zero (struct dfa *d) /* Multibyte character handling sub-routines for dfaexec. */ -/* The initial state may encounter a byte which is not a single byte character - nor the first byte of a multibyte character. But it is incorrect for the - initial state to accept such a byte. For example, in Shift JIS the regular - expression "\\" accepts the codepoint 0x5c, but should not accept the second - byte of the codepoint 0x815c. Then the initial state must skip the bytes - that are not a single byte character nor the first byte of a multibyte - character. */ -#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ - if (s == 0) \ - { \ - while (d->inputwcs[p - buf_begin] == 0 \ - && d->mblen_buf[p - buf_begin] != 0 \ - && (unsigned char const *) p < buf_end) \ - ++p; \ - if ((char *) p >= end) \ - { \ - *end = saved_end; \ - return NULL; \ - } \ - } - static void realloc_trans_if_necessary (struct dfa *d, state_num new_state) { @@ -2975,14 +2934,7 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, works = 0; } else if (works < 0) - { - if (p == buf_end) - { - /* At the moment, it must not happen. */ - abort (); - } - works = 0; - } + works = 0; else if (d->fails[works]) { works = d->fails[works][*p]; @@ -2997,18 +2949,13 @@ transit_state_singlebyte (struct dfa *d, state_num s, unsigned char const *p, return rval; } -/* Match a "." against the current context. buf_begin[IDX] is the - current position. Return the length of the match, in bytes. - POS is the position of the ".". */ +/* Match a "." against the current context. Return the length of the + match, in bytes. POS is the position of the ".". */ static int -match_anychar (struct dfa *d, state_num s, position pos, size_t idx) +match_anychar (struct dfa *d, state_num s, position pos, + wchar_t wc, size_t mbclen) { int context; - wchar_t wc; - int mbclen; - - wc = d->inputwcs[idx]; - mbclen = MAX (1, d->mblen_buf[idx]); /* Check syntax bits. */ if (wc == (wchar_t) eolbyte) @@ -3030,16 +2977,14 @@ match_anychar (struct dfa *d, state_num s, position pos, size_t idx) } /* Match a bracket expression against the current context. - buf_begin[IDX] is the current position. Return the length of the match, in bytes. POS is the position of the bracket expression. */ static int -match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) +match_mb_charset (struct dfa *d, state_num s, position pos, + char const *p, wchar_t wc, size_t match_len) { size_t i; int match; /* Matching succeeded. */ - int match_len; /* Length of the character (or collating element) - with which this operator matches. */ int op_len; /* Length of the operator. */ char buffer[128]; @@ -3047,9 +2992,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) struct mb_char_classes *work_mbc; int context; - wchar_t wc; /* Current referring character. */ - - wc = d->inputwcs[idx]; /* Check syntax bits. */ if (wc == (wchar_t) eolbyte) @@ -3070,7 +3012,6 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) /* Assign the current referring operator to work_mbc. */ work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); match = !work_mbc->invert; - match_len = MAX (1, d->mblen_buf[idx]); /* Match in range 0-255? */ if (wc < NOTCHAR && work_mbc->cset != -1 @@ -3084,14 +3025,14 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) goto charset_matched; } - strncpy (buffer, (char const *) buf_begin + idx, match_len); + strncpy (buffer, p, match_len); buffer[match_len] = '\0'; /* match with an equivalence class? */ for (i = 0; i < work_mbc->nequivs; i++) { op_len = strlen (work_mbc->equivs[i]); - strncpy (buffer, (char const *) buf_begin + idx, op_len); + strncpy (buffer, p, op_len); buffer[op_len] = '\0'; if (strcoll (work_mbc->equivs[i], buffer) == 0) { @@ -3104,7 +3045,7 @@ match_mb_charset (struct dfa *d, state_num s, position pos, size_t idx) for (i = 0; i < work_mbc->ncoll_elems; i++) { op_len = strlen (work_mbc->coll_elems[i]); - strncpy (buffer, (char const *) buf_begin + idx, op_len); + strncpy (buffer, p, op_len); buffer[op_len] = '\0'; if (strcoll (work_mbc->coll_elems[i], buffer) == 0) @@ -3138,12 +3079,10 @@ charset_matched: array which corresponds to 'd->states[s].mbps.elem'; each element of the array contains the number of bytes with which the element can match. - 'idx' is the index from buf_begin, and it is the current position - in the buffer. - The caller MUST free the array which this function return. */ static int * -check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx) +check_matching_with_multibyte_ops (struct dfa *d, state_num s, + char const *p, wchar_t wc, size_t mbclen) { size_t i; int *rarray; @@ -3155,10 +3094,10 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx) switch (d->tokens[pos.index]) { case ANYCHAR: - rarray[i] = match_anychar (d, s, pos, idx); + rarray[i] = match_anychar (d, s, pos, wc, mbclen); break; case MBCSET: - rarray[i] = match_mb_charset (d, s, pos, idx); + rarray[i] = match_mb_charset (d, s, pos, p, wc, mbclen); break; default: break; /* cannot happen. */ @@ -3178,22 +3117,22 @@ check_matching_with_multibyte_ops (struct dfa *d, state_num s, size_t idx) static status_transit_state transit_state_consume_1char (struct dfa *d, state_num s, unsigned char const **pp, - int *match_lens, int *mbclen) + wchar_t wc, size_t mbclen, + int *match_lens) { size_t i, j; int k; state_num s1, s2; - int *work_mbls; status_transit_state rs = TRANSIT_STATE_DONE; - /* Calculate the length of the (single/multi byte) character - to which p points. */ - *mbclen = MAX (1, d->mblen_buf[*pp - buf_begin]); + /* Check (input) match_lens, and initialize if it is NULL. */ + if (match_lens == NULL && d->states[s].mbps.nelem != 0) + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, wc, mbclen); /* Calculate the state which can be reached from the state 's' by - consuming '*mbclen' single bytes from the buffer. */ + consuming 'mbclen' single bytes from the buffer. */ s1 = s; - for (k = 0; k < *mbclen; k++) + for (k = 0; k < mbclen; k++) { s2 = s1; rs = transit_state_singlebyte (d, s2, (*pp)++, &s1); @@ -3201,17 +3140,11 @@ transit_state_consume_1char (struct dfa *d, state_num s, /* Copy the positions contained by 's1' to the set 'd->mb_follows'. */ copy (&(d->states[s1].elems), d->mb_follows); - /* Check (input) match_lens, and initialize if it is NULL. */ - if (match_lens == NULL && d->states[s].mbps.nelem != 0) - work_mbls = check_matching_with_multibyte_ops (d, s, *pp - buf_begin); - else - work_mbls = match_lens; - /* Add all of the positions which can be reached from 's' by consuming a single character. */ for (i = 0; i < d->states[s].mbps.nelem; i++) { - if (work_mbls[i] == *mbclen) + if (match_lens[i] == mbclen) for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; j++) insert (d->follows[d->states[s].mbps.elems[i].index].elems[j], @@ -3226,7 +3159,8 @@ transit_state_consume_1char (struct dfa *d, state_num s, buffer. This function is for some operator which can match with a multi- byte character or a collating element (which may be multi characters). */ static state_num -transit_state (struct dfa *d, state_num s, unsigned char const **pp) +transit_state (struct dfa *d, state_num s, unsigned char const **pp, + unsigned char const *end) { state_num s1; int mbclen; /* The length of current input multibyte character. */ @@ -3242,7 +3176,8 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) We check whether each of them can match or not. */ { /* Note: caller must free the return value of this function. */ - match_lens = check_matching_with_multibyte_ops (d, s, *pp - buf_begin); + mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs); + match_lens = check_matching_with_multibyte_ops (d, s, (char const *) *pp, wc, mbclen); for (i = 0; i < nelem; i++) /* Search the operator which match the longest string, @@ -3274,15 +3209,15 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) not be a character but a (multi character) collating element. We enumerate all of the positions which 's' can reach by consuming 'maxlen' bytes. */ - transit_state_consume_1char (d, s, pp, match_lens, &mbclen); + transit_state_consume_1char (d, s, pp, wc, mbclen, match_lens); - wc = d->inputwcs[*pp - mbclen - buf_begin]; s1 = state_index (d, d->mb_follows, wchar_context (wc)); realloc_trans_if_necessary (d, s1); while (*pp - p1 < maxlen) { - transit_state_consume_1char (d, s1, pp, NULL, &mbclen); + mbclen = mbs_to_wchar (d, &wc, (char const *) *pp, end - *pp, &mbs); + transit_state_consume_1char (d, s1, pp, wc, mbclen, NULL); for (i = 0; i < nelem; i++) { @@ -3293,45 +3228,12 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) d->mb_follows); } - wc = d->inputwcs[*pp - mbclen - buf_begin]; s1 = state_index (d, d->mb_follows, wchar_context (wc)); realloc_trans_if_necessary (d, s1); } return s1; } - -/* Initialize mblen_buf and inputwcs with data from the next line. */ - -static void -prepare_wc_buf (struct dfa *d, const char *begin, const char *end) -{ - unsigned char eol = eolbyte; - size_t i; - size_t ilim = end - begin + 1; - - buf_begin = (unsigned char *) begin; - - for (i = 0; i < ilim; i++) - { - size_t nbytes = mbs_to_wchar (d, d->inputwcs + i, begin + i, ilim - i, - &mbs); - d->mblen_buf[i] = nbytes - (nbytes == 1); - if (begin[i] == eol) - break; - while (--nbytes != 0) - { - i++; - d->mblen_buf[i] = nbytes; - d->inputwcs[i] = 0; - } - } - - buf_end = (unsigned char *) (begin + i); - d->mblen_buf[i] = 0; - d->inputwcs[i] = 0; /* sentinel */ -} - /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -3349,7 +3251,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, int allow_nl, size_t *count, int *backref) { state_num s, s1; /* Current state. */ - unsigned char const *p; /* Current input character. */ + unsigned char const *p, *mbp; /* Current input character. */ state_num **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ unsigned char eol = eolbyte; /* Likewise for eolbyte. */ @@ -3359,7 +3261,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, build_state_zero (d); s = s1 = 0; - p = (unsigned char const *) begin; + p = mbp = (unsigned char const *) begin; trans = d->trans; saved_end = *(unsigned char *) end; *end = eol; @@ -3367,10 +3269,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, if (d->mb_cur_max > 1) { static bool mb_alloc = false; - REALLOC_IF_NECESSARY (d->mblen_buf, d->nmblen_buf, end - begin + 2); - REALLOC_IF_NECESSARY (d->inputwcs, d->ninputwcs, end - begin + 2); memset (&mbs, 0, sizeof (mbstate_t)); - prepare_wc_buf (d, (const char *) p, end); if (!mb_alloc) { MALLOC (d->mb_match_lens, d->nleaves); @@ -3386,10 +3285,32 @@ dfaexec (struct dfa *d, char const *begin, char *end, { while ((t = trans[s]) != NULL) { - if (p > buf_end) - break; s1 = s; - SKIP_REMAINS_MB_IF_INITIAL_STATE (s, p); + + if (s == 0) + { + /* The initial state may encounter a byte which is not + a single byte character nor the first byte of a + multibyte character. But it is incorrect for the + initial state to accept such a byte. For example, + in Shift JIS the regular expression "\\" accepts + the codepoint 0x5c, but should not accept the second + byte of the codepoint 0x815c. Then the initial + state must skip the bytes that are not a single + byte character nor the first byte of a multibyte + character. */ + wchar_t wc; + while (mbp < p) + mbp += mbs_to_wchar (d, &wc, (char const *) mbp, + end - (char const *) mbp, &mbs); + p = mbp; + + if ((char *) p >= end) + { + *end = saved_end; + return NULL; + } + } if (d->states[s].mbps.nelem == 0) { @@ -3410,7 +3331,8 @@ dfaexec (struct dfa *d, char const *begin, char *end, /* Can match with a multibyte character (and multi character collating element). Transition table might be updated. */ - s = transit_state (d, s, &p); + s = transit_state (d, s, &p, (unsigned char *) end); + mbp = p; trans = d->trans; } } @@ -3445,7 +3367,8 @@ dfaexec (struct dfa *d, char const *begin, char *end, { /* Can match with a multibyte character (and multicharacter collating element). Transition table might be updated. */ - s = transit_state (d, s, &p); + s = transit_state (d, s, &p, (unsigned char *) end); + mbp = p; trans = d->trans; } else @@ -3454,14 +3377,8 @@ dfaexec (struct dfa *d, char const *begin, char *end, } /* If the previous character was a newline, count it. */ - if ((char *) p <= end && p[-1] == eol) - { - if (count) - ++*count; - - if (d->mb_cur_max > 1) - prepare_wc_buf (d, (const char *) p, end); - } + if ((char *) p <= end && p[-1] == eol && count) + ++*count; /* Check if we've run off the end of the buffer. */ if ((char *) p > end) @@ -3518,8 +3435,6 @@ free_mbdata (struct dfa *d) d->mbcsets = NULL; d->nmbcsets = 0; - free (d->mblen_buf); - free (d->inputwcs); if (d->mb_follows) { free (d->mb_follows->elems); -- 1.9.1