From 17f5934d50b121ef3f7c98b0b0db3ae8c891b8d4 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Wed, 12 Mar 2014 00:45:04 +0900 Subject: [PATCH] grep: optimization with the superset of DFA. By the patch, DFA may be also build the superset of itself, which is the same as the itself expect ANYCHAR, MBCSET and BACKREF are replaced CSET set full bits followed by STAR, and mb_cur_max is equal to 1. For example, if given the pattern `a\(b\)c\1', the tokens of original DFA and its superset is below. original: a b CAT c CAT BACKREF CAT superset: a b CAT c CAT CSET STAR CAT (Full bits are set to CSET.) If a string doesn't matches the superset of original DFA for a pattern, the string will also never match original DFA. By the way, matching with the superset is very fast because it never has ANYCHAR, MBCSET and BACKREF, which are very expensive, and its mb_cur_max is always equal to 1. Therefore, the perfomance for matching with DFA may be dramatically improved without overhead by using the superset. I prepare following string to measure the performance. yes jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj | head -10000000 > k I run three tests with this patch (best-of-5 trials): env LC_ALL=ja_JP.eucJP time -p src/grep -i foobar k real 1.77 user 1.23 sys 0.47 env LC_ALL=en_US.UTF-8 time -p src/grep -i 'j[a-c]d' k real 1.86 user 1.35 sys 0.45 env LC_ALL=C time -p src/grep '\(j\)\1d' k real 1.92 user 1.40 sys 0.48 Back out that commit (temporarily), recompile, and rerun the experiment: env LC_ALL=ja_JP.eucJP time -p src/grep -i foobar k real 27.21 user 21.15 sys 5.36 env LC_ALL=en_US.UTF-8 time -p src/grep -i 'j[a-c]d' k real 96.35 user 429.80 sys 57.37 env LC_ALL=C time -p src/grep '\(j\)\1d' k real 502.32 user 429.80 sys 57.37 * src/dfa.c (struct dfa) New member `superset'. (dfahint): New function. (dfaoptimize): If succeed in optimization for UTF-8 locale, don't use the superset. (dfasuperset): New function. (dfacomp): Add call of dfasuperset and dfaanalyze functions. (dfafree): Run free only when the value of the pointer isn't NULL. (dfaalloc): Set NULL to member `superset' of DFA clearly. * src/dfa.h: Define prototype for dfahint function. * dfasearch.c: (EGexecute): Use dfahint. --- src/dfa.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++----- src/dfa.h | 3 ++ src/dfasearch.c | 63 ++++++++++++++++++++------ 3 files changed, 178 insertions(+), 26 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 4ed2189..11ef542 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -389,6 +389,9 @@ struct dfa size_t nmbcsets; size_t mbcsets_alloc; + /* Fields filled by the superset. */ + struct dfa *superset; /* Hint of the dfa. */ + /* Fields filled by the state builder. */ dfa_state *states; /* States of the dfa. */ state_num sindex; /* Index for adding new states. */ @@ -3551,6 +3554,21 @@ dfaexec (struct dfa *d, char const *begin, char *end, } } +size_t +dfahint (struct dfa *d, char const *begin, char *end, size_t *count) +{ + char const *match; + + if (d->superset == NULL) + return (size_t) -2; + + match = dfaexec (d->superset, begin, end, 1, count, NULL); + if (match == NULL) + return (size_t) -1; + + return match - begin; +} + static void free_mbdata (struct dfa *d) { @@ -3633,6 +3651,81 @@ dfaoptimize (struct dfa *d) d->mb_cur_max = 1; } +static void +dfasuperset (struct dfa *d) +{ + size_t i, j; + charclass ccl; + bool have_achar = false; + bool have_nchar = false; + + dfa = d->superset = dfaalloc (); + *d->superset = *d; + d->superset->mb_cur_max = 1; + d->superset->multibyte_prop = NULL; + d->superset->mbcsets = NULL; + d->superset->superset = NULL; + d->superset->states = NULL; + d->superset->follows = NULL; + d->superset->trans = NULL; + d->superset->realtrans = NULL; + d->superset->fails = NULL; + d->superset->success = NULL; + d->superset->newlines = NULL; + d->superset->musts = NULL; + + MALLOC (d->superset->charclasses, d->superset->calloc); + for (i = 0; i < d->cindex; i++) + copyset (d->charclasses[i], d->superset->charclasses[i]); + + d->superset->talloc = d->tindex * 2; + MALLOC (d->superset->tokens, d->superset->talloc); + + for (i = j = 0; i < d->tindex; i++) + { + switch (d->tokens[i]) + { + case ANYCHAR: + case MBCSET: + case BACKREF: + zeroset (ccl); + notset (ccl); + d->superset->tokens[j++] = CSET + charclass_index (ccl); + d->superset->tokens[j++] = STAR; + if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR + || d->tokens[i + 1] == PLUS) + i++; + have_achar = true; + break; + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD: + if (MB_CUR_MAX > 1) + { + /* Ignore these constraints. */ + d->superset->tokens[j++] = EMPTY; + break; + } + default: + d->superset->tokens[j++] = d->tokens[i]; + if ((d->tokens[i] >= 0 && d->tokens[i] < NOTCHAR) + || d->tokens[i] >= CSET) + have_nchar = true; + break; + } + } + + if ((d->mb_cur_max == 1 && !have_achar) || !have_nchar) + { + dfafree (d->superset); + d->superset = NULL; + return; + } + + d->superset->tindex = j; +} + /* Parse and analyze a single string of the given length. */ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) @@ -3642,7 +3735,10 @@ dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) dfaparse (s, len, d); dfamust (d); dfaoptimize (d); + dfasuperset (d); dfaanalyze (d, searchflag); + if (d->superset != NULL) + dfaanalyze (d->superset, searchflag); } /* Free the storage held by the components of a dfa. */ @@ -3658,31 +3754,47 @@ dfafree (struct dfa *d) if (d->mb_cur_max > 1) free_mbdata (d); - for (i = 0; i < d->sindex; ++i) + if (d->states != NULL) { - free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + for (i = 0; i < d->sindex; ++i) + { + free (d->states[i].elems.elems); + if (MBS_SUPPORT) + free (d->states[i].mbps.elems); + } + free (d->states); } - free (d->states); - for (i = 0; i < d->tindex; ++i) - free (d->follows[i].elems); - free (d->follows); - for (i = 0; i < d->tralloc; ++i) + if (d->follows != NULL) + { + for (i = 0; i < d->tindex; ++i) + free (d->follows[i].elems); + free (d->follows); + } + if (d->trans != NULL) + { + for (i = 0; i < d->tralloc; ++i) + free (d->trans[i]); + } + if (d->fails != NULL) { - free (d->trans[i]); - free (d->fails[i]); + for (i = 0; i < d->tralloc; ++i) + free (d->fails[i]); } + free (d->realtrans); free (d->fails); free (d->newlines); free (d->success); + for (dm = d->musts; dm; dm = ndm) { ndm = dm->next; free (dm->must); free (dm); } + + if (d->superset != NULL) + dfafree (d->superset); } /* Having found the postfix representation of the regular expression, @@ -4190,7 +4302,9 @@ done: struct dfa * dfaalloc (void) { - return xmalloc (sizeof (struct dfa)); + struct dfa *d = xmalloc (sizeof (struct dfa)); + d->superset = NULL; + return d; } struct dfamust *_GL_ATTRIBUTE_PURE diff --git a/src/dfa.h b/src/dfa.h index 24fbcbe..ad97498 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -67,6 +67,9 @@ extern void dfacomp (char const *, size_t, struct dfa *, int); to decide whether to fall back on a backtracking matcher. */ extern char *dfaexec (struct dfa *d, char const *begin, char *end, int newline, size_t *count, int *backref); +extern size_t dfahint (struct dfa *d, char const *begin, char *end, + size_t *count); + /* Free the storage held by the components of a struct dfa. */ extern void dfafree (struct dfa *); diff --git a/src/dfasearch.c b/src/dfasearch.c index d098a9b..98d7e42 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -236,7 +236,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, match = beg; while (beg > buf && beg[-1] != eol) --beg; - char const *dfa_start = beg; if (kwsm.index < kwset_exact_matches) { if (!MBS_SUPPORT) @@ -251,29 +250,65 @@ EGexecute (char const *buf, size_t size, size_t *match_size, /* The matched line starts in the middle of a multibyte character. Perform the DFA search starting from the beginning of the next character. */ - dfa_start = mb_start; + if (dfaexec (dfa, mb_start, (char *) end, 0, NULL, + &backref) == NULL) + continue; + } + else + { + if (dfahint (dfa, beg, (char *) end, NULL) == + (size_t) -1) + continue; + if (dfaexec (dfa, beg, (char *) end, 0, NULL, + &backref) == NULL) + continue; } - if (dfaexec (dfa, dfa_start, (char *) end, 0, NULL, - &backref) == NULL) - continue; } else { /* No good fixed strings; start with DFA. */ - char const *next_beg = dfaexec (dfa, beg, (char *) buflim, - 0, NULL, &backref); - /* If there's no match, or if we've matched the sentinel, - we're done. */ - if (next_beg == NULL || next_beg == buflim) - break; + size_t offset, count; + char const *next_beg; + count = 0; + offset = dfahint (dfa, beg, (char *) buflim, &count); + if (offset == (size_t) -1) + goto failure; + if (offset == (size_t) -2) + { + /* No use hint. */ + next_beg = dfaexec (dfa, beg, (char *) buflim, 0, + NULL, &backref); + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == buflim) + goto failure; + } + else + next_beg = beg + offset; /* Narrow down to the line we've found. */ beg = next_beg; - if ((end = memchr(beg, eol, buflim - beg)) != NULL) + while (beg > buf && beg[-1] != eol) + --beg; + if (count > 0) + { + /* dfahint() may match in multiple lines. If that is + the case, try to match in one line. */ + end = beg; + continue; + } + if ((end = memchr(next_beg, eol, buflim - beg)) != NULL) end++; else end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; + if (offset != (size_t) -2) + { + next_beg = dfaexec (dfa, beg, (char *) end, 0, NULL, + &backref); + /* If there's no match, or if we've matched the sentinel, + we're done. */ + if (next_beg == NULL || next_beg == end) + continue; + } } /* Successful, no backreferences encountered! */ if (!backref) -- 1.9.1