From 642d6fed4310b3e4ee9569473b03e10dbee0e523 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 31 Mar 2014 21:51:48 +0900 Subject: [PATCH] grep: speed-up for exact matching with begline and endline constraints dfamust turns on the flag when a state exactly matches the proposed one. However, when the state has begline and/or endline constraints, turns off it. This patch enables to match a state exactly, even if the state has begline and/or endline constraints. If a exact string has one of their constrations, the string adding eolbyte to a head and/or foot is pushed to kwsincr(). In addition, if it has begline constration, start searching from just before the position of the text. * src/dfa.c (variable must): New members `begline' and `endline'. (dfamust): Consideration of begline and endline constrations. * src/dfa.h (struct dfamust): New members `begline' and `endline'. * src/dfasearch.c (kwsmusts): If a exact string has begline constration, start searching from just before the position of the text. (EGexecute): Same as above. * src/kwsearch.c (Fexecute): Same as above. --- src/dfa.c | 40 +++++++++++++++++++++++++++++++++++----- src/dfa.h | 2 ++ src/dfasearch.c | 29 ++++++++++++++++++++++++----- src/kwsearch.c | 32 +++++++++++++++++++------------- 4 files changed, 80 insertions(+), 23 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index ef5c8a9..c4a3194 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -3902,6 +3902,8 @@ typedef struct char *left; char *right; char *is; + int begline; + int endline; } must; static void @@ -3920,6 +3922,8 @@ dfamust (struct dfa *d) size_t ri; size_t i; int exact; + int begline; + int endline; token t; static must must0; struct dfamust *dm; @@ -3927,6 +3931,8 @@ dfamust (struct dfa *d) result = empty_string; exact = 0; + begline = 0; + endline = 0; MALLOC (musts, d->tindex + 1); mp = musts; for (i = 0; i <= d->tindex; ++i) @@ -3939,6 +3945,8 @@ dfamust (struct dfa *d) mp[i].is = xmalloc (2); mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; mp[i].in[0] = NULL; + mp[i].begline = 0; + mp[i].endline = 0; } #ifdef DEBUG fprintf (stderr, "dfamust:\n"); @@ -3953,12 +3961,18 @@ dfamust (struct dfa *d) { switch (t = d->tokens[ri]) { + case BEGLINE: + resetmust (mp); + mp->begline = 1; + break; + case ENDLINE: + resetmust (mp); + mp->endline = 1; + break; case LPAREN: case RPAREN: assert (!"neither LPAREN nor RPAREN may appear here"); case EMPTY: - case BEGLINE: - case ENDLINE: case BEGWORD: case ENDWORD: case LIMWORD: @@ -3985,6 +3999,10 @@ dfamust (struct dfa *d) /* Guaranteed to be. Unlikely, but ... */ if (!STREQ (lmp->is, rmp->is)) lmp->is[0] = '\0'; + if (lmp->begline) + lmp->begline = rmp->begline; + if (lmp->endline) + lmp->endline = rmp->endline; /* Left side--easy */ i = 0; while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) @@ -4021,7 +4039,11 @@ dfamust (struct dfa *d) if (strlen (musts[0].in[i]) > strlen (result)) result = musts[0].in[i]; if (STREQ (result, musts[0].is)) - exact = 1; + { + exact = 1; + begline = musts[0].begline; + endline = musts[0].endline; + } goto done; case CAT: assert (&musts[2] <= mp); @@ -4062,14 +4084,20 @@ dfamust (struct dfa *d) if (lmp->right == NULL) goto done; /* Guaranteed to be */ - if (lmp->is[0] != '\0' && rmp->is[0] != '\0') + if ((lmp->is[0] != '\0' || lmp->begline) + && (rmp->is[0] != '\0' || rmp->endline)) { lmp->is = icatalloc (lmp->is, rmp->is); if (lmp->is == NULL) goto done; + lmp->endline = rmp->endline; } else - lmp->is[0] = '\0'; + { + lmp->is[0] = '\0'; + lmp->begline = 0; + lmp->endline = 0; + } } break; default: @@ -4116,6 +4144,8 @@ done: { MALLOC (dm, 1); dm->exact = exact; + dm->begline = begline; + dm->endline = endline; dm->must = xmemdup (result, strlen (result) + 1); dm->next = d->musts; d->musts = dm; diff --git a/src/dfa.h b/src/dfa.h index 24fbcbe..7aa1b9a 100644 --- a/src/dfa.h +++ b/src/dfa.h @@ -26,6 +26,8 @@ struct dfamust { int exact; + int begline; + int endline; char *must; struct dfamust *next; }; diff --git a/src/dfasearch.c b/src/dfasearch.c index ca00505..f0ab4a8 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -51,6 +51,8 @@ static size_t pcount; call the regexp matcher at all. */ static size_t kwset_exact_matches; +static int begline; + void dfaerror (char const *mesg) { @@ -92,16 +94,31 @@ kwsmusts (void) { char const *err; kwsinit (&kwset); + begline = 0; /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index of the matching string that it chooses. */ for (; dm; dm = dm->next) { + char *must, *mp; + size_t old_len, new_len; if (!dm->exact) continue; ++kwset_exact_matches; - if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != NULL) + old_len = strlen (dm->must); + new_len = old_len + dm->begline + dm->endline; + must = mp = xmalloc (new_len); + if (dm->begline) + { + (mp++)[0] = eolbyte; + begline = 1; + } + memcpy (mp, dm->must, old_len); + if (dm->endline) + mp[old_len] = eolbyte; + if ((err = kwsincr (kwset, must, new_len)) != NULL) error (EXIT_TROUBLE, 0, "%s", err); + free (must); } /* Now, we compile the substrings that will require the use of the regexp matcher. */ @@ -223,7 +240,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size, if (kwset) { /* Find a possible match using the KWset matcher. */ - size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); + size_t offset = kwsexec (kwset, beg - begline, + buflim - beg + begline, &kwsm); if (offset == (size_t) -1) goto failure; beg += offset; @@ -239,11 +257,12 @@ EGexecute (char const *buf, size_t size, size_t *match_size, char const *dfa_start = beg; if (kwsm.index < kwset_exact_matches) { + if (MB_CUR_MAX == 1) + goto success; if (mb_start < beg) mb_start = beg; - if (MB_CUR_MAX == 1 - || !is_mb_middle (&mb_start, match, buflim, - kwsm.size[0])) + if (!is_mb_middle (&mb_start, match, buflim, + kwsm.size[0] - begline)) goto success; /* The matched line starts in the middle of a multibyte character. Perform the DFA search starting from the diff --git a/src/kwsearch.c b/src/kwsearch.c index dd01518..1e471d0 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -65,8 +65,20 @@ Fcompile (char const *pattern, size_t size) #endif } - if ((err = kwsincr (kwset, beg, end - beg)) != NULL) - error (EXIT_TROUBLE, 0, "%s", err); + if (match_lines) + { + char *new_beg = xmalloc (end - beg + 2); + new_beg[0] = new_beg[end - beg + 1] = eolbyte; + memcpy (&new_beg[1], beg, end - beg); + if ((err = kwsincr (kwset, new_beg, end - beg + 2)) != NULL) + error (EXIT_TROUBLE, 0, "%s", err); + free (new_beg); + } + else + { + if ((err = kwsincr (kwset, beg, end - beg)) != NULL) + error (EXIT_TROUBLE, 0, "%s", err); + } beg = lim; } while (beg < pat + psize); @@ -119,11 +131,11 @@ Fexecute (char const *buf, size_t size, size_t *match_size, for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { - size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + size_t offset = kwsexec (kwset, beg - match_lines, buf + size - beg + match_lines, &kwsmatch); if (offset == (size_t) -1) goto failure; - len = kwsmatch.size[0]; - if (MB_CUR_MAX > 1 + len = kwsmatch.size[0] - match_lines; + if (MB_CUR_MAX > 1 && !match_lines && is_mb_middle (&mb_start, beg + offset, buf + size, len)) { /* The match was a part of multibyte character, advance at least @@ -140,14 +152,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size, if (start_ptr && !match_words) goto success_in_beg_and_len; if (match_lines) - { - if (beg > buf && beg[-1] != eol) - continue; - if (beg + len < buf + size && beg[len] != eol) - continue; - goto success; - } - else if (match_words) + goto success_in_beg_and_len; + if (match_words) for (try = beg; ; ) { if (try > buf && WCHAR(to_uchar (try[-1]))) -- 1.9.1