From e9496f8d82ea9ba260d337a2d8fae32161633637 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 22 Nov 2014 15:16:55 +0900 Subject: [PATCH 2/3] grep immediately return without longest match to search multiple fixed words if not needed Searching multiple fixed words, grep immediately returns without longest match if not needed. Without this change, grep tries longest match for multiple words even if not needed. * src/kwset.c (kwsexec, acexec, cwexec, bmexec): Add an argument which is bool whether needed longest match or not. All callers are changed. * src/kwset.h (kwsexec): Update prototype. --- src/dfasearch.c | 2 +- src/kwsearch.c | 18 +++++-- src/kwset.c | 165 ++++++++++++++++++++++++++++++-------------------------- src/kwset.h | 4 +- 4 files changed, 105 insertions(+), 84 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 77b4e3e..1eb24ec 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -239,7 +239,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, /* Find a possible match using the KWset matcher. */ size_t offset = kwsexec (kwset, beg - begline, - buflim - beg + begline, &kwsm); + buflim - beg + begline, &kwsm, true); if (offset == (size_t) -1) goto failure; match = beg + offset; diff --git a/src/kwsearch.c b/src/kwsearch.c index 1335a26..c44ec08 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -111,6 +111,8 @@ Fexecute (char const *buf, size_t size, size_t *match_size, struct kwsmatch kwsmatch; size_t ret_val; mb_len_map_t *map = NULL; + bool mb_check; + bool longest; if (MB_CUR_MAX > 1) { @@ -123,15 +125,23 @@ Fexecute (char const *buf, size_t size, size_t *match_size, } } + if (match_lines) + mb_check = longest = false; + else + { + mb_check = MB_CUR_MAX > 1 && !using_utf8 (); + longest = mb_check || start_ptr || match_words; + } + for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) { size_t offset = kwsexec (kwset, beg - match_lines, - buf + size - beg + match_lines, &kwsmatch); + buf + size - beg + match_lines, &kwsmatch, + longest); if (offset == (size_t) -1) goto failure; len = kwsmatch.size[0] - 2 * match_lines; - if (!match_lines && MB_CUR_MAX > 1 && !using_utf8 () - && mb_goback (&mb_start, beg + offset, buf + size) != 0) + if (mb_check && mb_goback (&mb_start, beg + offset, buf + size) != 0) { /* We have matched a single byte that is not at the beginning of a multibyte character. mb_goback has advanced MB_START past that @@ -166,7 +176,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size, { if (!len) break; - offset = kwsexec (kwset, beg, --len, &kwsmatch); + offset = kwsexec (kwset, beg, --len, &kwsmatch, true); if (offset == (size_t) -1) break; try = beg + offset; diff --git a/src/kwset.c b/src/kwset.c index 9585f08..a02bade 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -116,7 +116,8 @@ struct kwset bool reverse; /* kwsexec() implementation. */ - size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *); + size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *, + bool); }; /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ @@ -126,9 +127,12 @@ tr (char const *trans, char c) return trans ? trans[U(c)] : c; } -static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *); -static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *); -static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); +static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); +static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool); /* Allocate and initialize a keyword set object, returning an opaque pointer to it. */ @@ -722,7 +726,7 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size) /* Fast Boyer-Moore search. */ static size_t bmexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ @@ -742,7 +746,8 @@ bmexec (kwset_t kwset, char const *text, size_t size, /* Hairy multiple string search with Commentz-Walter algorithm. */ static size_t _GL_ARG_NONNULL ((4)) -cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) +cwexec (kwset_t kwset, char const *text, size_t len, + struct kwsmatch *kwsmatch, bool longest) { struct trie * const *next; struct trie const *trie; @@ -837,56 +842,59 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) /* Given a known match, find the longest possible match anchored at or before its starting point. This is nearly a verbatim copy of the preceding main search loops. */ - if (lim - mch > kwset->maxd) - lim = mch + kwset->maxd; - lmch = 0; - d = 1; - while (lim - end >= d) + if (longest) { - if ((d = delta[c = (end += d)[-1]]) != 0) - continue; - beg = end - 1; - if (!(trie = next[c])) - { - d = 1; - continue; - } - if (trie->accepting && beg <= mch) - { - lmch = beg; - accept = trie; - } - d = trie->shift; - while (beg > text) + if (lim - mch > kwset->maxd) + lim = mch + kwset->maxd; + lmch = 0; + d = 1; + while (lim - end >= d) { - unsigned char uc = *--beg; - c = trans ? trans[uc] : uc; - tree = trie->links; - while (tree && c != tree->label) - if (c < tree->label) - tree = tree->llink; - else - tree = tree->rlink; - if (tree) + if ((d = delta[c = (end += d)[-1]]) != 0) + continue; + beg = end - 1; + if (!(trie = next[c])) { - trie = tree->trie; - if (trie->accepting && beg <= mch) + d = 1; + continue; + } + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } + d = trie->shift; + while (beg > text) + { + unsigned char uc = *--beg; + c = trans ? trans[uc] : uc; + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) { - lmch = beg; - accept = trie; + trie = tree->trie; + if (trie->accepting && beg <= mch) + { + lmch = beg; + accept = trie; + } } + else + break; + d = trie->shift; } - else - break; - d = trie->shift; - } - if (lmch) - { - mch = lmch; - goto match; + if (lmch) + { + mch = lmch; + goto match; + } + if (!d) + d = 1; } - if (!d) - d = 1; } kwsmatch->index = accept->accepting / 2; @@ -900,7 +908,7 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) (inlinable version) */ static inline size_t _GL_ARG_NONNULL ((4)) acexec_trans (kwset_t kwset, char const *text, size_t len, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { struct trie * const *next; struct trie const *trie, *accept; @@ -1020,30 +1028,33 @@ acexec_trans (kwset_t kwset, char const *text, size_t len, left = tp - accept->depth; /* Try left-most longest match. */ - while (tp < lim) + if (longest) { - struct trie const *accept1; - char const *left1; - c = tr (trans, *tp++); - tree = trie->links; - while (tree && c != tree->label) - if (c < tree->label) - tree = tree->llink; - else - tree = tree->rlink; - if (!tree) - break; - trie = tree->trie; - if (!trie->accepting) - continue; - accept1 = trie; - while (accept1->accepting == (size_t) -1) - accept1 = accept1->fail; - left1 = tp - accept1->depth; - if (left1 <= left) + while (tp < lim) { - left = left1; - accept = accept1; + struct trie const *accept1; + char const *left1; + c = tr (trans, *tp++); + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (!tree) + break; + trie = tree->trie; + if (!trie->accepting) + continue; + accept1 = trie; + while (accept1->accepting == (size_t) -1) + accept1 = accept1->fail; + left1 = tp - accept1->depth; + if (left1 <= left) + { + left = left1; + accept = accept1; + } } } @@ -1057,13 +1068,13 @@ acexec_trans (kwset_t kwset, char const *text, size_t len, /* Hairy multiple string search with Aho-Corasick algorithm. */ static size_t acexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ return (kwset->trans - ? acexec_trans (kwset, text, size, kwsmatch) - : acexec_trans (kwset, text, size, kwsmatch)); + ? acexec_trans (kwset, text, size, kwsmatch, longest) + : acexec_trans (kwset, text, size, kwsmatch, longest)); } /* Search TEXT for a match of any member of KWSET. @@ -1073,9 +1084,9 @@ acexec (kwset_t kwset, char const *text, size_t size, value), and length. */ size_t kwsexec (kwset_t kwset, char const *text, size_t size, - struct kwsmatch *kwsmatch) + struct kwsmatch *kwsmatch, bool const longest) { - return kwset->kwsexec (kwset, text, size, kwsmatch); + return kwset->kwsexec (kwset, text, size, kwsmatch, longest); } /* Free the components of the given keyword set. */ diff --git a/src/kwset.h b/src/kwset.h index 2476db3..e36c860 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -54,8 +54,8 @@ extern void kwsprep (kwset_t); the matching substring in the integer it points to. Similarly, if foundindex is non-NULL, store the index of the particular keyword found therein. */ -extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *) - _GL_ARG_NONNULL ((4)); +extern size_t kwsexec (kwset_t, char const *, size_t, struct kwsmatch *, + bool const) _GL_ARG_NONNULL ((4)); /* Deallocate the given keyword set and all its associated storage. */ extern void kwsfree (kwset_t); -- 2.2.0