From 755f39d4c2e2822d4c241e357d258e5b25ba0d04 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 22 Nov 2014 14:47:45 +0900 Subject: [PATCH 1/3] grep: use Aho-Corasick algorithm to search multiple fixed words Searching multiple fixed words, grep uses Commentz-Walter algorithm, but it is very slow for a worst case e.g. as following. It is O(m*n). - input: yes `printf %040d` | head -10000000 - word1: x0000000000000000000 - word2: x This change uses Aho-Corasick algorithm instead of Commentz-Walter algorithm to search multiple fixed words. It uses a function to build tries which has been already defined for Commentz-Walter algorithm in kwset.c and which has been already high quality. I see 7x speed-up even for a typical case on Fedora 21 with a 3.2GHz i5 by this change. First without this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 11.37 user 11.03 sys 0.24 Next with this change (best-of-5 trials): find /usr/share/doc/ -type f | LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null real 1.49 user 1.31 sys 0.15 * src/kwset.c (struct kwset): Add a new member 'mode'. (kwsalloc): Use it. All callers are changed. (kwsincr): Using Aho-Corasick algorithm, build tries in normal order. (acexec_trans, acexec): Add a new function. (kwsexec): Use it. * src/kwset.h (kwsalloc): Update a prototype. * NEWS (Improvements): Mention it. --- NEWS | 5 + src/kwset.c | 326 +++++++++++++++++++++++++++++++++++++++++++++--------- src/kwset.h | 3 +- src/searchutils.c | 4 +- 4 files changed, 282 insertions(+), 56 deletions(-) diff --git a/NEWS b/NEWS index 6138c4e..653213a 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** Improvements + + grep -F is now typically 7 times faster for a lot of words. The + improvement replace Commentz-Walter algorithm which has been used for + a long time to Aho-Corasick algorithm. * Noteworthy changes in release 2.21 (2014-11-23) [stable] diff --git a/src/kwset.c b/src/kwset.c index 6d21893..9585f08 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -35,7 +35,6 @@ #include "kwset.h" -#include #include #include #include "system.h" @@ -67,7 +66,7 @@ struct tree char balance; /* Difference in depths of subtrees. */ }; -/* Node of a trie representing a set of reversed keywords. */ +/* Node of a trie representing a set of keywords. */ struct trie { size_t accepting; /* Word index of accepted word, or zero. */ @@ -111,6 +110,13 @@ struct kwset more such matches; e.g., Greek has three sigma characters that all match when case-folding. */ int gc1help; + + /* If true, prefer Aho-Corasick algorithm to Beate Commentz-Walter + algorithm in multiple words. */ + bool reverse; + + /* kwsexec() implementation. */ + size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *); }; /* Use TRANS to transliterate C. A null TRANS does no transliteration. */ @@ -120,10 +126,14 @@ tr (char const *trans, char c) return trans ? trans[U(c)] : c; } +static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *); +static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *); + /* Allocate and initialize a keyword set object, returning an opaque pointer to it. */ kwset_t -kwsalloc (char const *trans) +kwsalloc (char const *trans, bool const reverse) { struct kwset *kwset = xmalloc (sizeof *kwset); @@ -141,6 +151,11 @@ kwsalloc (char const *trans) kwset->maxd = -1; kwset->target = NULL; kwset->trans = trans; + kwset->reverse = reverse; + if (reverse) + kwset->kwsexec = cwexec; + else + kwset->kwsexec = acexec; return kwset; } @@ -156,13 +171,14 @@ kwsincr (kwset_t kwset, char const *text, size_t len) struct trie *trie = kwset->trie; char const *trans = kwset->trans; - text += len; + if (kwset->reverse) + text += len; - /* Descend the trie (built of reversed keywords) character-by-character, + /* Descend the trie (built of keywords) character-by-character, installing new nodes when necessary. */ while (len--) { - unsigned char uc = *--text; + unsigned char uc = kwset->reverse ? *--text : *text++; unsigned char label = trans ? trans[uc] : uc; /* Descend the tree of outgoing links for this trie node, @@ -311,15 +327,15 @@ enqueue (struct tree *tree, struct trie **last) well as a last resort failure node. */ static void treefails (struct tree const *tree, struct trie const *fail, - struct trie *recourse) + struct trie *recourse, bool reverse) { struct tree *link; if (!tree) return; - treefails(tree->llink, fail, recourse); - treefails(tree->rlink, fail, recourse); + treefails(tree->llink, fail, recourse, reverse); + treefails(tree->rlink, fail, recourse, reverse); /* Find, in the chain of fails going back to the root, the first node that has a descendant on the current label. */ @@ -334,6 +350,8 @@ treefails (struct tree const *tree, struct trie const *fail, if (link) { tree->trie->fail = link->trie; + if (!reverse && link->trie->accepting && !tree->trie->accepting) + tree->trie->accepting = -1; return; } fail = fail->fail; @@ -396,6 +414,34 @@ kwsprep (kwset_t kwset) int i; unsigned char deltabuf[NCHAR]; unsigned char *delta = trans ? deltabuf : kwset->delta; + struct trie *curr, *last; + + if (kwset->words == 1) + { + if (!kwset->reverse) + { + kwset_t new_kwset; + + /* Enqueue the immediate descendants in the level order queue. */ + for (curr = last = kwset->trie; curr; curr = curr->next) + enqueue (curr->links, &last); + + /* Looking for just one string. Extract it from the trie. */ + kwset->target = obstack_alloc (&kwset->obstack, kwset->mind); + for (i = 0, curr = kwset->trie; i < kwset->mind; ++i) + { + kwset->target[i] = curr->links->label; + curr = curr->next; + } + + new_kwset = kwsalloc (kwset->trans, true); + kwsincr (new_kwset, kwset->target, kwset->mind); + obstack_free (&kwset->obstack, NULL); + *kwset = *new_kwset; + free (new_kwset); + } + kwset->kwsexec = bmexec; + } /* Initial values for the delta table; will be changed later. The delta entry for a given character is the smallest depth of any @@ -404,49 +450,54 @@ kwsprep (kwset_t kwset) /* Traverse the nodes of the trie in level order, simultaneously computing the delta table, failure function, and shift function. */ - struct trie *curr, *last; for (curr = last = kwset->trie; curr; curr = curr->next) { /* Enqueue the immediate descendants in the level order queue. */ enqueue (curr->links, &last); - curr->shift = kwset->mind; - curr->maxshift = kwset->mind; - /* Update the delta table for the descendants of this node. */ treedelta (curr->links, curr->depth, delta); /* Compute the failure function for the descendants of this node. */ - treefails (curr->links, curr->fail, kwset->trie); + treefails (curr->links, curr->fail, kwset->trie, kwset->reverse); - /* Update the shifts at each node in the current node's chain - of fails back to the root. */ - struct trie *fail; - for (fail = curr->fail; fail; fail = fail->fail) + if (kwset->reverse) { - /* If the current node has some outgoing edge that the fail - doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery (fail->links, curr->links)) - if (curr->depth - fail->depth < fail->shift) - fail->shift = curr->depth - fail->depth; - - /* If the current node is accepting then the shift at the - fail and its descendants should be no larger than the - difference of their depths. */ - if (curr->accepting && fail->maxshift > curr->depth - fail->depth) - fail->maxshift = curr->depth - fail->depth; + curr->shift = kwset->mind; + curr->maxshift = kwset->mind; + + /* Update the shifts at each node in the current node's chain + of fails back to the root. */ + struct trie *fail; + for (fail = curr->fail; fail; fail = fail->fail) + { + /* If the current node has some outgoing edge that the fail + doesn't, then the shift at the fail should be no larger + than the difference of their depths. */ + if (!hasevery (fail->links, curr->links)) + if (curr->depth - fail->depth < fail->shift) + fail->shift = curr->depth - fail->depth; + + /* If the current node is accepting then the shift at the + fail and its descendants should be no larger than the + difference of their depths. */ + if (curr->accepting && fail->maxshift > curr->depth - fail->depth) + fail->maxshift = curr->depth - fail->depth; + } } } - /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ - for (curr = kwset->trie->next; curr; curr = curr->next) + if (kwset->reverse) { - if (curr->maxshift > curr->parent->maxshift) - curr->maxshift = curr->parent->maxshift; - if (curr->shift > curr->maxshift) - curr->shift = curr->maxshift; + /* Traverse the trie in level order again, fixing up all nodes whose + shift exceeds their inherited maxshift. */ + for (curr = kwset->trie->next; curr; curr = curr->next) + { + if (curr->maxshift > curr->parent->maxshift) + curr->maxshift = curr->parent->maxshift; + if (curr->shift > curr->maxshift) + curr->shift = curr->maxshift; + } } /* Create a vector, indexed by character code, of the outgoing links @@ -470,6 +521,7 @@ kwsprep (kwset_t kwset) kwset->target[i] = curr->links->label; curr = curr->next; } + /* Looking for the delta2 shift that we might make after a backwards match has failed. Extract it from the trie. */ if (kwset->mind > 1) @@ -669,16 +721,26 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size) /* Fast Boyer-Moore search. */ static size_t -bmexec (kwset_t kwset, char const *text, size_t size) +bmexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) { /* Help the compiler inline bmexec_trans in two ways, depending on whether kwset->trans is null. */ - return (kwset->trans + size_t ret = (kwset->trans ? bmexec_trans (kwset, text, size) : bmexec_trans (kwset, text, size)); + + if (ret != (size_t) -1) + { + kwsmatch->index = 0; + kwsmatch->offset[0] = ret; + kwsmatch->size[0] = kwset->mind; + } + + return ret; } -/* Hairy multiple string search. */ +/* Hairy multiple string search with Commentz-Walter algorithm. */ static size_t _GL_ARG_NONNULL ((4)) cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) { @@ -834,6 +896,176 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch) return mch - text; } +/* Hairy multiple string search with Aho-Corasick algorithm. + (inlinable version) */ +static inline size_t _GL_ARG_NONNULL ((4)) +acexec_trans (kwset_t kwset, char const *text, size_t len, + struct kwsmatch *kwsmatch) +{ + struct trie * const *next; + struct trie const *trie, *accept; + char const *tp, *left, *lim; + unsigned char c; + struct tree const *tree; + char const *trans; + +#ifdef lint + accept = NULL; +#endif + + /* Initialize register copies and look for easy ways out. */ + if (len < kwset->mind) + return -1; + + next = kwset->next; + trans = kwset->trans; + lim = text + len; + tp = text; + + if (kwset->trie->accepting) + { + trie = kwset->trie; + goto match; + } + + trie = next[U(tr (trans, *tp++))]; + + while (true) + { + if (tp < lim - 8) + { + while (!trie) + { + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + trie = next[U(tr (trans, *tp++))]; + if (trie) + break; + if (tp >= lim - 8) + break; + trie = next[U(tr (trans, *tp++))]; + } + } + + while (!trie) + { + if (tp >= lim) + return -1; + trie = next[U(tr (trans, *tp++))]; + } + + if (trie->accepting) + goto match; + if (tp >= lim) + return -1; + c = tr (trans, *tp++); + tree = trie->links; + + while (true) + { + if (c == tree->label) + { + trie = tree->trie; + if (trie->accepting) + goto match; + if (tp >= lim) + return -1; + c = tr (trans, *tp++); + tree = trie->links; + continue; + } + else if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (tree) + continue; + trie = trie->fail; + if (!trie) + break; + if (trie->accepting) + { + --tp; + goto match; + } + tree = trie->links; + } + + trie = next[c]; + } + + match: + accept = trie; + while (accept->accepting == (size_t) -1) + accept = accept->fail; + left = tp - accept->depth; + + /* Try left-most longest match. */ + while (tp < lim) + { + struct trie const *accept1; + char const *left1; + c = tr (trans, *tp++); + tree = trie->links; + while (tree && c != tree->label) + if (c < tree->label) + tree = tree->llink; + else + tree = tree->rlink; + if (!tree) + break; + trie = tree->trie; + if (!trie->accepting) + continue; + accept1 = trie; + while (accept1->accepting == (size_t) -1) + accept1 = accept1->fail; + left1 = tp - accept1->depth; + if (left1 <= left) + { + left = left1; + accept = accept1; + } + } + + kwsmatch->index = accept->accepting / 2; + kwsmatch->offset[0] = left - text; + kwsmatch->size[0] = accept->depth; + + return left - text; +} + +/* Hairy multiple string search with Aho-Corasick algorithm. */ +static size_t +acexec (kwset_t kwset, char const *text, size_t size, + struct kwsmatch *kwsmatch) +{ + /* Help the compiler inline bmexec_trans in two ways, depending on + whether kwset->trans is null. */ + return (kwset->trans + ? acexec_trans (kwset, text, size, kwsmatch) + : acexec_trans (kwset, text, size, kwsmatch)); +} + /* Search TEXT for a match of any member of KWSET. Return the offset (into TEXT) of the first byte of the matching substring, or (size_t) -1 if no match is found. Upon a match, store details in @@ -843,19 +1075,7 @@ size_t kwsexec (kwset_t kwset, char const *text, size_t size, struct kwsmatch *kwsmatch) { - if (kwset->words == 1) - { - size_t ret = bmexec (kwset, text, size); - if (ret != (size_t) -1) - { - kwsmatch->index = 0; - kwsmatch->offset[0] = ret; - kwsmatch->size[0] = kwset->mind; - } - return ret; - } - else - return cwexec (kwset, text, size, kwsmatch); + return kwset->kwsexec (kwset, text, size, kwsmatch); } /* Free the components of the given keyword set. */ diff --git a/src/kwset.h b/src/kwset.h index 12afb8e..2476db3 100644 --- a/src/kwset.h +++ b/src/kwset.h @@ -22,6 +22,7 @@ or (US mail) as Mike Haertel c/o Free Software Foundation. */ #include +#include struct kwsmatch { @@ -38,7 +39,7 @@ typedef struct kwset *kwset_t; /* Return an opaque pointer to a newly allocated keyword set. A nonnull arg specifies a table of character translations to be applied to all pattern and search text. */ -extern kwset_t kwsalloc (char const *); +extern kwset_t kwsalloc (char const *, bool const); /* Incrementally extend the keyword set to include the given string. Remember an index number for each keyword included in the set. */ diff --git a/src/searchutils.c b/src/searchutils.c index 9edc785..314ab8e 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -39,10 +39,10 @@ kwsinit (kwset_t *kwset) for (i = 0; i < NCHAR; ++i) trans[i] = toupper (i); - *kwset = kwsalloc (trans); + *kwset = kwsalloc (trans, false); } else - *kwset = kwsalloc (NULL); + *kwset = kwsalloc (NULL, false); if (!*kwset) xalloc_die (); -- 2.2.0