From 755f39d4c2e2822d4c241e357d258e5b25ba0d04 Mon Sep 17 00:00:00 2001
From: Norihiro Tanaka
Date: Sat, 22 Nov 2014 14:47:45 +0900
Subject: [PATCH 1/3] grep: use Aho-Corasick algorithm to search multiple fixed
words
Searching multiple fixed words, grep uses Commentz-Walter algorithm, but
it is very slow for a worst case e.g. as following. It is O(m*n).
- input: yes `printf %040d` | head -10000000
- word1: x0000000000000000000
- word2: x
This change uses Aho-Corasick algorithm instead of Commentz-Walter
algorithm to search multiple fixed words. It uses a function to build
tries which has been already defined for Commentz-Walter algorithm in
kwset.c and which has been already high quality.
I see 7x speed-up even for a typical case on Fedora 21 with a 3.2GHz i5
by this change.
First without this change (best-of-5 trials):
find /usr/share/doc/ -type f |
LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null
real 11.37 user 11.03 sys 0.24
Next with this change (best-of-5 trials):
find /usr/share/doc/ -type f |
LC_ALL=C time -p xargs.sh src/grep -Ff /usr/share/dict/linux.words >/dev/null
real 1.49 user 1.31 sys 0.15
* src/kwset.c (struct kwset): Add a new member 'mode'.
(kwsalloc): Use it.
All callers are changed.
(kwsincr): Using Aho-Corasick algorithm, build tries in normal order.
(acexec_trans, acexec): Add a new function.
(kwsexec): Use it.
* src/kwset.h (kwsalloc): Update a prototype.
* NEWS (Improvements): Mention it.
---
NEWS | 5 +
src/kwset.c | 326 +++++++++++++++++++++++++++++++++++++++++++++---------
src/kwset.h | 3 +-
src/searchutils.c | 4 +-
4 files changed, 282 insertions(+), 56 deletions(-)
diff --git a/NEWS b/NEWS
index 6138c4e..653213a 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,11 @@ GNU grep NEWS -*- outline -*-
* Noteworthy changes in release ?.? (????-??-??) [?]
+** Improvements
+
+ grep -F is now typically 7 times faster for a lot of words. The
+ improvement replace Commentz-Walter algorithm which has been used for
+ a long time to Aho-Corasick algorithm.
* Noteworthy changes in release 2.21 (2014-11-23) [stable]
diff --git a/src/kwset.c b/src/kwset.c
index 6d21893..9585f08 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -35,7 +35,6 @@
#include "kwset.h"
-#include
#include
#include
#include "system.h"
@@ -67,7 +66,7 @@ struct tree
char balance; /* Difference in depths of subtrees. */
};
-/* Node of a trie representing a set of reversed keywords. */
+/* Node of a trie representing a set of keywords. */
struct trie
{
size_t accepting; /* Word index of accepted word, or zero. */
@@ -111,6 +110,13 @@ struct kwset
more such matches; e.g., Greek has three sigma characters that
all match when case-folding. */
int gc1help;
+
+ /* If true, prefer Aho-Corasick algorithm to Beate Commentz-Walter
+ algorithm in multiple words. */
+ bool reverse;
+
+ /* kwsexec() implementation. */
+ size_t (*kwsexec) (kwset_t, char const *, size_t, struct kwsmatch *);
};
/* Use TRANS to transliterate C. A null TRANS does no transliteration. */
@@ -120,10 +126,14 @@ tr (char const *trans, char c)
return trans ? trans[U(c)] : c;
}
+static size_t acexec (kwset_t, char const *, size_t, struct kwsmatch *);
+static size_t cwexec (kwset_t, char const *, size_t, struct kwsmatch *);
+static size_t bmexec (kwset_t, char const *, size_t, struct kwsmatch *);
+
/* Allocate and initialize a keyword set object, returning an opaque
pointer to it. */
kwset_t
-kwsalloc (char const *trans)
+kwsalloc (char const *trans, bool const reverse)
{
struct kwset *kwset = xmalloc (sizeof *kwset);
@@ -141,6 +151,11 @@ kwsalloc (char const *trans)
kwset->maxd = -1;
kwset->target = NULL;
kwset->trans = trans;
+ kwset->reverse = reverse;
+ if (reverse)
+ kwset->kwsexec = cwexec;
+ else
+ kwset->kwsexec = acexec;
return kwset;
}
@@ -156,13 +171,14 @@ kwsincr (kwset_t kwset, char const *text, size_t len)
struct trie *trie = kwset->trie;
char const *trans = kwset->trans;
- text += len;
+ if (kwset->reverse)
+ text += len;
- /* Descend the trie (built of reversed keywords) character-by-character,
+ /* Descend the trie (built of keywords) character-by-character,
installing new nodes when necessary. */
while (len--)
{
- unsigned char uc = *--text;
+ unsigned char uc = kwset->reverse ? *--text : *text++;
unsigned char label = trans ? trans[uc] : uc;
/* Descend the tree of outgoing links for this trie node,
@@ -311,15 +327,15 @@ enqueue (struct tree *tree, struct trie **last)
well as a last resort failure node. */
static void
treefails (struct tree const *tree, struct trie const *fail,
- struct trie *recourse)
+ struct trie *recourse, bool reverse)
{
struct tree *link;
if (!tree)
return;
- treefails(tree->llink, fail, recourse);
- treefails(tree->rlink, fail, recourse);
+ treefails(tree->llink, fail, recourse, reverse);
+ treefails(tree->rlink, fail, recourse, reverse);
/* Find, in the chain of fails going back to the root, the first
node that has a descendant on the current label. */
@@ -334,6 +350,8 @@ treefails (struct tree const *tree, struct trie const *fail,
if (link)
{
tree->trie->fail = link->trie;
+ if (!reverse && link->trie->accepting && !tree->trie->accepting)
+ tree->trie->accepting = -1;
return;
}
fail = fail->fail;
@@ -396,6 +414,34 @@ kwsprep (kwset_t kwset)
int i;
unsigned char deltabuf[NCHAR];
unsigned char *delta = trans ? deltabuf : kwset->delta;
+ struct trie *curr, *last;
+
+ if (kwset->words == 1)
+ {
+ if (!kwset->reverse)
+ {
+ kwset_t new_kwset;
+
+ /* Enqueue the immediate descendants in the level order queue. */
+ for (curr = last = kwset->trie; curr; curr = curr->next)
+ enqueue (curr->links, &last);
+
+ /* Looking for just one string. Extract it from the trie. */
+ kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
+ for (i = 0, curr = kwset->trie; i < kwset->mind; ++i)
+ {
+ kwset->target[i] = curr->links->label;
+ curr = curr->next;
+ }
+
+ new_kwset = kwsalloc (kwset->trans, true);
+ kwsincr (new_kwset, kwset->target, kwset->mind);
+ obstack_free (&kwset->obstack, NULL);
+ *kwset = *new_kwset;
+ free (new_kwset);
+ }
+ kwset->kwsexec = bmexec;
+ }
/* Initial values for the delta table; will be changed later. The
delta entry for a given character is the smallest depth of any
@@ -404,49 +450,54 @@ kwsprep (kwset_t kwset)
/* Traverse the nodes of the trie in level order, simultaneously
computing the delta table, failure function, and shift function. */
- struct trie *curr, *last;
for (curr = last = kwset->trie; curr; curr = curr->next)
{
/* Enqueue the immediate descendants in the level order queue. */
enqueue (curr->links, &last);
- curr->shift = kwset->mind;
- curr->maxshift = kwset->mind;
-
/* Update the delta table for the descendants of this node. */
treedelta (curr->links, curr->depth, delta);
/* Compute the failure function for the descendants of this node. */
- treefails (curr->links, curr->fail, kwset->trie);
+ treefails (curr->links, curr->fail, kwset->trie, kwset->reverse);
- /* Update the shifts at each node in the current node's chain
- of fails back to the root. */
- struct trie *fail;
- for (fail = curr->fail; fail; fail = fail->fail)
+ if (kwset->reverse)
{
- /* If the current node has some outgoing edge that the fail
- doesn't, then the shift at the fail should be no larger
- than the difference of their depths. */
- if (!hasevery (fail->links, curr->links))
- if (curr->depth - fail->depth < fail->shift)
- fail->shift = curr->depth - fail->depth;
-
- /* If the current node is accepting then the shift at the
- fail and its descendants should be no larger than the
- difference of their depths. */
- if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
- fail->maxshift = curr->depth - fail->depth;
+ curr->shift = kwset->mind;
+ curr->maxshift = kwset->mind;
+
+ /* Update the shifts at each node in the current node's chain
+ of fails back to the root. */
+ struct trie *fail;
+ for (fail = curr->fail; fail; fail = fail->fail)
+ {
+ /* If the current node has some outgoing edge that the fail
+ doesn't, then the shift at the fail should be no larger
+ than the difference of their depths. */
+ if (!hasevery (fail->links, curr->links))
+ if (curr->depth - fail->depth < fail->shift)
+ fail->shift = curr->depth - fail->depth;
+
+ /* If the current node is accepting then the shift at the
+ fail and its descendants should be no larger than the
+ difference of their depths. */
+ if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
+ fail->maxshift = curr->depth - fail->depth;
+ }
}
}
- /* Traverse the trie in level order again, fixing up all nodes whose
- shift exceeds their inherited maxshift. */
- for (curr = kwset->trie->next; curr; curr = curr->next)
+ if (kwset->reverse)
{
- if (curr->maxshift > curr->parent->maxshift)
- curr->maxshift = curr->parent->maxshift;
- if (curr->shift > curr->maxshift)
- curr->shift = curr->maxshift;
+ /* Traverse the trie in level order again, fixing up all nodes whose
+ shift exceeds their inherited maxshift. */
+ for (curr = kwset->trie->next; curr; curr = curr->next)
+ {
+ if (curr->maxshift > curr->parent->maxshift)
+ curr->maxshift = curr->parent->maxshift;
+ if (curr->shift > curr->maxshift)
+ curr->shift = curr->maxshift;
+ }
}
/* Create a vector, indexed by character code, of the outgoing links
@@ -470,6 +521,7 @@ kwsprep (kwset_t kwset)
kwset->target[i] = curr->links->label;
curr = curr->next;
}
+
/* Looking for the delta2 shift that we might make after a
backwards match has failed. Extract it from the trie. */
if (kwset->mind > 1)
@@ -669,16 +721,26 @@ bmexec_trans (kwset_t kwset, char const *text, size_t size)
/* Fast Boyer-Moore search. */
static size_t
-bmexec (kwset_t kwset, char const *text, size_t size)
+bmexec (kwset_t kwset, char const *text, size_t size,
+ struct kwsmatch *kwsmatch)
{
/* Help the compiler inline bmexec_trans in two ways, depending on
whether kwset->trans is null. */
- return (kwset->trans
+ size_t ret = (kwset->trans
? bmexec_trans (kwset, text, size)
: bmexec_trans (kwset, text, size));
+
+ if (ret != (size_t) -1)
+ {
+ kwsmatch->index = 0;
+ kwsmatch->offset[0] = ret;
+ kwsmatch->size[0] = kwset->mind;
+ }
+
+ return ret;
}
-/* Hairy multiple string search. */
+/* Hairy multiple string search with Commentz-Walter algorithm. */
static size_t _GL_ARG_NONNULL ((4))
cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
{
@@ -834,6 +896,176 @@ cwexec (kwset_t kwset, char const *text, size_t len, struct kwsmatch *kwsmatch)
return mch - text;
}
+/* Hairy multiple string search with Aho-Corasick algorithm.
+ (inlinable version) */
+static inline size_t _GL_ARG_NONNULL ((4))
+acexec_trans (kwset_t kwset, char const *text, size_t len,
+ struct kwsmatch *kwsmatch)
+{
+ struct trie * const *next;
+ struct trie const *trie, *accept;
+ char const *tp, *left, *lim;
+ unsigned char c;
+ struct tree const *tree;
+ char const *trans;
+
+#ifdef lint
+ accept = NULL;
+#endif
+
+ /* Initialize register copies and look for easy ways out. */
+ if (len < kwset->mind)
+ return -1;
+
+ next = kwset->next;
+ trans = kwset->trans;
+ lim = text + len;
+ tp = text;
+
+ if (kwset->trie->accepting)
+ {
+ trie = kwset->trie;
+ goto match;
+ }
+
+ trie = next[U(tr (trans, *tp++))];
+
+ while (true)
+ {
+ if (tp < lim - 8)
+ {
+ while (!trie)
+ {
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ if (trie)
+ break;
+ if (tp >= lim - 8)
+ break;
+ trie = next[U(tr (trans, *tp++))];
+ }
+ }
+
+ while (!trie)
+ {
+ if (tp >= lim)
+ return -1;
+ trie = next[U(tr (trans, *tp++))];
+ }
+
+ if (trie->accepting)
+ goto match;
+ if (tp >= lim)
+ return -1;
+ c = tr (trans, *tp++);
+ tree = trie->links;
+
+ while (true)
+ {
+ if (c == tree->label)
+ {
+ trie = tree->trie;
+ if (trie->accepting)
+ goto match;
+ if (tp >= lim)
+ return -1;
+ c = tr (trans, *tp++);
+ tree = trie->links;
+ continue;
+ }
+ else if (c < tree->label)
+ tree = tree->llink;
+ else
+ tree = tree->rlink;
+ if (tree)
+ continue;
+ trie = trie->fail;
+ if (!trie)
+ break;
+ if (trie->accepting)
+ {
+ --tp;
+ goto match;
+ }
+ tree = trie->links;
+ }
+
+ trie = next[c];
+ }
+
+ match:
+ accept = trie;
+ while (accept->accepting == (size_t) -1)
+ accept = accept->fail;
+ left = tp - accept->depth;
+
+ /* Try left-most longest match. */
+ while (tp < lim)
+ {
+ struct trie const *accept1;
+ char const *left1;
+ c = tr (trans, *tp++);
+ tree = trie->links;
+ while (tree && c != tree->label)
+ if (c < tree->label)
+ tree = tree->llink;
+ else
+ tree = tree->rlink;
+ if (!tree)
+ break;
+ trie = tree->trie;
+ if (!trie->accepting)
+ continue;
+ accept1 = trie;
+ while (accept1->accepting == (size_t) -1)
+ accept1 = accept1->fail;
+ left1 = tp - accept1->depth;
+ if (left1 <= left)
+ {
+ left = left1;
+ accept = accept1;
+ }
+ }
+
+ kwsmatch->index = accept->accepting / 2;
+ kwsmatch->offset[0] = left - text;
+ kwsmatch->size[0] = accept->depth;
+
+ return left - text;
+}
+
+/* Hairy multiple string search with Aho-Corasick algorithm. */
+static size_t
+acexec (kwset_t kwset, char const *text, size_t size,
+ struct kwsmatch *kwsmatch)
+{
+ /* Help the compiler inline bmexec_trans in two ways, depending on
+ whether kwset->trans is null. */
+ return (kwset->trans
+ ? acexec_trans (kwset, text, size, kwsmatch)
+ : acexec_trans (kwset, text, size, kwsmatch));
+}
+
/* Search TEXT for a match of any member of KWSET.
Return the offset (into TEXT) of the first byte of the matching substring,
or (size_t) -1 if no match is found. Upon a match, store details in
@@ -843,19 +1075,7 @@ size_t
kwsexec (kwset_t kwset, char const *text, size_t size,
struct kwsmatch *kwsmatch)
{
- if (kwset->words == 1)
- {
- size_t ret = bmexec (kwset, text, size);
- if (ret != (size_t) -1)
- {
- kwsmatch->index = 0;
- kwsmatch->offset[0] = ret;
- kwsmatch->size[0] = kwset->mind;
- }
- return ret;
- }
- else
- return cwexec (kwset, text, size, kwsmatch);
+ return kwset->kwsexec (kwset, text, size, kwsmatch);
}
/* Free the components of the given keyword set. */
diff --git a/src/kwset.h b/src/kwset.h
index 12afb8e..2476db3 100644
--- a/src/kwset.h
+++ b/src/kwset.h
@@ -22,6 +22,7 @@
or (US mail) as Mike Haertel c/o Free Software Foundation. */
#include
+#include
struct kwsmatch
{
@@ -38,7 +39,7 @@ typedef struct kwset *kwset_t;
/* Return an opaque pointer to a newly allocated keyword set. A nonnull arg
specifies a table of character translations to be applied to all
pattern and search text. */
-extern kwset_t kwsalloc (char const *);
+extern kwset_t kwsalloc (char const *, bool const);
/* Incrementally extend the keyword set to include the given string.
Remember an index number for each keyword included in the set. */
diff --git a/src/searchutils.c b/src/searchutils.c
index 9edc785..314ab8e 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -39,10 +39,10 @@ kwsinit (kwset_t *kwset)
for (i = 0; i < NCHAR; ++i)
trans[i] = toupper (i);
- *kwset = kwsalloc (trans);
+ *kwset = kwsalloc (trans, false);
}
else
- *kwset = kwsalloc (NULL);
+ *kwset = kwsalloc (NULL, false);
if (!*kwset)
xalloc_die ();
--
2.2.0