From ad03d4eb22be1b907221af78f4a3b4bcd85e1fc2 Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Sat, 15 Mar 2014 14:41:52 +0900 Subject: [PATCH 1/3] grep: use the Galil rule for Boyer-Moore algorithm in KWSet The Boyer-Moore algorithm is O(m*n), which means it may be much slower than the DFA. Its Galil rule variant is O(n) and increases efficiency in the typical case; it skips sections that are known to match and does not compare more than once for a position in the text. To use the Galil rule, look for the delta2 shift at each position from the trie instead of the 'mind2' value. * src/kwset.c (struct kwset): Replace member 'mind2' with 'shift'. (kwsprep): Look for the delta2 shift. (bmexec): Use it. --- src/kwset.c | 214 ++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 127 insertions(+), 87 deletions(-) diff --git a/src/kwset.c b/src/kwset.c index 410e046..e0bf6b2 100644 --- a/src/kwset.c +++ b/src/kwset.c @@ -83,7 +83,7 @@ struct kwset unsigned char delta[NCHAR]; /* Delta table for rapid search. */ struct trie *next[NCHAR]; /* Table of children of the root. */ char *target; /* Target string if there's only one. */ - int mind2; /* Used in Boyer-Moore search for one string. */ + int *shift; /* Used in Boyer-Moore search for one string. */ char const *trans; /* Character translation table. */ }; @@ -397,12 +397,70 @@ kwsprep (kwset_t kws) node at which an outgoing edge is labeled by that character. */ memset(delta, kwset->mind < UCHAR_MAX ? kwset->mind : UCHAR_MAX, NCHAR); + struct trie *fail; + struct trie *last, *next[NCHAR]; + + /* Traverse the nodes of the trie in level order, simultaneously + computing the delta table, failure function, and shift function. */ + for (curr = last = kwset->trie; curr; curr = curr->next) + { + /* Enqueue the immediate descendants in the level order queue. */ + enqueue(curr->links, &last); + + curr->shift = kwset->mind; + curr->maxshift = kwset->mind; + + /* Update the delta table for the descendants of this node. */ + treedelta(curr->links, curr->depth, delta); + + /* Compute the failure function for the descendants of this node. */ + treefails(curr->links, curr->fail, kwset->trie); + + /* Update the shifts at each node in the current node's chain + of fails back to the root. */ + for (fail = curr->fail; fail; fail = fail->fail) + { + /* If the current node has some outgoing edge that the fail + doesn't, then the shift at the fail should be no larger + than the difference of their depths. */ + if (!hasevery(fail->links, curr->links)) + if (curr->depth - fail->depth < fail->shift) + fail->shift = curr->depth - fail->depth; + + /* If the current node is accepting then the shift at the + fail and its descendants should be no larger than the + difference of their depths. */ + if (curr->accepting && fail->maxshift > curr->depth - fail->depth) + fail->maxshift = curr->depth - fail->depth; + } + } + + /* Traverse the trie in level order again, fixing up all nodes whose + shift exceeds their inherited maxshift. */ + for (curr = kwset->trie->next; curr; curr = curr->next) + { + if (curr->maxshift > curr->parent->maxshift) + curr->maxshift = curr->parent->maxshift; + if (curr->shift > curr->maxshift) + curr->shift = curr->maxshift; + } + + /* Create a vector, indexed by character code, of the outgoing links + from the root node. */ + for (i = 0; i < NCHAR; ++i) + next[i] = NULL; + treenext(kwset->trie->links, next); + + if ((trans = kwset->trans) != NULL) + for (i = 0; i < NCHAR; ++i) + kwset->next[i] = next[U(trans[i])]; + else + memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); + /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ if (kwset->words == 1 && kwset->trans == NULL) { - char c; - /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); if (!kwset->target) @@ -410,80 +468,22 @@ kwsprep (kwset_t kws) for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) { kwset->target[i] = curr->links->label; - curr = curr->links->trie; + curr = curr->next; } - /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ - for (i = 0; i < kwset->mind; ++i) - delta[U(kwset->target[i])] = kwset->mind - (i + 1); - /* Find the minimal delta2 shift that we might make after - a backwards match has failed. */ - c = kwset->target[kwset->mind - 1]; - for (i = kwset->mind - 2; i >= 0; --i) - if (kwset->target[i] == c) - break; - kwset->mind2 = kwset->mind - (i + 1); - } - else - { - struct trie *fail; - struct trie *last, *next[NCHAR]; - - /* Traverse the nodes of the trie in level order, simultaneously - computing the delta table, failure function, and shift function. */ - for (curr = last = kwset->trie; curr; curr = curr->next) + /* Looking for the delta2 shift that we might make after a + backwards match has failed. Extract it from the trie. */ + if (kwset->mind > 1) { - /* Enqueue the immediate descendants in the level order queue. */ - enqueue(curr->links, &last); - - curr->shift = kwset->mind; - curr->maxshift = kwset->mind; - - /* Update the delta table for the descendants of this node. */ - treedelta(curr->links, curr->depth, delta); - - /* Compute the failure function for the descendants of this node. */ - treefails(curr->links, curr->fail, kwset->trie); - - /* Update the shifts at each node in the current node's chain - of fails back to the root. */ - for (fail = curr->fail; fail; fail = fail->fail) + kwset->shift = obstack_alloc(&kwset->obstack, + sizeof (*kwset->shift) * (kwset->mind - 1)); + if (!kwset->shift) + return _("memory exhausted"); + for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i) { - /* If the current node has some outgoing edge that the fail - doesn't, then the shift at the fail should be no larger - than the difference of their depths. */ - if (!hasevery(fail->links, curr->links)) - if (curr->depth - fail->depth < fail->shift) - fail->shift = curr->depth - fail->depth; - - /* If the current node is accepting then the shift at the - fail and its descendants should be no larger than the - difference of their depths. */ - if (curr->accepting && fail->maxshift > curr->depth - fail->depth) - fail->maxshift = curr->depth - fail->depth; + kwset->shift[i] = curr->shift; + curr = curr->next; } } - - /* Traverse the trie in level order again, fixing up all nodes whose - shift exceeds their inherited maxshift. */ - for (curr = kwset->trie->next; curr; curr = curr->next) - { - if (curr->maxshift > curr->parent->maxshift) - curr->maxshift = curr->parent->maxshift; - if (curr->shift > curr->maxshift) - curr->shift = curr->maxshift; - } - - /* Create a vector, indexed by character code, of the outgoing links - from the root node. */ - for (i = 0; i < NCHAR; ++i) - next[i] = NULL; - treenext(kwset->trie->links, next); - - if ((trans = kwset->trans) != NULL) - for (i = 0; i < NCHAR; ++i) - kwset->next[i] = next[U(trans[i])]; - else - memcpy(kwset->next, next, NCHAR * sizeof(struct trie *)); } /* Fix things up for any translation table. */ @@ -503,7 +503,8 @@ bmexec (kwset_t kws, char const *text, size_t size) struct kwset const *kwset; unsigned char const *d1; char const *ep, *sp, *tp; - int d, gc, i, len, md2; + int d, i, len, skip; + char gc1, gc2; kwset = (struct kwset const *) kws; len = kwset->mind; @@ -520,9 +521,10 @@ bmexec (kwset_t kws, char const *text, size_t size) d1 = kwset->delta; sp = kwset->target + len; - gc = U(sp[-2]); - md2 = kwset->mind2; + gc1 = sp[-1]; + gc2 = sp[-2]; tp = text + len; + skip = 0; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ if (size > 12 * len) @@ -550,14 +552,33 @@ bmexec (kwset_t kws, char const *text, size_t size) } break; found: - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - tp += md2; } /* Now we have only a few characters left to search. We @@ -569,14 +590,33 @@ bmexec (kwset_t kws, char const *text, size_t size) d = d1[U((tp += d)[-1])]; if (d != 0) continue; - if (U(tp[-2]) == gc) + d = len; + while (1) { - for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) - ; - if (i > len) - return tp - len - text; + if (tp[-2] == gc2) + { + for (i = 3; i <= d && tp[-i] == sp[-i]; ++i) + ; + if (i > d) + { + for (i = d + skip + 1; i <= len && tp[-i] == sp[-i]; ++i) + ; + if (i > len) + return tp - len - text; + } + d = kwset->shift[i - 2]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = i - 1; + } + else + { + d = kwset->shift[0]; tp += d; + if (tp > ep || tp[-1] != gc1) + break; + skip = 1; + } } - d = md2; } return -1; -- 1.9.0