[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 04/15] Lay out the trees more systematically
From: |
Nick Cleaton |
Subject: |
[PATCH 04/15] Lay out the trees more systematically |
Date: |
Sat, 2 Oct 2010 07:07:44 +0100 |
Lay out sibling nodes according to location within the sibling trees,
rather than adding sibling nodes as they are encountered and
rebalancing the trees with rotations.
---
src/kwset.c | 187 ++++++++++++++++-------------------------------------------
1 files changed, 51 insertions(+), 136 deletions(-)
diff --git a/src/kwset.c b/src/kwset.c
index e4712fb..079c28a 100644
--- a/src/kwset.c
+++ b/src/kwset.c
@@ -57,10 +57,9 @@
arranged as a balanced tree. */
struct trie
{
- struct trie *llink; /* Left tree link; MUST be first field. */
+ struct trie *llink; /* Left tree link (to smaller labels). */
struct trie *rlink; /* Right tree link (to larger labels). */
unsigned char label; /* Label on the incoming edge. */
- char balance; /* Difference in depths of subtrees. */
unsigned int accepting; /* Word index of accepted word, or zero. */
struct trie *links; /* Tree of edges leaving this node. */
struct trie *parent; /* Parent of this node. */
@@ -241,10 +240,34 @@ compute_brood_notes (struct kwset *kwset, struct
cpsort_dump *keywords,
return brood_dest + 1;
}
-/* Used to allocate trie nodes from the node vector when building the trie
- from the sorted reversed keyword list. */
+/* Pre-allocate and plumb in a tree of 'count' trie nodes. Push the nodes
+ onto the prealloced node list in right to left order. */
static struct trie *
-allocate_trie_node (struct kwset *kwset, int index, int depth)
+alloc_and_plumb_siblings (struct kwset *kwset, int count, int depth)
+{
+ if (count == 0)
+ return NULL;
+
+ int lcount = --count / 2;
+ int rcount = count - lcount;
+ struct trie *t = kwset->next_free_node++;
+
+ t->rlink = alloc_and_plumb_siblings (kwset, rcount, depth);
+ t->depth = depth;
+ t->links = kwset->next_prealloced;
+ kwset->next_prealloced = t;
+ t->llink = alloc_and_plumb_siblings (kwset, lcount, depth);
+
+ return t;
+}
+
+/* Used to allocate each trie node from the node vector when building the
+ trie from the sorted reversed keyword list, and to plumb the new node
+ into the trie. Responsible for setting 'llink' and 'rlink', and for
+ updating 'links' in the parent node. */
+static struct trie *
+allocate_and_plumb_trie_node (struct kwset *kwset, int index, int depth,
+ struct trie *parent)
{
if (kwset->bn_cursor->index == index && kwset->bn_cursor->depth == depth)
{
@@ -252,29 +275,31 @@ allocate_trie_node (struct kwset *kwset, int index, int
depth)
2 or more siblings. Pre-allocate a block of nodes for the siblings,
so that they are kept adjacent. Pre-allocated nodes are stored in
a linked list (using 'links' as the list pointer) and tagged with
- the depth at which they are to be used. */
- int siblings = kwset->bn_cursor->count - 1;
-
- while (siblings--)
- {
- kwset->next_free_node->depth = depth;
- kwset->next_free_node->links = kwset->next_prealloced;
- kwset->next_prealloced = kwset->next_free_node++;
- }
+ the depth at which they are to be used.
+
+ Plumb in llink and rlink for the pre-allocated node block now.
+ This requires that they be pushed onto the pre-alloced list in right
+ to left order, so that they will come off the list in left to right
+ order, matching the order in which keywords are being added. */
+ parent->links = alloc_and_plumb_siblings (kwset,
+ kwset->bn_cursor->count,
+ depth);
kwset->bn_cursor++;
- return kwset->next_free_node++;
}
if (kwset->next_prealloced && kwset->next_prealloced->depth == depth)
{
/* This node is part of a group of siblings for which a block of
- adjacent nodes has been pre-allocated. */
+ adjacent nodes has been allocated and plumbed in. */
struct trie *t = kwset->next_prealloced;
kwset->next_prealloced = kwset->next_prealloced->links;
return t;
}
- return kwset->next_free_node++;
+ /* This node is an only child. */
+ parent->links = kwset->next_free_node++;
+ parent->links->llink = parent->links->rlink = NULL;
+ return parent->links;
}
/* Add a keyword to the under-construction trie. */
@@ -285,143 +310,33 @@ install_suffix (kwset_t kws, struct cpsort_md *md, char
const *text,
struct kwset *kwset;
struct trie *trie;
struct trie *link;
- int depth, trie_depth = md->cpl, len = md->suf_len;
- struct trie *links[DEPTH_SIZE];
- enum { L, R } dirs[DEPTH_SIZE];
- struct trie *t, *r, *l, *rl, *lr;
+ int trie_depth = md->cpl, len = md->suf_len;
kwset = (struct kwset *) kws;
/* Start at the trie node at which this suffix slots in. */
trie = kwset->prev_kw_tries[trie_depth];
- if (len == 0)
- {
- trie->accepting = 1 + 2 * md->index;
- return;
- }
- /* Descend the tree of outgoing links, looking for the place that the first
- character of the suffix goes and keeping track of the path followed. */
- link = trie->links;
- links[0] = (struct trie *) &trie->links;
- dirs[0] = L;
- depth = 1;
-
- while (link)
- {
- links[depth] = link;
- if ((unsigned char) text[0] < link->label)
- dirs[depth++] = L, link = link->llink;
- else
- dirs[depth++] = R, link = link->rlink;
- }
-
- /* Install a new trie node for the first character of this suffix. */
- link = allocate_trie_node (kwset, index, trie_depth + 1);
- link->llink = NULL;
- link->rlink = NULL;
- link->balance = 0;
- if (dirs[--depth] == L)
- links[depth]->llink = link;
- else
- links[depth]->rlink = link;
-
- /* Back up the tree fixing the balance flags. */
- while (depth && !links[depth]->balance)
- {
- if (dirs[depth] == L)
- --links[depth]->balance;
- else
- ++links[depth]->balance;
- --depth;
- }
-
- /* Rebalance the tree by pointer rotations if necessary. */
- if (depth && ((dirs[depth] == L && --links[depth]->balance)
- || (dirs[depth] == R && ++links[depth]->balance)))
- {
- switch (links[depth]->balance)
- {
- case (char) -2:
- switch (dirs[depth + 1])
- {
- case L:
- r = links[depth], t = r->llink, rl = t->rlink;
- t->rlink = r, r->llink = rl;
- t->balance = r->balance = 0;
- break;
- case R:
- r = links[depth], l = r->llink, t = l->rlink;
- rl = t->rlink, lr = t->llink;
- t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
- l->balance = t->balance != 1 ? 0 : -1;
- r->balance = t->balance != (char) -1 ? 0 : 1;
- t->balance = 0;
- break;
- default:
- abort ();
- }
- break;
- case 2:
- switch (dirs[depth + 1])
- {
- case R:
- l = links[depth], t = l->rlink, lr = t->llink;
- t->llink = l, l->rlink = lr;
- t->balance = l->balance = 0;
- break;
- case L:
- l = links[depth], r = l->rlink, t = r->llink;
- lr = t->llink, rl = t->rlink;
- t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
- l->balance = t->balance != 1 ? 0 : -1;
- r->balance = t->balance != (char) -1 ? 0 : 1;
- t->balance = 0;
- break;
- default:
- abort ();
- }
- break;
- default:
- abort ();
- }
-
- if (dirs[depth - 1] == L)
- links[depth - 1]->llink = t;
- else
- links[depth - 1]->rlink = t;
- }
-
-
- /* Populate trie nodes for the characters of this suffix. The first node
- has already been allocated and installed, the others are simple because
- they have no siblings yet. */
- while (1)
+ /* Populate trie nodes for the characters of this suffix. */
+ while (len--)
{
+ link = allocate_and_plumb_trie_node (kwset, index, ++trie_depth, trie);
link->accepting = 0;
- link->links = NULL;
link->parent = trie;
link->next = NULL;
link->fail = NULL;
- link->depth = trie->depth + 1;
+ link->depth = trie_depth;
link->shift = 0;
link->label = *text++;
- kwset->prev_kw_tries[++trie_depth] = link;
-
- if (--len == 0)
- break;
-
+ kwset->prev_kw_tries[trie_depth] = link;
trie = link;
- link = allocate_trie_node (kwset, index, trie_depth + 1);
- link->llink = NULL;
- link->rlink = NULL;
- link->balance = 0;
- trie->links = link;
}
/* Mark the node we finally reached as accepting, encoding the
index number of this word in the keyword set. */
- link->accepting = 1 + 2 * md->index;
+ trie->accepting = 1 + 2 * md->index;
+
+ trie->links = NULL;
}
static char const *
--
1.5.6.3
- Proof of concept: kwset running 3 times smaller, Nick Cleaton, 2010/10/02
- [PATCH 01/15] Merge trie and tree into a single struct, Nick Cleaton, 2010/10/02
- [PATCH 02/15] Sort the keywords before building the trie, Nick Cleaton, 2010/10/02
- [PATCH 03/15] Lay out the trie more systematically, Nick Cleaton, 2010/10/02
- [PATCH 04/15] Lay out the trees more systematically,
Nick Cleaton <=
- [PATCH 05/15] Avoid using llink and rlink during prep, Nick Cleaton, 2010/10/02
- [PATCH 06/15] Defer llink/rlink plumbing to the end of prep, Nick Cleaton, 2010/10/02
- [PATCH 07/15] Share storage between llink and fail, Nick Cleaton, 2010/10/02
- [PATCH 08/15] Avoid using trie->depth during matching, Nick Cleaton, 2010/10/02
- [PATCH 09/15] Share storage between rlink and depth, Nick Cleaton, 2010/10/02
- [PATCH 10/15] Eliminate the trie->parent field, Nick Cleaton, 2010/10/02
- [PATCH 11/15] Eliminate the trie->next field, Nick Cleaton, 2010/10/02
- [PATCH 12/15] Eliminate the trie->maxshift field, Nick Cleaton, 2010/10/02
- [PATCH 13/15] Reduce trie->accepting to a flag, Nick Cleaton, 2010/10/02
- [PATCH 14/15] Merge flags into a trie->flags field, Nick Cleaton, 2010/10/02