[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Grammatica-users] Massive performance improvement by backing LookAheadS
From: |
Steve Davis |
Subject: |
[Grammatica-users] Massive performance improvement by backing LookAheadSet with a HashSet |
Date: |
Tue, 31 May 2005 19:25:03 +0200 |
User-agent: |
Mozilla Thunderbird 1.0.2 (Windows/20050317) |
Hi,
I have been working on a non-trivial grammar, and found that after a
certain point in complexity, Grammatica was taking 10-12 seconds to
parse each individual source file.
By changing the implementation of LookAheadSet (backed by a HashSet not
an ArrayList) as shown below, I am now getting parse timings of about
400-500 ms.
...a twenty-fold improvement.
/*
....
*/
package net.percederberg.grammatica.parser;
import java.util.*;
/**
....
*/
class LookAheadSet {
/**
* The set of token look-ahead sequences. Each sequence in
* turn is represented by an ArrayList with Integers for the
* token id:s.
*/
private Set elements = new HashSet();
/**
* The maximum length of any look-ahead sequence.
*/
private int maxLength;
/**
* Creates a new look-ahead set with the specified maximum
* length.
*
* @param maxLength the maximum token sequence length
*/
public LookAheadSet(int maxLength) {
this.maxLength = maxLength;
}
/**
* Creates a duplicate look-ahead set, possibly with a different
* maximum length.
*
* @param maxLength the maximum token sequence length
* @param set the look-ahead set to copy
*/
public LookAheadSet(int maxLength, LookAheadSet set) {
this(maxLength);
addAll(set);
}
/**
* Returns the size of this look-ahead set.
*
* @return the number of token sequences in the set
*/
public int size() {
return elements.size();
}
/**
* Returns the length of the shortest token sequence in this
* set. This method will return zero (0) if the set is empty.
*
* @return the length of the shortest token sequence
*/
public int getMinLength() {
Sequence seq;
int min = -1;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (min < 0 || seq.length() < min) {
min = seq.length();
}
}
return (min < 0) ? 0 : min;
}
/**
* Returns the length of the longest token sequence in this
* set. This method will return zero (0) if the set is empty.
*
* @return the length of the longest token sequence
*/
public int getMaxLength() {
Sequence seq;
int max = 0;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (seq.length() > max) {
max = seq.length();
}
}
return max;
}
/**
* Returns a list of the initial token id:s in this look-ahead
* set. The list returned will not contain any duplicates.
*
* @return a list of the inital token id:s in this look-ahead set
*/
public int[] getInitialTokens() {
ArrayList list = new ArrayList();
int[] result;
Integer token;
int i;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
token = ((Sequence) iter.next()).getToken(0);
if (token != null && !list.contains(token)) {
list.add(token);
}
}
result = new int[list.size()];
for (i = 0; i < list.size(); i++) {
result[i] = ((Integer) list.get(i)).intValue();
}
return result;
}
/**
* Checks if this look-ahead set contains a repetitive token
* sequence.
*
* @return true if at least one token sequence is repetitive, or
* false otherwise
*/
public boolean isRepetitive() {
Sequence seq;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (seq.isRepetitive()) {
return true;
}
}
return false;
}
/**
* Checks if the next token(s) in the parser match any token
* sequence in this set.
*
* @param parser the parser to check
*
* @return true if the next tokens are in the set, or
* false otherwise
*/
public boolean isNext(Parser parser) {
Sequence seq;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (seq.isNext(parser)) {
return true;
}
}
return false;
}
/**
* Checks if the next token(s) in the parser match any token
* sequence in this set.
*
* @param parser the parser to check
* @param length the maximum number of tokens to check
*
* @return true if the next tokens are in the set, or
* false otherwise
*/
public boolean isNext(Parser parser, int length) {
Sequence seq;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (seq.isNext(parser, length)) {
return true;
}
}
return false;
}
/**
* Checks if another look-ahead set has an overlapping token
* sequence. An overlapping token sequence is a token sequence
* that is identical to another sequence, but for the length. I.e.
* one of the two sequences may be longer than the other.
*
* @param set the look-ahead set to check
*
* @return true if there is some token sequence that overlaps, or
* false otherwise
*/
public boolean isOverlap(LookAheadSet set) {
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
if (set.isOverlap((Sequence)iter.next())) {
return true;
}
}
return false;
}
/**
* Checks if a token sequence is overlapping. An overlapping token
* sequence is a token sequence that is identical to another
* sequence, but for the length. I.e. one of the two sequences may
* be longer than the other.
*
* @param seq the token sequence to check
*
* @return true if there is some token sequence that overlaps, or
* false otherwise
*/
private boolean isOverlap(Sequence seq) {
Sequence elem;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
elem = (Sequence)iter.next();
if (seq.startsWith(elem) || elem.startsWith(seq)) {
return true;
}
}
return false;
}
/**
* Checks if the specified token sequence is present in the
* set.
*
* @param elem the token sequence to check
*
* @return true if the sequence is present in this set, or
* false otherwise
*/
private boolean contains(Sequence elem) {
return findSequence(elem) != null;
}
/**
* Checks if some token sequence is present in both this set
* and a specified one.
*
* @param set the look-ahead set to compare with
*
* @return true if the look-ahead sets intersect, or
* false otherwise
*/
public boolean intersects(LookAheadSet set) {
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
if (set.contains((Sequence)iter.next())) {
return true;
}
}
return false;
}
/**
* Finds an identical token sequence if present in the set.
*
* @param elem the token sequence to search for
*
* @return an identical the token sequence if found, or
* null if not found
*/
private Sequence findSequence(Sequence elem) {
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
Sequence seq = (Sequence)iter.next();
if (seq.equals(elem)) {
return seq;
}
}
return null;
}
/**
* Adds a token sequence to this set. The sequence will only be
* added if it is not already in the set. Also, if the sequence is
* longer than the allowed maximum, a truncated sequence will be
* added instead.
*
* @param seq the token sequence to add
*/
private void add(Sequence seq) {
if (seq.length() > maxLength) {
seq = new Sequence(maxLength, seq);
}
elements.add(seq);
}
/**
* Adds a new token sequence with a single token to this set. The
* sequence will only be added if it is not already in the set.
*
* @param token the token to add
*/
public void add(int token) {
add(new Sequence(false, token));
}
/**
* Adds all the token sequences from a specified set. Only
* sequences not already in this set will be added.
*
* @param set the set to add from
*/
public void addAll(LookAheadSet set) {
for (Iterator iter = set.iterator(); iter.hasNext(); ) {
add((Sequence) iter.next());
}
}
public Iterator iterator() {
return elements.iterator();
}
/**
* Adds an empty token sequence to this set. The sequence will
* only be added if it is not already in the set.
*/
public void addEmpty() {
add(new Sequence());
}
/**
* Removes a token sequence from this set.
*
* @param seq the token sequence to remove
*/
private void remove(Sequence seq) {
elements.remove(seq);
}
/**
* Removes all the token sequences from a specified set. Only
* sequences already in this set will be removed.
*
* @param set the set to remove from
*/
public void removeAll(LookAheadSet set) {
for (Iterator iter = set.iterator(); iter.hasNext(); ) {
remove((Sequence) iter.next());
}
}
/**
* Creates a new look-ahead set that is the result of reading the
* specified token. The new look-ahead set will contain the
* rest of all the token sequences that started with the specified
* token.
*
* @param token the token to read
*
* @return a new look-ahead set containing the remaining tokens
*/
public LookAheadSet createNextSet(int token) {
LookAheadSet result = new LookAheadSet(maxLength - 1);
Sequence seq;
Integer value;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
value = seq.getToken(0);
if (value != null && value.intValue() == token) {
result.add(seq.subsequence(1));
}
}
return result;
}
/**
* Creates a new look-ahead set that is the intersection of
* this set with another set. The token sequences in the net set
* will only have the repeat flag set if it was set in both the
* identical token sequences.
*
* @param set the set to intersect with
*
* @return a new look-ahead set containing the intersection
*/
public LookAheadSet createIntersection(LookAheadSet set) {
LookAheadSet result = new LookAheadSet(maxLength);
Sequence seq1;
Sequence seq2;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq1 = (Sequence) iter.next();
seq2 = set.findSequence(seq1);
if (seq2 != null && seq1.isRepetitive()) {
result.add(seq2);
} else if (seq2 != null) {
result.add(seq1);
}
}
return result;
}
/**
* Creates a new look-ahead set that is the combination of
* this set with another set. The combination is created by
* creating new token sequences that consist of appending all
* elements from the specified set onto all elements in this set.
* This is sometimes referred to as the cartesian product.
*
* @param set the set to combine with
*
* @return a new look-ahead set containing the combination
*/
public LookAheadSet createCombination(LookAheadSet set) {
LookAheadSet result = new LookAheadSet(maxLength);
Sequence first;
Sequence second;
// Handle special cases
if (this.size() <= 0) {
return set;
} else if (set.size() <= 0) {
return this;
}
// Create combinations
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
first = (Sequence)iter.next();
if (first.length() >= maxLength) {
result.add(first);
} else if (first.length() <= 0) {
result.addAll(set);
} else {
for (Iterator iter2 = set.iterator(); iter2.hasNext(); ) {
second = (Sequence) iter2.next();
result.add(first.concat(maxLength, second));
}
}
}
return result;
}
/**
* Creates a new look-ahead set with overlaps from another. All
* token sequences in this set that overlaps with the other set
* will be added to the new look-ahead set.
*
* @param set the look-ahead set to check with
*
* @return a new look-ahead set containing the overlaps
*/
public LookAheadSet createOverlaps(LookAheadSet set) {
LookAheadSet result = new LookAheadSet(maxLength);
Sequence seq;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence) iter.next();
if (set.isOverlap(seq)) {
result.add(seq);
}
}
return result;
}
/**
* Creates a new look-ahead set filter. The filter will contain
* all sequences from this set, possibly left trimmed by each one
* of the sequences in the specified set.
*
* @param set the look-ahead set to trim with
*
* @return a new look-ahead set filter
*/
public LookAheadSet createFilter(LookAheadSet set) {
LookAheadSet result = new LookAheadSet(maxLength);
Sequence first;
Sequence second;
// Handle special cases
if (this.size() <= 0 || set.size() <= 0) {
return this;
}
// Create combinations
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
first = (Sequence)iter.next();
for (Iterator iter2 = set.iterator(); iter2.hasNext(); ) {
second = (Sequence) iter2.next();
if (first.startsWith(second)) {
result.add(first.subsequence(second.length()));
}
}
}
return result;
}
/**
* Creates a new identical look-ahead set, except for the repeat
* flag being set in each token sequence.
*
* @return a new repetitive look-ahead set
*/
public LookAheadSet createRepetitive() {
LookAheadSet result = new LookAheadSet(maxLength);
Sequence seq;
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence)iter.next();
if (seq.isRepetitive()) {
result.add(seq);
} else {
result.add(new Sequence(true, seq));
}
}
return result;
}
/**
* Returns a string representation of this object.
*
* @return a string representation of this object
*/
public String toString() {
return toString(null);
}
/**
* Returns a string representation of this object.
*
* @param tokenizer the tokenizer containing the tokens
*
* @return a string representation of this object
*/
public String toString(Tokenizer tokenizer) {
StringBuffer buffer = new StringBuffer();
Sequence seq;
buffer.append("{");
for (Iterator iter = elements.iterator(); iter.hasNext(); ) {
seq = (Sequence) iter.next();
buffer.append("\n ");
buffer.append(seq.toString(tokenizer));
}
buffer.append("\n}");
return buffer.toString();
}
/**
* A token sequence. This class contains a list of token ids. It
* is immutable after creation, meaning that no changes will be
* made to an instance after creation.
*
* @author Per Cederberg, <per at percederberg dot net>
* @version 1.0
*/
private class Sequence {
/**
* The repeat flag. If this flag is set, the token sequence
* or some part of it may be repeated infinitely.
*/
private boolean repeat = false;
/**
* The list of token ids in this sequence.
*/
private ArrayList tokens = null;
/**
* Creates a new empty token sequence. The repeat flag will be
* set to false.
*/
public Sequence() {
this.repeat = false;
this.tokens = new ArrayList(0);
}
/**
* Creates a new token sequence with a single token.
*
* @param repeat the repeat flag value
* @param token the token to add
*/
public Sequence(boolean repeat, int token) {
this.repeat = false;
this.tokens = new ArrayList(1);
this.tokens.add(new Integer(token));
}
/**
* Creates a new token sequence that is a duplicate of another
* sequence. Only a limited number of tokens will be copied
* however. The repeat flag from the original will be kept
* intact.
*
* @param length the maximum number of tokens to copy
* @param seq the sequence to copy
*/
public Sequence(int length, Sequence seq) {
this.repeat = seq.repeat;
this.tokens = new ArrayList(length);
if (seq.length() < length) {
length = seq.length();
}
for (int i = 0; i < length; i++) {
tokens.add(seq.tokens.get(i));
}
}
/**
* Creates a new token sequence that is a duplicate of another
* sequence. The new value of the repeat flag will be used
* however.
*
* @param repeat the new repeat flag value
* @param seq the sequence to copy
*/
public Sequence(boolean repeat, Sequence seq) {
this.repeat = repeat;
this.tokens = seq.tokens;
}
/**
* Returns the length of the token sequence.
*
* @return the number of tokens in the sequence
*/
public int length() {
return tokens.size();
}
/**
* Returns a token at a specified position in the sequence.
*
* @param pos the sequence position
*
* @return the token id found, or null
*/
public Integer getToken(int pos) {
if (pos >= 0 && pos < tokens.size()) {
return (Integer) tokens.get(pos);
} else {
return null;
}
}
/**
* Checks if this sequence is equal to another object. Only
* token sequences with the same tokens in the same order will
* be considered equal. The repeat flag will be disregarded.
*
* @param obj the object to compare with
*
* @return true if the objects are equal, or
* false otherwise
*/
public boolean equals(Object obj) {
if (obj instanceof Sequence) {
return tokens.equals(((Sequence) obj).tokens);
} else {
return false;
}
}
/**
* Checks if this token sequence starts with the tokens from
* another sequence. If the other sequence is longer than this
* sequence, this method will always return false.
*
* @param seq the token sequence to check
*
* @return true if this sequence starts with the other, or
* false otherwise
*/
public boolean startsWith(Sequence seq) {
if (length() < seq.length()) {
return false;
}
for (int i = 0; i < seq.tokens.size(); i++) {
if (!tokens.get(i).equals(seq.tokens.get(i))) {
return false;
}
}
return true;
}
/**
* Checks if this token sequence is repetitive. A repetitive
* token sequence is one with the repeat flag set.
*
* @return true if this token sequence is repetitive, or
* false otherwise
*/
public boolean isRepetitive() {
return repeat;
}
/**
* Checks if the next token(s) in the parser matches this
* token sequence.
*
* @param parser the parser to check
*
* @return true if the next tokens are in the sequence, or
* false otherwise
*/
public boolean isNext(Parser parser) {
Token token;
Integer id;
for (int i = 0; i < tokens.size(); i++) {
id = (Integer) tokens.get(i);
token = parser.peekToken(i);
if (token == null || token.getId() != id.intValue()) {
return false;
}
}
return true;
}
/**
* Checks if the next token(s) in the parser matches this
* token sequence.
*
* @param parser the parser to check
* @param length the maximum number of tokens to check
*
* @return true if the next tokens are in the sequence, or
* false otherwise
*/
public boolean isNext(Parser parser, int length) {
Token token;
Integer id;
if (length > tokens.size()) {
length = tokens.size();
}
for (int i = 0; i < length; i++) {
id = (Integer) tokens.get(i);
token = parser.peekToken(i);
if (token == null || token.getId() != id.intValue()) {
return false;
}
}
return true;
}
/**
* Returns a string representation of this object.
*
* @return a string representation of this object
*/
public String toString() {
return toString(null);
}
/**
* Returns a string representation of this object.
*
* @param tokenizer the tokenizer containing the tokens
*
* @return a string representation of this object
*/
public String toString(Tokenizer tokenizer) {
StringBuffer buffer = new StringBuffer();
String str;
Integer id;
if (tokenizer == null) {
buffer.append(tokens.toString());
} else {
buffer.append("[");
for (int i = 0; i < tokens.size(); i++) {
id = (Integer) tokens.get(i);
str = tokenizer.getPatternDescription(id.intValue());
if (i > 0) {
buffer.append(" ");
}
buffer.append(str);
}
buffer.append("]");
}
if (repeat) {
buffer.append(" *");
}
return buffer.toString();
}
/**
* Creates a new token sequence that is the concatenation of
* this sequence and another. A maximum length for the new
* sequence is also specified.
*
* @param length the maximum length of the result
* @param seq the other sequence
*
* @return the concatenated token sequence
*/
public Sequence concat(int length, Sequence seq) {
Sequence res = new Sequence(length, this);
if (seq.repeat) {
res.repeat = true;
}
length -= this.length();
if (length > seq.length()) {
res.tokens.addAll(seq.tokens);
} else {
for (int i = 0; i < length; i++) {
res.tokens.add(seq.tokens.get(i));
}
}
return res;
}
/**
* Creates a new token sequence that is a subsequence of this
* one.
*
* @param start the subsequence start position
*
* @return the new token subsequence
*/
public Sequence subsequence(int start) {
Sequence res = new Sequence(length(), this);
while (start > 0 && res.tokens.size() > 0) {
res.tokens.remove(0);
start--;
}
return res;
}
}
}
- [Grammatica-users] Massive performance improvement by backing LookAheadSet with a HashSet,
Steve Davis <=