[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[cp-patches] RFC: gnu.regexp: fixed bugs in stingy match of RETokenRepea
From: |
Ito Kazumitsu |
Subject: |
[cp-patches] RFC: gnu.regexp: fixed bugs in stingy match of RETokenRepeated |
Date: |
Mon, 23 Jan 2006 23:06:42 +0900 (JST) |
Another fix of gnu.regexp package.
After checkin this in, I will move some more Mauve testcases of
gnu/testlet/java/util/regex/Pattern/testdata2
to
gnu/testlet/java/util/regex/Pattern/testdata1
In my local environment, 436 cases in testdata1 pass.
Remained in testdata2 are:
(1) (?=pattern) and (?!pattern)
(2) (?<=pattern) and (?<!pattern)
(3) (?>pattern)
(4) (?x)
(5) \z and \G
(6) (a\1?) --- Back reference within the referenced group.
ChangeLog
2006-01-23 Ito Kazumitsu <address@hidden>
Fixes bug #24876
* gnu/regexp/gnu/regexp/RE.java(getMatchImpl):
Changed the way of finding the best match: in case of
stingy match, the less repeated, the better.
(compareRepeats): New method.
* gnu/regexp/REMatch.java(stingy): New boolean field
indicating that some stingy matching was done.
(repeats): New vector for memorizing the number of repeats
when some stingy matching was done.
* gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
* gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java 22 Jan 2006 02:22:21 -0000 1.12
+++ classpath/gnu/regexp/RE.java 23 Jan 2006 13:43:11 -0000
@@ -1343,17 +1343,27 @@
// Optimization: check if anchor + minimumLength > length
if (minimumLength == 0 || input.charAt(minimumLength-1) !=
CharIndexed.OUT_OF_BOUNDS) {
if (match(input, mymatch)) {
- // Find longest match of them all to observe leftmost longest
- REMatch longest = mymatch;
- while ((mymatch = mymatch.next) != null) {
- if (mymatch.index > longest.index) {
- longest = mymatch;
+ REMatch best = mymatch;
+ if (! best.stingy) {
+ // Find best match of them all to observe leftmost longest
+ while ((mymatch = mymatch.next) != null) {
+ if (mymatch.index > best.index) {
+ best = mymatch;
+ }
}
}
-
- longest.end[0] = longest.index;
- longest.finish(input);
- return longest;
+ else {
+ // Find best match of them all to observe
+ // leftmost least repeated
+ while ((mymatch = mymatch.next) != null) {
+ if (compareRepeats(mymatch, best) < 0) {
+ best = mymatch;
+ }
+ }
+ }
+ best.end[0] = best.index;
+ best.finish(input);
+ return best;
}
}
mymatch.clear(++anchor);
@@ -1374,6 +1384,21 @@
return null;
}
+ private int compareRepeats(REMatch m1, REMatch m2) {
+ Vector v1 = m1.repeats;
+ Vector v2 = m2.repeats;
+ int s1 = v1.size();
+ int s2 = v2.size();
+ for (int i = 0; i < s1; i++) {
+ if (i >= s2) return 1;
+ int r1 = ((Integer)(v1.elementAt(i))).intValue();
+ int r2 = ((Integer)(v2.elementAt(i))).intValue();
+ if (r1 != r2) return (r1 - r2);
+ }
+ // If numbers of repeats are equal, then the longer, the better
+ return (m2.index - m1.index);
+ }
+
/**
* Returns an REMatchEnumeration that can be used to iterate over the
* matches found in the input text.
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.3
diff -u -r1.3 REMatch.java
--- classpath/gnu/regexp/REMatch.java 22 Jan 2006 02:22:21 -0000 1.3
+++ classpath/gnu/regexp/REMatch.java 23 Jan 2006 13:43:12 -0000
@@ -38,6 +38,7 @@
package gnu.regexp;
import java.io.Serializable;
+import java.util.Vector;
/**
* An instance of this class represents a match
@@ -68,6 +69,11 @@
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
boolean empty; // empty string matched
+ boolean stingy; // stingy repeated token matched
+ Vector repeats; // number of repeats of each stingy repeated token
+ // Request For Comment: The Vector repeats contains number of repeats
+ // from left to right without regard to what the token is.
+ // I am not quite sure this is reasonable.
public Object clone() {
try {
@@ -77,6 +83,8 @@
copy.start = (int[]) start.clone();
copy.end = (int[]) end.clone();
+ copy.repeats = (Vector) repeats.clone();
+
return copy;
} catch (CloneNotSupportedException e) {
throw new Error(); // doesn't happen
@@ -90,6 +98,8 @@
// need to deep clone?
next = other.next;
empty = other.empty;
+ stingy = other.stingy;
+ repeats = other.repeats;
}
REMatch(int subs, int anchor, int eflags) {
@@ -127,6 +137,8 @@
}
next = null; // cut off alternates
empty = false;
+ stingy = false;
+ repeats = new Vector();
}
/**
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java 23 Jan 2006 13:18:10 -0000
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java 23 Jan 2006 13:43:12 -0000
@@ -98,7 +98,7 @@
REMatch last = null;
REToken tk;
for (int i=0; i < options.size(); i++) {
- // In ordaer that the backtracking can work,
+ // In order that the backtracking can work,
// each option must be chained to the next token.
// But the chain method has some side effect, so
// we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 22 Jan 2006 02:22:21 -0000
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java 23 Jan 2006 13:43:12 -0000
@@ -113,20 +113,24 @@
REMatch recurrent;
int lastIndex = mymatch.index;
- do {
- // Check for stingy match for each possibility.
- if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- mymatch.empty = (mymatch.index == origin);
- return true;
- }
- else {
- // Special case of {0}. It must always match an empty string.
- if (alwaysEmpty) return false;
- }
+ // {0} needs some special treatment.
+ if (alwaysEmpty) {
+ REMatch result = matchRest(input, newMatch);
+ if (result != null) {
+ mymatch.assignFrom(result);
+ mymatch.empty = (mymatch.index == origin);
+ return true;
+ }
+ else {
+ return false;
}
+ }
+
+ do {
+ // We want to check something like
+ // if (stingy && (numRepeats >= min)
+ // and neglect the further matching. But experience tells
+ // such neglection may cause incomplete matching.
doables = null;
doablesLast = null;
@@ -222,17 +226,34 @@
// desired.
indexCount = 1;
}
+ if (stingy) {
+ posIndex = (posIndex - indexCount - 1);
+ numRepeats = min - 1;
+ }
while (indexCount-- > 0) {
- --posIndex;
+ if (stingy) {
+ ++posIndex;
+ ++numRepeats;
+ }
+ else --posIndex;
newMatch = (REMatch) positions.elementAt(posIndex);
results = matchRest(input, newMatch);
if (results != null) {
+ if (stingy) {
+ results.stingy = stingy;
+ REMatch tmp = results;
+ while (tmp != null) {
+ tmp.repeats.add(new Integer(numRepeats));
+ tmp = tmp.next;
+ }
+ }
if (allResults == null) {
allResults = results;
allResultsLast = results;
} else {
// Order these from longest to shortest
// Start by assuming longest (more repeats)
+ // If stingy the order is shortest to longest.
allResultsLast.next = results;
}
// Find new doablesLast