[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated
From: |
Ito Kazumitsu |
Subject: |
Re: [cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated |
Date: |
Sat, 21 Jan 2006 01:56:31 +0900 (JST) |
From: Ito Kazumitsu <address@hidden>
Date: Thu, 19 Jan 2006 23:22:52 +0900 (JST)
> And this is my fix.
Slightly improved. And this is supposed to fix the bug #25837
ChangeLog
2006-01-20 Ito Kazumitsu <address@hidden>
Fixes bug #25837
* gnu/regexp/REMatch.java(empty): New boolean indicating
an empty string matched.
* gnu/regexp/RE.java(match): Sets empty flag when an empty
string matched.
(initialize): Support back reference \10, \11, and so on.
(parseInt): renamed from getEscapedChar and returns int.
* gnu/regexp/RETokenRepeated.java(match): Sets empty flag
when an empty string matched. Fixed a bug of the case where
an empty string matched. Added special handling of {0}.
* gnu/regexp/RETokenBackRef.java(match): Sets empty flag
when an empty string matched. Fixed the case insensitive matching.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.11
diff -u -r1.11 RE.java
--- classpath/gnu/regexp/RE.java 19 Jan 2006 13:45:51 -0000 1.11
+++ classpath/gnu/regexp/RE.java 20 Jan 2006 16:37:03 -0000
@@ -825,12 +825,31 @@
}
// BACKREFERENCE OPERATOR
- // \1 \2 ... \9
+ // \1 \2 ... \9 and \10 \11 \12 ...
// not available if RE_NO_BK_REFS is set
+ // Perl recognizes \10, \11, and so on only if enough number of
+ // parentheses have opened before it, otherwise they are treated
+ // as aliases of \010, \011, ... (octal characters). In case of
+ // Sun's JDK, octal character expression must always begin with \0.
+ // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
+ // JDK treats \2 as a back reference to the 2nd group because
+ // there are only two groups. But in our poor implementation,
+ // we cannot help but treat \29 as a back reference to the 29th group.
else if (unit.bk && Character.isDigit(unit.ch) &&
!syntax.get(RESyntax.RE_NO_BK_REFS)) {
addToken(currentToken);
- currentToken = new
RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
+ int numBegin = index - 1;
+ int numEnd = pLength;
+ for (int i = index; i < pLength; i++) {
+ if (! Character.isDigit(pattern[i])) {
+ numEnd = i;
+ break;
+ }
+ }
+ int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
+
+ currentToken = new RETokenBackRef(subIndex,num,insens);
+ index = numEnd;
}
// START OF STRING OPERATOR
@@ -999,12 +1018,12 @@
return index;
}
- private static char getEscapedChar(char[] input, int pos, int len, int
radix) {
+ private static int parseInt(char[] input, int pos, int len, int radix) {
int ret = 0;
for (int i = pos; i < pos + len; i++) {
ret = ret * radix + Character.digit(input[i], radix);
}
- return (char)ret;
+ return ret;
}
/**
@@ -1059,7 +1078,7 @@
l++;
}
if (l != expectedLength) return null;
- ce.ch = getEscapedChar(input, pos + 2, l, 16);
+ ce.ch = (char)(parseInt(input, pos + 2, l, 16));
ce.len = l + 2;
}
else {
@@ -1077,7 +1096,7 @@
}
if (l == 3 && input[pos + 2] > '3') l--;
if (l <= 0) return null;
- ce.ch = getEscapedChar(input, pos + 2, l, 8);
+ ce.ch = (char)(parseInt(input, pos + 2, l, 8));
ce.len = l + 2;
}
else {
@@ -1246,12 +1265,20 @@
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- if (firstToken == null) return next(input, mymatch);
+ int origin = mymatch.index;
+ boolean b;
+ if (firstToken == null) {
+ b = next(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
+ }
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
- return firstToken.match(input, mymatch);
+ b = firstToken.match(input, mymatch);
+ if (b) mymatch.empty = (mymatch.index == origin);
+ return b;
}
/**
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.2
diff -u -r1.2 REMatch.java
--- classpath/gnu/regexp/REMatch.java 2 Jul 2005 20:32:15 -0000 1.2
+++ classpath/gnu/regexp/REMatch.java 20 Jan 2006 16:37:03 -0000
@@ -67,6 +67,7 @@
int[] start; // start positions (relative to offset) for each (sub)exp.
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
+ boolean empty; // empty string matched
public Object clone() {
try {
@@ -88,6 +89,7 @@
index = other.index;
// need to deep clone?
next = other.next;
+ empty = other.empty;
}
REMatch(int subs, int anchor, int eflags) {
@@ -124,6 +126,7 @@
start[i] = end[i] = -1;
}
next = null; // cut off alternates
+ empty = false;
}
/**
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.2
diff -u -r1.2 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java 2 Jul 2005 20:32:15 -0000
1.2
+++ classpath/gnu/regexp/RETokenBackRef.java 20 Jan 2006 16:37:03 -0000
@@ -51,17 +51,32 @@
// should implement getMinimumLength() -- any ideas?
boolean match(CharIndexed input, REMatch mymatch) {
+ if (num >= mymatch.start.length) return false;
+ if (num >= mymatch.end.length) return false;
int b,e;
b = mymatch.start[num];
e = mymatch.end[num];
if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
+ int origin = mymatch.index;
for (int i=b; i<e; i++) {
- if (input.charAt(mymatch.index+i-b) != input.charAt(i)) {
- return false;
+ char c1 = input.charAt(mymatch.index+i-b);
+ char c2 = input.charAt(i);
+ if (c1 != c2) {
+ if (insens) {
+ if (c1 != Character.toLowerCase(c2) &&
+ c1 != Character.toUpperCase(c2)) {
+ return false;
+ }
+ }
+ else {
+ return false;
+ }
}
}
mymatch.index += e-b;
- return next(input, mymatch);
+ boolean result = next(input, mymatch);
+ if (result) mymatch.empty = (mymatch.index == origin);
+ return result;
}
void dump(StringBuffer os) {
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java 8 Jan 2006 23:06:43 -0000
1.5
+++ classpath/gnu/regexp/RETokenRepeated.java 20 Jan 2006 16:37:03 -0000
@@ -45,12 +45,14 @@
private int min,max;
private boolean stingy;
private boolean possessive;
+ private boolean alwaysEmpty; // Special case of {0}
RETokenRepeated(int subIndex, REToken token, int min, int max) {
super(subIndex);
this.token = token;
this.min = min;
this.max = max;
+ alwaysEmpty = (min == 0 && max == 0);
}
/** Sets the minimal matching mode to true. */
@@ -91,6 +93,7 @@
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
+ int origin = mymatch.index;
// number of times we've matched so far
int numRepeats = 0;
@@ -112,12 +115,17 @@
do {
// Check for stingy match for each possibility.
- if (stingy && (numRepeats >= min)) {
+ if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
REMatch result = matchRest(input, newMatch);
if (result != null) {
mymatch.assignFrom(result);
+ mymatch.empty = (mymatch.index == origin);
return true;
}
+ else {
+ // Special case of {0}. It must always match an empty string.
+ if (alwaysEmpty) return false;
+ }
}
doables = null;
@@ -153,12 +161,43 @@
positions.addElement(newMatch);
- // doables.index == lastIndex means an empty string
- // was the longest that matched this token.
- // We break here, otherwise we will fall into an endless loop.
+ // doables.index == lastIndex occurs either
+ // (1) when an empty string was the longest
+ // that matched this token.
+ // And this case occurs either
+ // (1-1) when this token is always empty,
+ // for example "()" or "(())".
+ // (1-2) when this token is not always empty
+ // but can match an empty string, for example,
+ // "a*", "(a|)".
+ // or
+ // (2) when the same string matches this token many times.
+ // For example, "acbab" itself matches "a.*b" and
+ // its substrings "acb" and "ab" also match.
if (doables.index == lastIndex) {
- if (numRepeats < min) numRepeats = min;
- break;
+ if (doables.empty) {
+ // Case (1): We break here, otherwise we will fall
+ // into an endless loop.
+ if (numRepeats < min) numRepeats = min;
+ break;
+ }
+ else {
+ // Case (2): We cannot break here because, for example,
+ // "acbacb" matches "a.*b" up to 2 times but
+ // not 3 times. So we have to check numRepeats >= min.
+ // But we do not have to go further until numRepeats == max
+ // because the more numRepeats grows, the shorter the
+ // substring matching this token becomes.
+ if (numRepeats > min) {
+ // This means the previous match was successful,
+ // and that must be the best match. This match
+ // resulted in shortening the matched substring.
+ numRepeats--;
+ positions.remove(positions.size() - 1);
+ break;
+ }
+ if (numRepeats == min) break;
+ }
}
lastIndex = doables.index;
} while (numRepeats < max);
@@ -207,6 +246,7 @@
}
if (allResults != null) {
mymatch.assignFrom(allResults); // does this get all?
+ mymatch.empty = (mymatch.index == origin);
return true;
}
// If we fall out, no matches.