classpath-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated


From: Ito Kazumitsu
Subject: Re: [cp-patches] RFC: gnu.regexp: fixed bugs in RETokenRepeated
Date: Sat, 21 Jan 2006 01:56:31 +0900 (JST)

From: Ito Kazumitsu <address@hidden>
Date: Thu, 19 Jan 2006 23:22:52 +0900 (JST)

> And this is my fix.

Slightly improved. And this is supposed to fix the bug #25837

ChangeLog
2006-01-20  Ito Kazumitsu  <address@hidden>

        Fixes bug #25837
        * gnu/regexp/REMatch.java(empty): New boolean indicating
        an empty string matched.
        * gnu/regexp/RE.java(match): Sets empty flag when an empty
        string matched.
        (initialize): Support back reference \10, \11, and so on.
        (parseInt): renamed from getEscapedChar and returns int.
        * gnu/regexp/RETokenRepeated.java(match): Sets empty flag
        when an empty string matched. Fixed a bug of the case where
        an empty string matched. Added special handling of {0}.
        * gnu/regexp/RETokenBackRef.java(match): Sets empty flag
        when an empty string matched. Fixed the case insensitive matching.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.11
diff -u -r1.11 RE.java
--- classpath/gnu/regexp/RE.java        19 Jan 2006 13:45:51 -0000      1.11
+++ classpath/gnu/regexp/RE.java        20 Jan 2006 16:37:03 -0000
@@ -825,12 +825,31 @@
       }
 
       // BACKREFERENCE OPERATOR
-      //  \1 \2 ... \9
+      //  \1 \2 ... \9 and \10 \11 \12 ...
       // not available if RE_NO_BK_REFS is set
+      // Perl recognizes \10, \11, and so on only if enough number of
+      // parentheses have opened before it, otherwise they are treated
+      // as aliases of \010, \011, ... (octal characters).  In case of
+      // Sun's JDK, octal character expression must always begin with \0.
+      // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
+      // JDK treats \2 as a back reference to the 2nd group because
+      // there are only two groups. But in our poor implementation,
+      // we cannot help but treat \29 as a back reference to the 29th group.
 
       else if (unit.bk && Character.isDigit(unit.ch) && 
!syntax.get(RESyntax.RE_NO_BK_REFS)) {
        addToken(currentToken);
-       currentToken = new 
RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
+       int numBegin = index - 1;
+       int numEnd = pLength;
+       for (int i = index; i < pLength; i++) {
+           if (! Character.isDigit(pattern[i])) {
+               numEnd = i;
+               break;
+           }
+       }
+       int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
+
+       currentToken = new RETokenBackRef(subIndex,num,insens);
+       index = numEnd;
       }
 
       // START OF STRING OPERATOR
@@ -999,12 +1018,12 @@
     return index;
   }
 
-  private static char getEscapedChar(char[] input, int pos, int len, int 
radix) {
+  private static int parseInt(char[] input, int pos, int len, int radix) {
     int ret = 0;
     for (int i = pos; i < pos + len; i++) {
        ret = ret * radix + Character.digit(input[i], radix);
     }
-    return (char)ret;
+    return ret;
   }
 
   /**
@@ -1059,7 +1078,7 @@
            l++;
           }
           if (l != expectedLength) return null;
-          ce.ch = getEscapedChar(input, pos + 2, l, 16);
+          ce.ch = (char)(parseInt(input, pos + 2, l, 16));
          ce.len = l + 2;
         }
         else {
@@ -1077,7 +1096,7 @@
           }
           if (l == 3 && input[pos + 2] > '3') l--;
           if (l <= 0) return null;
-          ce.ch = getEscapedChar(input, pos + 2, l, 8);
+          ce.ch = (char)(parseInt(input, pos + 2, l, 8));
           ce.len = l + 2;
         }
         else {
@@ -1246,12 +1265,20 @@
   
     /* Implements abstract method REToken.match() */
     boolean match(CharIndexed input, REMatch mymatch) { 
-       if (firstToken == null) return next(input, mymatch);
+       int origin = mymatch.index;
+       boolean b;
+       if (firstToken == null) {
+           b = next(input, mymatch);
+           if (b) mymatch.empty = (mymatch.index == origin);
+           return b;
+       }
 
        // Note the start of this subexpression
        mymatch.start[subIndex] = mymatch.index;
 
-       return firstToken.match(input, mymatch);
+       b = firstToken.match(input, mymatch);
+       if (b) mymatch.empty = (mymatch.index == origin);
+       return b;
     }
   
   /**
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.2
diff -u -r1.2 REMatch.java
--- classpath/gnu/regexp/REMatch.java   2 Jul 2005 20:32:15 -0000       1.2
+++ classpath/gnu/regexp/REMatch.java   20 Jan 2006 16:37:03 -0000
@@ -67,6 +67,7 @@
     int[] start; // start positions (relative to offset) for each (sub)exp.
     int[] end;   // end positions for the same
     REMatch next; // other possibility (to avoid having to use arrays)
+    boolean empty; // empty string matched
 
     public Object clone() {
        try {
@@ -88,6 +89,7 @@
        index = other.index;
        // need to deep clone?
        next = other.next;
+       empty = other.empty;
     }
 
     REMatch(int subs, int anchor, int eflags) {
@@ -124,6 +126,7 @@
            start[i] = end[i] = -1;
        }
        next = null; // cut off alternates
+       empty = false;
     }
     
     /**
Index: classpath/gnu/regexp/RETokenBackRef.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenBackRef.java,v
retrieving revision 1.2
diff -u -r1.2 RETokenBackRef.java
--- classpath/gnu/regexp/RETokenBackRef.java    2 Jul 2005 20:32:15 -0000       
1.2
+++ classpath/gnu/regexp/RETokenBackRef.java    20 Jan 2006 16:37:03 -0000
@@ -51,17 +51,32 @@
   // should implement getMinimumLength() -- any ideas?
 
     boolean match(CharIndexed input, REMatch mymatch) {
+       if (num >= mymatch.start.length) return false;
+       if (num >= mymatch.end.length) return false;
        int b,e;
        b = mymatch.start[num];
        e = mymatch.end[num];
        if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
+       int origin = mymatch.index;
        for (int i=b; i<e; i++) {
-           if (input.charAt(mymatch.index+i-b) != input.charAt(i)) {
-               return false;
+           char c1 = input.charAt(mymatch.index+i-b);
+           char c2 = input.charAt(i);
+           if (c1 != c2) {
+               if (insens) {
+                   if (c1 != Character.toLowerCase(c2) &&
+                       c1 != Character.toUpperCase(c2)) {
+                       return false;
+                   }
+               }
+               else {
+                   return false;
+               }
            }
        }
        mymatch.index += e-b;
-       return next(input, mymatch);
+       boolean result = next(input, mymatch);
+       if (result) mymatch.empty = (mymatch.index == origin);
+       return result;
     }
     
     void dump(StringBuffer os) {
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.5
diff -u -r1.5 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   8 Jan 2006 23:06:43 -0000       
1.5
+++ classpath/gnu/regexp/RETokenRepeated.java   20 Jan 2006 16:37:03 -0000
@@ -45,12 +45,14 @@
     private int min,max;
     private boolean stingy;
     private boolean possessive;
+    private boolean alwaysEmpty; // Special case of {0}
     
     RETokenRepeated(int subIndex, REToken token, int min, int max) {
        super(subIndex);
        this.token = token;
        this.min = min;
        this.max = max;
+       alwaysEmpty = (min == 0 && max == 0);
     }
 
     /** Sets the minimal matching mode to true. */
@@ -91,6 +93,7 @@
     // the subexpression back-reference operator allow that?
 
     boolean match(CharIndexed input, REMatch mymatch) {
+       int origin = mymatch.index;
        // number of times we've matched so far
        int numRepeats = 0; 
        
@@ -112,12 +115,17 @@
 
        do {
            // Check for stingy match for each possibility.
-           if (stingy && (numRepeats >= min)) {
+           if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
                REMatch result = matchRest(input, newMatch);
                if (result != null) {
                    mymatch.assignFrom(result);
+                   mymatch.empty = (mymatch.index == origin);
                    return true;
                }
+               else {
+               // Special case of {0}. It must always match an empty string.
+                   if (alwaysEmpty) return false;
+               }
            }
 
            doables = null;
@@ -153,12 +161,43 @@
            
            positions.addElement(newMatch);
 
-           // doables.index == lastIndex means an empty string
-           // was the longest that matched this token.
-           // We break here, otherwise we will fall into an endless loop.
+           // doables.index == lastIndex occurs either
+           //   (1) when an empty string was the longest
+           //       that matched this token.
+           //       And this case occurs either
+           //         (1-1) when this token is always empty,
+           //               for example "()" or "(())".
+           //         (1-2) when this token is not always empty
+           //               but can match an empty string, for example,
+           //               "a*", "(a|)".
+           // or
+           //   (2) when the same string matches this token many times.
+           //       For example, "acbab" itself matches "a.*b" and
+           //       its substrings "acb" and "ab" also match.
            if (doables.index == lastIndex) {
-               if (numRepeats < min) numRepeats = min;
-               break;
+               if (doables.empty) {
+                   // Case (1): We break here, otherwise we will fall
+                   //          into an endless loop.
+                   if (numRepeats < min) numRepeats = min;
+                   break;
+               }
+               else {
+                   // Case (2): We cannot break here because, for example,
+                   // "acbacb" matches "a.*b" up to 2 times but
+                   // not 3 times.  So we have to check numRepeats >= min.
+                   // But we do not have to go further until numRepeats == max
+                   // because the more numRepeats grows, the shorter the
+                   // substring matching this token becomes. 
+                   if (numRepeats > min) {
+                       // This means the previous match was successful,
+                       // and that must be the best match.  This match
+                       // resulted in shortening the matched substring.
+                       numRepeats--;
+                       positions.remove(positions.size() - 1);
+                       break;
+                   }
+                   if (numRepeats == min) break;
+               }
            }           
            lastIndex = doables.index;
        } while (numRepeats < max);
@@ -207,6 +246,7 @@
        }
        if (allResults != null) {
            mymatch.assignFrom(allResults); // does this get all?
+           mymatch.empty = (mymatch.index == origin);
            return true;
        }
        // If we fall out, no matches.

reply via email to

[Prev in Thread] Current Thread [Next in Thread]