classpath-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[cp-patches] RFC: gnu.regexp: fixed bugs in stingy match of RETokenRepea


From: Ito Kazumitsu
Subject: [cp-patches] RFC: gnu.regexp: fixed bugs in stingy match of RETokenRepeated
Date: Mon, 23 Jan 2006 23:06:42 +0900 (JST)

Another fix of gnu.regexp package.

After checkin this in, I will move some more Mauve testcases of
   gnu/testlet/java/util/regex/Pattern/testdata2 
to
   gnu/testlet/java/util/regex/Pattern/testdata1

In my local environment, 436 cases in testdata1 pass.

Remained in testdata2 are:

  (1) (?=pattern) and (?!pattern)
  (2) (?<=pattern) and (?<!pattern)
  (3) (?>pattern)
  (4) (?x)
  (5) \z and \G
  (6) (a\1?) --- Back reference within the referenced group.

ChangeLog
2006-01-23  Ito Kazumitsu  <address@hidden>

        Fixes bug #24876
        * gnu/regexp/gnu/regexp/RE.java(getMatchImpl):
        Changed the way of finding the best match: in case of
        stingy match, the less repeated, the better.
        (compareRepeats): New method.
        * gnu/regexp/REMatch.java(stingy): New boolean field
        indicating that some stingy matching was done.
        (repeats): New vector for memorizing the number of repeats
        when some stingy matching was done.
        * gnu/regexp/RETokenOneOf.java: Corrected a typo in a comment.
        * gnu/regexp/RETokenRepeated.java (match): Rewrote stingy matching.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.12
diff -u -r1.12 RE.java
--- classpath/gnu/regexp/RE.java        22 Jan 2006 02:22:21 -0000      1.12
+++ classpath/gnu/regexp/RE.java        23 Jan 2006 13:43:11 -0000
@@ -1343,17 +1343,27 @@
          // Optimization: check if anchor + minimumLength > length
          if (minimumLength == 0 || input.charAt(minimumLength-1) != 
CharIndexed.OUT_OF_BOUNDS) {
              if (match(input, mymatch)) {
-                 // Find longest match of them all to observe leftmost longest
-                 REMatch longest = mymatch;
-                 while ((mymatch = mymatch.next) != null) {
-                     if (mymatch.index > longest.index) {
-                         longest = mymatch;
+                 REMatch best = mymatch;
+                 if (! best.stingy) {
+                     // Find best match of them all to observe leftmost longest
+                     while ((mymatch = mymatch.next) != null) {
+                         if (mymatch.index > best.index) {
+                             best = mymatch;
+                         }
                      }
                  }
-                 
-                 longest.end[0] = longest.index;
-                 longest.finish(input);
-                 return longest;
+                 else {
+                     // Find best match of them all to observe
+                     // leftmost least repeated
+                     while ((mymatch = mymatch.next) != null) {
+                         if (compareRepeats(mymatch, best) < 0) {
+                             best = mymatch;
+                         }
+                     }
+                 }
+                 best.end[0] = best.index;
+                 best.finish(input);
+                 return best;
              }
          }
          mymatch.clear(++anchor);
@@ -1374,6 +1384,21 @@
       return null;
   }
 
+  private int compareRepeats(REMatch m1, REMatch m2) {
+      Vector v1 = m1.repeats;
+      Vector v2 = m2.repeats;
+      int s1 = v1.size();
+      int s2 = v2.size();
+      for (int i = 0; i < s1; i++) {
+          if (i >= s2) return 1;
+          int r1 = ((Integer)(v1.elementAt(i))).intValue();
+          int r2 = ((Integer)(v2.elementAt(i))).intValue();
+         if (r1 != r2) return (r1 - r2);
+      }
+      // If numbers of repeats are equal, then the longer, the better
+      return (m2.index - m1.index);
+  }
+
   /**
    * Returns an REMatchEnumeration that can be used to iterate over the
    * matches found in the input text.
Index: classpath/gnu/regexp/REMatch.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/REMatch.java,v
retrieving revision 1.3
diff -u -r1.3 REMatch.java
--- classpath/gnu/regexp/REMatch.java   22 Jan 2006 02:22:21 -0000      1.3
+++ classpath/gnu/regexp/REMatch.java   23 Jan 2006 13:43:12 -0000
@@ -38,6 +38,7 @@
 
 package gnu.regexp;
 import java.io.Serializable;
+import java.util.Vector;
 
 /**
  * An instance of this class represents a match
@@ -68,6 +69,11 @@
     int[] end;   // end positions for the same
     REMatch next; // other possibility (to avoid having to use arrays)
     boolean empty; // empty string matched
+    boolean stingy; // stingy repeated token matched
+    Vector repeats; // number of repeats of each stingy repeated token
+    // Request For Comment: The Vector repeats contains number of repeats
+    // from left to right without regard to what the token is.
+    // I am not quite sure this is reasonable. 
 
     public Object clone() {
        try {
@@ -77,6 +83,8 @@
            copy.start = (int[]) start.clone();
            copy.end = (int[]) end.clone();
 
+           copy.repeats = (Vector) repeats.clone();
+
            return copy;
        } catch (CloneNotSupportedException e) {
            throw new Error(); // doesn't happen
@@ -90,6 +98,8 @@
        // need to deep clone?
        next = other.next;
        empty = other.empty;
+       stingy = other.stingy;
+       repeats = other.repeats;
     }
 
     REMatch(int subs, int anchor, int eflags) {
@@ -127,6 +137,8 @@
        }
        next = null; // cut off alternates
        empty = false;
+       stingy = false;
+       repeats = new Vector();
     }
     
     /**
Index: classpath/gnu/regexp/RETokenOneOf.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenOneOf.java,v
retrieving revision 1.3
diff -u -r1.3 RETokenOneOf.java
--- classpath/gnu/regexp/RETokenOneOf.java      23 Jan 2006 13:18:10 -0000      
1.3
+++ classpath/gnu/regexp/RETokenOneOf.java      23 Jan 2006 13:43:12 -0000
@@ -98,7 +98,7 @@
     REMatch last = null;
     REToken tk;
     for (int i=0; i < options.size(); i++) {
-       // In ordaer that the backtracking can work,
+       // In order that the backtracking can work,
        // each option must be chained to the next token.
        // But the chain method has some side effect, so
        // we use clones.
Index: classpath/gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.6
diff -u -r1.6 RETokenRepeated.java
--- classpath/gnu/regexp/RETokenRepeated.java   22 Jan 2006 02:22:21 -0000      
1.6
+++ classpath/gnu/regexp/RETokenRepeated.java   23 Jan 2006 13:43:12 -0000
@@ -113,20 +113,24 @@
        REMatch recurrent;
        int lastIndex = mymatch.index;
 
-       do {
-           // Check for stingy match for each possibility.
-           if ((stingy && (numRepeats >= min)) || alwaysEmpty) {
-               REMatch result = matchRest(input, newMatch);
-               if (result != null) {
-                   mymatch.assignFrom(result);
-                   mymatch.empty = (mymatch.index == origin);
-                   return true;
-               }
-               else {
-               // Special case of {0}. It must always match an empty string.
-                   if (alwaysEmpty) return false;
-               }
+       // {0} needs some special treatment.
+       if (alwaysEmpty) {
+           REMatch result = matchRest(input, newMatch);
+           if (result != null) {
+               mymatch.assignFrom(result);
+               mymatch.empty = (mymatch.index == origin);
+               return true;
+           }
+           else {
+               return false;
            }
+       }
+
+       do {
+           // We want to check something like  
+           //    if (stingy && (numRepeats >= min)
+           // and neglect the further matching.  But experience tells
+           // such neglection may cause incomplete matching.
 
            doables = null;
            doablesLast = null;
@@ -222,17 +226,34 @@
            // desired.
            indexCount = 1;
        }
+       if (stingy) {
+           posIndex = (posIndex - indexCount - 1);
+           numRepeats = min - 1;
+       }
        while (indexCount-- > 0) {
-           --posIndex;
+           if (stingy) {
+               ++posIndex;
+               ++numRepeats;
+           }
+           else --posIndex;
            newMatch = (REMatch) positions.elementAt(posIndex);
            results = matchRest(input, newMatch);
            if (results != null) {
+               if (stingy) {
+                   results.stingy = stingy;
+                   REMatch tmp = results;
+                   while (tmp != null) {
+                       tmp.repeats.add(new Integer(numRepeats));
+                       tmp = tmp.next;
+                   }
+               }
                if (allResults == null) {
                    allResults = results;
                    allResultsLast = results;
                } else {
                    // Order these from longest to shortest
                    // Start by assuming longest (more repeats)
+                   // If stingy the order is shortest to longest.
                    allResultsLast.next = results;
                }
                // Find new doablesLast

reply via email to

[Prev in Thread] Current Thread [Next in Thread]