[cp-patches] RFC: gnu.regexp: support escaped characters

classpath-patches

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[cp-patches] RFC: gnu.regexp: support escaped characters

From:	Ito Kazumitsu
Subject:	[cp-patches] RFC: gnu.regexp: support escaped characters
Date:	Wed, 18 Jan 2006 00:26:30 +0900 (JST)

This fixes bug #23212.

ChangeLog:
2006-01-17  Ito Kazumitsu  <address@hidden>

        Fixes bug #23212
        * gnu/regexp/RE.java(initialize): Support escaped characters such as
        \0123, \x1B, \u1234.
        (getEscapedChar): New method.
        (CharExpression): New inner class.
        (getCharExpression): New method.
        * gnu/regexp/RESyntax.java(RE_OCTAL_CHAR, RE_HEX_CHAR,
        RE_UNICODE_CHAR): New syntax bits.

Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.10
diff -u -r1.10 RE.java
--- classpath/gnu/regexp/RE.java        16 Jan 2006 13:38:25 -0000      1.10
+++ classpath/gnu/regexp/RE.java        17 Jan 2006 15:14:44 -0000
@@ -409,6 +409,8 @@
       else if ((unit.ch == '[') && !(unit.bk || quot)) {
        Vector options = new Vector();
        boolean negative = false;
+       // FIXME: lastChar == 0 means lastChar is not set. But what if
+       // \u0000 is used as a meaningful character?
        char lastChar = 0;
        if (index == pLength) throw new 
REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
        
@@ -432,6 +434,13 @@
              options.addElement(new RETokenChar(subIndex,lastChar,insens));
              lastChar = '-';
            } else {
+             if ((ch == '\\') && 
syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+               CharExpression ce = getCharExpression(pattern, index, pLength, 
syntax);
+               if (ce == null)
+                 throw new REException("invalid escape sequence", 
REException.REG_ESCAPE, index);
+               ch = ce.ch;
+               index = index + ce.len - 1;
+             }
              options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
              lastChar = 0;
              index++;
@@ -440,6 +449,8 @@
             if (index == pLength) throw new 
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
            int posixID = -1;
            boolean negate = false;
+           // FIXME: asciiEsc == 0 means asciiEsc is not set. But what if
+           // \u0000 is used as a meaningful character?
             char asciiEsc = 0;
            if (("dswDSW".indexOf(pattern[index]) != -1) && 
syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
              switch (pattern[index]) {
@@ -460,19 +471,13 @@
                break;
              }
            }
-            else if ("nrt".indexOf(pattern[index]) != -1) {
-              switch (pattern[index]) {
-                case 'n':
-                  asciiEsc = '\n';
-                  break;
-                case 't':
-                  asciiEsc = '\t';
-                  break;
-                case 'r':
-                  asciiEsc = '\r';
-                  break;
-              }
-            }
+           else {
+             CharExpression ce = getCharExpression(pattern, index - 1, 
pLength, syntax);
+             if (ce == null)
+               throw new REException("invalid escape sequence", 
REException.REG_ESCAPE, index);
+             asciiEsc = ce.ch;
+             index = index - 1 + ce.len - 1;
+           }
            if (lastChar != 0) options.addElement(new 
RETokenChar(subIndex,lastChar,insens));
            
            if (posixID != -1) {
@@ -806,7 +811,19 @@
        else
          currentToken = setRepeated(currentToken,0,1,index);
       }
+
+      // OCTAL CHARACTER
+      //  \0377
        
+      else if (unit.bk && (unit.ch == '0') && 
syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+       CharExpression ce = getCharExpression(pattern, index - 2, pLength, 
syntax);
+       if (ce == null)
+         throw new REException("invalid octal character", 
REException.REG_ESCAPE, index);
+       index = index - 2 + ce.len;
+       addToken(currentToken);
+       currentToken = new RETokenChar(subIndex,ce.ch,insens);
+      }
+
       // BACKREFERENCE OPERATOR
       //  \1 \2 ... \9
       // not available if RE_NO_BK_REFS is set
@@ -935,6 +952,19 @@
          currentToken = new RETokenEnd(subIndex,null);
        }
 
+        // HEX CHARACTER, UNICODE CHARACTER
+        //  \x1B, \u1234
+       
+       else if ((unit.bk && (unit.ch == 'x') && 
syntax.get(RESyntax.RE_HEX_CHAR)) ||
+                (unit.bk && (unit.ch == 'u') && 
syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+         CharExpression ce = getCharExpression(pattern, index - 2, pLength, 
syntax);
+         if (ce == null)
+           throw new REException("invalid hex character", 
REException.REG_ESCAPE, index);
+         index = index - 2 + ce.len;
+         addToken(currentToken);
+         currentToken = new RETokenChar(subIndex,ce.ch,insens);
+       }
+
        // NON-SPECIAL CHARACTER (or escape to make literal)
         //  c | \* for example
 
@@ -969,6 +999,106 @@
     return index;
   }
 
+  private static char getEscapedChar(char[] input, int pos, int len, int 
radix) { // parses input[pos .. pos+len-1] as one number written in the given radix
+    int ret = 0; // value accumulated so far
+    for (int i = pos; i < pos + len; i++) {
+       ret = ret * radix + Character.digit(input[i], radix); // digit() yields -1 for a non-digit; not checked here
+    }
+    return (char)ret; // narrowing cast keeps only the low 16 bits
+  }
+
+  /**
+   * This class represents various expressions for a single character.
+   * "a"      : 'a' itself.
+   * "\0123"  : Octal char 0123
+   * "\x1b"   : Hex char 0x1b
+   * "\u1234" : Unicode char \u1234
+   */
+  private static class CharExpression {
+    /** character represented by this expression */
+    char ch;
+    /** source text of this expression, as copied from the pattern */
+    String expr;
+    /** number of pattern characters this expression occupies */
+    int len;
+    public String toString() { return expr; } // the source text, not the decoded character
+  }
+
+  private CharExpression getCharExpression(char[] input, int pos, int lim,
+        RESyntax syntax) {
+    CharExpression ce = new CharExpression();
+    char c = input[pos];
+    if (c == '\\') {
+      if (pos + 1 >= lim) return null;
+      c = input[pos + 1];
+      switch(c) {
+      case 't':
+        ce.ch = '\t';
+        ce.len = 2;
+        break;
+      case 'n':
+        ce.ch = '\n';
+        ce.len = 2;
+        break;
+      case 'r':
+        ce.ch = '\r';
+        ce.len = 2;
+        break;
+      case 'x':
+      case 'u':
+        if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+            (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+          int l = 0;
+          int expectedLength = (c == 'x' ? 2 : 4);
+          for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
+            if (i >= lim) break;
+            if (!((input[i] >= '0' && input[i] <= '9') ||
+                  (input[i] >= 'A' && input[i] <= 'F') ||
+                  (input[i] >= 'a' && input[i] <= 'f')))
+                break;
+           l++;
+          }
+          if (l != expectedLength) return null;
+          ce.ch = getEscapedChar(input, pos + 2, l, 16);
+         ce.len = l + 2;
+        }
+        else {
+          ce.ch = c;
+          ce.len = 2;
+        }
+        break;
+      case '0':
+        if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+          int l = 0;
+          for (int i = pos + 2; i < pos + 2 + 3; i++) {
+            if (i >= lim) break;
+           if (input[i] < '0' || input[i] > '7') break;
+            l++;
+          }
+          if (l == 3 && input[pos + 2] > '3') l--;
+          if (l <= 0) return null;
+          ce.ch = getEscapedChar(input, pos + 2, l, 8);
+          ce.len = l + 2;
+        }
+        else {
+          ce.ch = c;
+          ce.len = 2;
+        }
+        break;
+      default:
+        ce.ch = c;
+        ce.len = 2;
+        break;
+      }
+    }
+    else {
+      ce.ch = input[pos];
+      ce.len = 1;
+    }
+    ce.expr = new String(input, pos, ce.len);
+    return ce;
+  }
+
   /**
    * Checks if the regular expression matches the input in its entirety.
    *
Index: classpath/gnu/regexp/RESyntax.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RESyntax.java,v
retrieving revision 1.4
diff -u -r1.4 RESyntax.java
--- classpath/gnu/regexp/RESyntax.java  16 Jan 2006 13:38:25 -0000      1.4
+++ classpath/gnu/regexp/RESyntax.java  17 Jan 2006 15:14:44 -0000
@@ -207,7 +207,22 @@
    */
   public static final int RE_EMBEDDED_FLAGS            = 26;
 
-  private static final int BIT_TOTAL                   = 27;
+  /**
+   * Syntax bit.  Allow octal char (\0377), as in Perl5.
+   */
+  public static final int RE_OCTAL_CHAR                = 27;
+
+  /**
+   * Syntax bit.  Allow hex char (\x1b), as in Perl5.
+   */
+  public static final int RE_HEX_CHAR                  = 28;
+
+  /**
+   * Syntax bit.  Allow Unicode char (\u1234), as in Java 1.4.
+   */
+  public static final int RE_UNICODE_CHAR              = 29;
+
+  private static final int BIT_TOTAL                   = 30;
 
   /**
    * Predefined syntax.
@@ -428,6 +443,8 @@
          .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
          .set(RE_COMMENTS)              // (?#)
          .set(RE_EMBEDDED_FLAGS)         // (?imsx-imsx)
+         .set(RE_OCTAL_CHAR)             // \0377
+         .set(RE_HEX_CHAR)               // \x1b
          .makeFinal();
       
       RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
@@ -437,6 +454,7 @@
       RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
          // XXX
          .set(RE_POSSESSIVE_OPS)         // *+,?+,++,{}+
+         .set(RE_UNICODE_CHAR)           // \u1234
          .makeFinal();
   }

[Prev in Thread]

Current Thread

[Next in Thread]

[cp-patches] RFC: gnu.regexp: support escaped characters, Ito Kazumitsu <=

Prev by Date: [cp-patches] FYI: gnu/regexp/RETokenChar.java fixed
Next by Date: [cp-patches] FYI: malloc and free targetizized
Previous by thread: [cp-patches] FYI: target_generic_network.c fixlet
Next by thread: [cp-patches] FYI: malloc and free targetizized
Index(es):
- Date
- Thread