[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[cp-patches] RFC: gnu.regexp: support escaped characters
From: |
Ito Kazumitsu |
Subject: |
[cp-patches] RFC: gnu.regexp: support escaped characters |
Date: |
Wed, 18 Jan 2006 00:26:30 +0900 (JST) |
This fixes bug #23212.
ChangeLog:
2006-01-17 Ito Kazumitsu <address@hidden>
Fixes bug #23212
* gnu/regexp/RE.java(initialize): Support escaped characters such as
\0123, \x1B, \u1234.
(getEscapedChar): New method.
(CharExpression): New inner class.
(getCharExpression): New method.
* gnu/regexp/RESyntax.java(RE_OCTAL_CHAR, RE_HEX_CHAR,
RE_UNICODE_CHAR): New syntax bits.
Index: classpath/gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.10
diff -u -r1.10 RE.java
--- classpath/gnu/regexp/RE.java 16 Jan 2006 13:38:25 -0000 1.10
+++ classpath/gnu/regexp/RE.java 17 Jan 2006 15:14:44 -0000
@@ -409,6 +409,8 @@
else if ((unit.ch == '[') && !(unit.bk || quot)) {
Vector options = new Vector();
boolean negative = false;
+ // FIXME: lastChar == 0 means lastChar is not set. But what if
+ // \u0000 is used as a meaningful character?
char lastChar = 0;
if (index == pLength) throw new
REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
@@ -432,6 +434,13 @@
options.addElement(new RETokenChar(subIndex,lastChar,insens));
lastChar = '-';
} else {
+ if ((ch == '\\') &&
syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ CharExpression ce = getCharExpression(pattern, index, pLength,
syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence",
REException.REG_ESCAPE, index);
+ ch = ce.ch;
+ index = index + ce.len - 1;
+ }
options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
lastChar = 0;
index++;
@@ -440,6 +449,8 @@
if (index == pLength) throw new
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
int posixID = -1;
boolean negate = false;
+ // FIXME: asciiEsc == 0 means asciiEsc is not set. But what if
+ // \u0000 is used as a meaningful character?
char asciiEsc = 0;
if (("dswDSW".indexOf(pattern[index]) != -1) &&
syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
switch (pattern[index]) {
@@ -460,19 +471,13 @@
break;
}
}
- else if ("nrt".indexOf(pattern[index]) != -1) {
- switch (pattern[index]) {
- case 'n':
- asciiEsc = '\n';
- break;
- case 't':
- asciiEsc = '\t';
- break;
- case 'r':
- asciiEsc = '\r';
- break;
- }
- }
+ else {
+ CharExpression ce = getCharExpression(pattern, index - 1,
pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence",
REException.REG_ESCAPE, index);
+ asciiEsc = ce.ch;
+ index = index - 1 + ce.len - 1;
+ }
if (lastChar != 0) options.addElement(new
RETokenChar(subIndex,lastChar,insens));
if (posixID != -1) {
@@ -806,7 +811,19 @@
else
currentToken = setRepeated(currentToken,0,1,index);
}
+
+ // OCTAL CHARACTER
+ // \0377
+ else if (unit.bk && (unit.ch == '0') &&
syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength,
syntax);
+ if (ce == null)
+ throw new REException("invalid octal character",
REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
// BACKREFERENCE OPERATOR
// \1 \2 ... \9
// not available if RE_NO_BK_REFS is set
@@ -935,6 +952,19 @@
currentToken = new RETokenEnd(subIndex,null);
}
+ // HEX CHARACTER, UNICODE CHARACTER
+ // \x1B, \u1234
+
+ else if ((unit.bk && (unit.ch == 'x') &&
syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (unit.bk && (unit.ch == 'u') &&
syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength,
syntax);
+ if (ce == null)
+ throw new REException("invalid hex character",
REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
// NON-SPECIAL CHARACTER (or escape to make literal)
// c | \* for example
@@ -969,6 +999,106 @@
return index;
}
+ private static char getEscapedChar(char[] input, int pos, int len, int
radix) {
+ int ret = 0;
+ for (int i = pos; i < pos + len; i++) {
+ ret = ret * radix + Character.digit(input[i], radix);
+ }
+ return (char)ret;
+ }
+
+ /**
+ * This class represents various expressions for a character.
+ * "a" : 'a' itself.
+ * "\0123" : Octal char 0123
+ * "\x1b" : Hex char 0x1b
+ * "\u1234" : Unicode char \u1234
+ */
+ private static class CharExpression {
+ /** character represented by this expression */
+ char ch;
+ /** String expression */
+ String expr;
+ /** length of this expression */
+ int len;
+ public String toString() { return expr; }
+ }
+
+ private CharExpression getCharExpression(char[] input, int pos, int lim,
+ RESyntax syntax) {
+ CharExpression ce = new CharExpression();
+ char c = input[pos];
+ if (c == '\\') {
+ if (pos + 1 >= lim) return null;
+ c = input[pos + 1];
+ switch(c) {
+ case 't':
+ ce.ch = '\t';
+ ce.len = 2;
+ break;
+ case 'n':
+ ce.ch = '\n';
+ ce.len = 2;
+ break;
+ case 'r':
+ ce.ch = '\r';
+ ce.len = 2;
+ break;
+ case 'x':
+ case 'u':
+ if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ int l = 0;
+ int expectedLength = (c == 'x' ? 2 : 4);
+ for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
+ if (i >= lim) break;
+ if (!((input[i] >= '0' && input[i] <= '9') ||
+ (input[i] >= 'A' && input[i] <= 'F') ||
+ (input[i] >= 'a' && input[i] <= 'f')))
+ break;
+ l++;
+ }
+ if (l != expectedLength) return null;
+ ce.ch = getEscapedChar(input, pos + 2, l, 16);
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ case '0':
+ if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ int l = 0;
+ for (int i = pos + 2; i < pos + 2 + 3; i++) {
+ if (i >= lim) break;
+ if (input[i] < '0' || input[i] > '7') break;
+ l++;
+ }
+ if (l == 3 && input[pos + 2] > '3') l--;
+ if (l <= 0) return null;
+ ce.ch = getEscapedChar(input, pos + 2, l, 8);
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ default:
+ ce.ch = c;
+ ce.len = 2;
+ break;
+ }
+ }
+ else {
+ ce.ch = input[pos];
+ ce.len = 1;
+ }
+ ce.expr = new String(input, pos, ce.len);
+ return ce;
+ }
+
/**
* Checks if the regular expression matches the input in its entirety.
*
Index: classpath/gnu/regexp/RESyntax.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RESyntax.java,v
retrieving revision 1.4
diff -u -r1.4 RESyntax.java
--- classpath/gnu/regexp/RESyntax.java 16 Jan 2006 13:38:25 -0000 1.4
+++ classpath/gnu/regexp/RESyntax.java 17 Jan 2006 15:14:44 -0000
@@ -207,7 +207,22 @@
*/
public static final int RE_EMBEDDED_FLAGS = 26;
- private static final int BIT_TOTAL = 27;
+ /**
+ * Syntax bit. Allow octal char (\0377), as in Perl5.
+ */
+ public static final int RE_OCTAL_CHAR = 27;
+
+ /**
+ * Syntax bit. Allow hex char (\x1b), as in Perl5.
+ */
+ public static final int RE_HEX_CHAR = 28;
+
+ /**
+ * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4.
+ */
+ public static final int RE_UNICODE_CHAR = 29;
+
+ private static final int BIT_TOTAL = 30;
/**
* Predefined syntax.
@@ -428,6 +443,8 @@
.set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
.set(RE_COMMENTS) // (?#)
.set(RE_EMBEDDED_FLAGS) // (?imsx-imsx)
+ .set(RE_OCTAL_CHAR) // \0377
+ .set(RE_HEX_CHAR) // \x1b
.makeFinal();
RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
@@ -437,6 +454,7 @@
RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
// XXX
.set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+
+ .set(RE_UNICODE_CHAR) // \u1234
.makeFinal();
}
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [cp-patches] RFC: gnu.regexp: support escaped characters,
Ito Kazumitsu <=