[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Help-smalltalk] [rfc] regex rewrite
From: |
Mike Anderson |
Subject: |
Re: [Help-smalltalk] [rfc] regex rewrite |
Date: |
Thu, 02 Jun 2005 21:01:35 +0000 |
User-agent: |
Mozilla Thunderbird 0.7.3 (X11/20040803) |
Paolo Bonzini wrote:
Both Perl and Ruby return values that can be interpreted as true/false
from =~, but both of them also capture the matched expression and
sub-expressions into variables. We don't have that possibility, so I
think =~ should work harder.
That's fine with me, if we add #matched, #ifMatched:, etc. methods to
RegexResults as in your previous suggestion, that avoid the unintuitive
#isNil. To have a bit more efficiency, the RegexResults object for
not-matched is a singleton; this means that RegexResults is now at the
head of a small class hierarchy comprising MatchingRegexResults, and
FailedMatchRegexResults.
I think this works very nicely, and is elegant too.
My final concern is about modifiers. The library allows for them, and
this patch enables the case sensitivity modifier. I can't get any joy
out of /m or /s, however (I haven't tried /x).
The last patch I sent was wrong, as you must have noticed. I don't
*think* I made any foolish mistakes with this one.
Mike
diff -ur orig/examples/re.c mod/examples/re.c
--- orig/examples/re.c 2005-05-27 19:28:20.000000000 +0000
+++ mod/examples/re.c 2005-06-02 20:15:40.399785264 +0000
@@ -61,7 +61,7 @@
}
RegexCacheEntry;
-static RegexCaching lookupRegex (OOP patternOOP,
+static RegexCaching lookupRegex (OOP patternOOP, long options,
struct pre_pattern_buffer **pRegex);
static const char *compileRegex (OOP patternOOP,
struct pre_pattern_buffer *regex);
@@ -69,11 +69,11 @@
static void markRegexAsMRU (int i);
/* Functions exported to Smalltalk */
-static OOP reh_make_cacheable (OOP patternOOP);
+static OOP reh_make_cacheable (OOP patternOOP, char *options_string);
static struct pre_registers *reh_search (OOP srcOOP, OOP patternOOP,
- int from, int to);
-static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to);
+ int from, int to, char
*options_string);
+static int reh_match (OOP srcOOP, OOP patternOOP, int from, int to, char
*options_string);
static void reh_free_registers(struct pre_registers *regs);
static RegexCacheEntry cache[REGEX_CACHE_SIZE];
@@ -81,6 +81,58 @@
/* Smalltalk globals */
static OOP regexClass;
+/* Case-folding table, from Ruby's re.c: maps 'A'..'Z' to 'a'..'z' and every other byte to itself (ASCII only). */
+#if 'a' == 97 /* it's ascii */
+static const char casetable[] = {
+ '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
+ '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
+ '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
+ '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
+ /* ' ' '!' '"' '#' '$' '%' '&' ''' */
+ '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
+ /* '(' ')' '*' '+' ',' '-' '.' '/' */
+ '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
+ /* '0' '1' '2' '3' '4' '5' '6' '7' */
+ '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
+ /* '8' '9' ':' ';' '<' '=' '>' '?' */
+ '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
+ /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */
+ '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+ /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
+ '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+ /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */
+ '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+ /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
+ '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
+ /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */
+ '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
+ /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
+ '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
+ /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */
+ '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
+ /* 'x' 'y' 'z' '{' '|' '}' '~' */
+ '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
+ '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
+ '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
+ '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
+ '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
+ '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
+ '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
+ '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
+ '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
+ '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
+ '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
+ '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
+ '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
+ '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
+ '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
+ '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
+ '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
+};
+#else
+ # error >>> "You lose. You will need a translation table for your
character set." <<<
+#endif
+
/* Allocate a buffer to be passed to the regular expression matcher */
struct pre_pattern_buffer *
allocateNewRegex (void)
@@ -136,7 +188,7 @@
* caller will also have to free the buffer pointed to by pRegex.
*/
RegexCaching
-lookupRegex (OOP patternOOP, struct pre_pattern_buffer **pRegex)
+lookupRegex (OOP patternOOP, long options, struct pre_pattern_buffer **pRegex)
{
int i;
RegexCaching result;
@@ -149,7 +201,7 @@
/* Search for the Regex object in the cache */
for (i = 0; i < REGEX_CACHE_SIZE; i++)
- if (cache[i].patternOOP == patternOOP)
+ if ((cache[i].patternOOP == patternOOP) && (cache[i].regex->options ==
options))
break;
if (i < REGEX_CACHE_SIZE)
@@ -170,14 +222,39 @@
}
/* Mark the object as most recently used */
- if (!cache[i].regex)
- cache[i].regex = allocateNewRegex ();
+ if (!cache[i].regex)
+ {
+ cache[i].regex = allocateNewRegex ();
+ cache[i].regex -> options = options;
+ }
markRegexAsMRU (i);
*pRegex = cache[0].regex;
return result;
}
+/* Translate a string of option letters into a bitmask of PRE_OPTION_*
+ * flags.  Recognized letters are 'i' (PRE_OPTION_IGNORECASE),
+ * 'x' (PRE_OPTION_EXTENDED), 'm' (PRE_OPTION_MULTILINE),
+ * 'p' (PRE_OPTION_POSIXLINE) and 's' (PRE_OPTION_SINGLELINE); any other
+ * character is ignored.  A NULL or empty string yields 0.
+ */
+long
+translate_options_string(char *options_string)
+{
+  const char *p;
+  long result = 0;
+
+  if (!options_string)
+    return result;
+
+  for (p = options_string; *p != 0; p++)
+    {
+      /* Every case ends in break: without it control falls through, so
+       * a single 'i' would also switch on all of the flags below it.
+       */
+      switch (*p)
+        {
+        case 'i':
+          result |= PRE_OPTION_IGNORECASE;
+          break;
+        case 'x':
+          result |= PRE_OPTION_EXTENDED;
+          break;
+        case 'm':
+          result |= PRE_OPTION_MULTILINE;
+          break;
+        case 'p':
+          result |= PRE_OPTION_POSIXLINE;
+          break;
+        case 's':
+          result |= PRE_OPTION_SINGLELINE;
+          break;
+        default:
+          /* Unknown option letters are silently ignored. */
+          break;
+        }
+    }
+  return result;
+}
+
/* Create a Regex object. We look for one that points to the same string
* in the cache (so that we can optimize a loop that repeatedly calls
* asRegex; if none is found, we create one ex-novo.
@@ -185,7 +262,7 @@
* are read-only so that we can support this kind of "interning" them.
*/
OOP
-reh_make_cacheable (OOP patternOOP)
+reh_make_cacheable (OOP patternOOP, char *options_string)
{
OOP regexOOP;
const char *pattern;
@@ -224,7 +301,7 @@
/* Put it in the cache (we must compile it to check that it
* is well-formed).
*/
- lookupRegex (regexOOP, &compiled);
+ lookupRegex (regexOOP, translate_options_string(options_string), &compiled);
if (compileRegex (patternOOP, compiled) != NULL)
return vmProxy->nilOOP;
else
@@ -233,7 +310,7 @@
/* Search helper function */
struct pre_registers *
-reh_search (OOP srcOOP, OOP patternOOP, int from, int to)
+reh_search (OOP srcOOP, OOP patternOOP, int from, int to, char *options_string)
{
int res = 0;
const char *src;
@@ -241,9 +318,9 @@
struct pre_registers *regs;
RegexCaching caching;
- caching = lookupRegex (patternOOP, ®ex);
+ caching = lookupRegex (patternOOP, translate_options_string(options_string),
®ex);
if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
- return NULL;
+ return NULL;
/* now search */
src = &STRING_OOP_AT (OOP_TO_OBJ (srcOOP), 1);
@@ -265,14 +342,14 @@
/* Match helper function */
int
-reh_match (OOP srcOOP, OOP patternOOP, int from, int to)
+reh_match (OOP srcOOP, OOP patternOOP, int from, int to, char *options_string)
{
int res = 0;
const char *src;
struct pre_pattern_buffer *regex;
RegexCaching caching;
-
- caching = lookupRegex (patternOOP, ®ex);
+
+ caching = lookupRegex (patternOOP, translate_options_string(options_string),
®ex);
if (caching != REGEX_CACHE_HIT && compileRegex (patternOOP, regex) != NULL)
return -100;
@@ -289,6 +366,7 @@
void
gst_initModule (VMProxy * proxy)
{
+ pre_set_casetable(casetable);
vmProxy = proxy;
vmProxy->defineCFunc ("reh_search", reh_search);
vmProxy->defineCFunc ("reh_match", reh_match);
diff -ur orig/examples/regex.st mod/examples/regex.st
--- orig/examples/regex.st 2005-06-01 08:46:37.000000000 +0000
+++ mod/examples/regex.st 2005-06-02 20:05:36.411605424 +0000
@@ -223,23 +223,23 @@
Regex class
defineCFunc: 'reh_make_cacheable'
- withSelectorArgs: 'fromString: aString'
+ withSelectorArgs: 'fromString: aPatternString options: aOptionsString'
returning: #smalltalk
- args: #(#smalltalk).
+ args: #(#smalltalk #string).
!
String
defineCFunc: 'reh_match'
- withSelectorArgs: 'lengthOfRegexMatch: pattern from: from to: to'
+ withSelectorArgs: 'lengthOfRegexMatch: pattern from: from to: to
options: aOptionsString'
returning: #int
- args: #(#selfSmalltalk #smalltalk #int #int)
+ args: #(#selfSmalltalk #smalltalk #int #int #string)
!
String
defineCFunc: 'reh_search'
- withSelectorArgs: 'searchRegexInternal: pattern from: from to: to'
+ withSelectorArgs: 'searchRegexInternal: pattern from: from to: to
options: aOptionsString'
returning: CRegexRegisters type
- args: #(#selfSmalltalk #smalltalk #int #int)
+ args: #(#selfSmalltalk #smalltalk #int #int #string)
!
!
@@ -339,12 +339,21 @@
aStream nextPut: each.
].
aStream nextPut: $/.
-! !
+!
+!
"--------------------------------------------------------------------------"
!String methodsFor: 'regex'!
+searchRegexInternal: pattern from: from to: to
+ "Same as #searchRegexInternal:from:to:options:, passing an empty options string."
+ ^self searchRegexInternal: pattern from: from to: to options: ''
+!
+
+lengthOfRegexMatch: pattern from: from to: to
+ "Same as #lengthOfRegexMatch:from:to:options:, passing an empty options string."
+ ^self lengthOfRegexMatch: pattern from: from to: to options: ''
+!
+
asRegex
"Answer the receiver, converted to a Regex object."
^Regex fromString: self
@@ -381,6 +390,20 @@
]
!
+searchRegex: pattern options: options
+ "Search the receiver, from index 1 to its size, for pattern using the given options string. Answer FailedMatchRegexResults notMatched when the registers report matchBeg = -1, otherwise a MatchingRegexResults initialized from the registers with the receiver as subject."
+ | regs |
+ regs := self searchRegexInternal: pattern from: 1 to: self size options:
options.
+ ^regs matchBeg = -1
+ ifTrue: [
+ regs free.
+ FailedMatchRegexResults notMatched
+ ]
+ ifFalse: [
+ "The registers are freed even if building the result fails."
+ [ MatchingRegexResults new initialize: regs subject: self ]
+ ensure: [ regs free ]
+ ]
+!
+
searchRegex: pattern startingAt: anIndex
| regs |
regs := self searchRegexInternal: pattern from: anIndex to: self size.
@@ -480,6 +503,14 @@
^(self lengthOfRegexMatch: pattern from: 1 to: self size) = self size
!
+matchRegex: pattern ignoreCase: aBoolean
+ "Answer whether the length of the match of pattern against the receiver (from index 1 to its size) equals the receiver's size. The options string is 'i' when aBoolean is true and empty otherwise."
+ ^(self
+ lengthOfRegexMatch: pattern
+ from: 1
+ to: self size
+ options: (aBoolean ifTrue: [ 'i' ] ifFalse: [ '' ])) = self size
+!
+
matchRegex: pattern startingAt: idx
^(self lengthOfRegexMatch: pattern from: idx to: self size) > 0
!
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [Help-smalltalk] [rfc] regex rewrite,
Mike Anderson <=