[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Chicken-users] Inconsistencies with regexp package?
From: |
Felix Winkelmann |
Subject: |
Re: [Chicken-users] Inconsistencies with regexp package? |
Date: |
Wed, 12 May 2004 08:38:33 +0200 |
User-agent: |
Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.6) Gecko/20040113 |
address@hidden wrote:
I understood from the reading the documentation of the regexp unit that the
either of the following calls should work
(string-split-fields (regexp "\\d+") "hello 1 cruel 23 world45")
(string-split-fields "\\d+" "hello 1 cruel 23 world45")
However only the second version does, while the first signals and error:
Error: (string-split-fields) bad argument type - not a string: (:sub (:or (:seq
(:between #f 1 #f :digit))))
Ah, you are using pregexp, so I assume this is on Windows, right?
It's a bug. See the attachment.
on the other hand the string-match behaves differently with
(string-match-positions "\\d+" "hello 1 cruel 23 world45") returning #f
while
(string-match-positions (regexp "\\d+") "hello 1 cruel 23 world45") returns
((6 7))
Another bug. Pregexp uses a s-expr representation for regular expressions,
and the procedures from the regex unit have to handle strings or s-exprs
transparently.
Is this just incomplete documentation or an implementation problem?
The latter. Please find a new version of pregexp.scm attached. Just drop
it into whatever directory you used to build the system and run make
again.
Sorry, if this caused any inconveniences.
cheers,
felix
;;;; pregexp.scm - Portable regular expression library using Dorai Sitaram's
PREGEXP package
;Portable regular expressions for Scheme
;Dorai Sitaram
;http://www.ccs.neu.edu/~dorai
;ds26 AT gte.com
;Oct 2, 1999
; (With additions by Felix Winkelmann)
(cond-expand [(not chicken-compile-shared) (declare (unit regex))] [else])
(declare
(fixnum)
(disable-interrupts)
(usual-integrations)
(hide pregexp-reverse! pregexp-read-pattern pregexp-read-branch
pregexp-read-piece pregexp-read-escaped-number pregexp-read-escaped-char
pregexp-read-posix-char-class pregexp-read-cluster-type
pregexp-read-subpattern
pregexp-wrap-quantifier-if-any pregexp-read-nums
pregexp-invert-char-list
pregexp-read-char-list pregexp-string-match pregexp-char-word?
pregexp-at-word-boundary? pregexp-check-if-in-char-class?
pregexp-list-ref
pregexp-match-positions-aux pregexp-replace-aux
pregexp-match pregexp-match-positions pregexp-split pregexp-replace
pregexp-replace*) )
(cond-expand
[paranoia]
[else
(declare
(no-bound-checks)
(bound-to-procedure
##sys#check-string ##sys#check-exact ##sys#make-pointer ##sys#cons
##sys#size ##sys#slot) ) ] )
(cond-expand
[unsafe
(eval-when (compile)
(define-macro (##sys#check-structure . _) '(##core#undefined))
(define-macro (##sys#check-range . _) '(##core#undefined))
(define-macro (##sys#check-pair . _) '(##core#undefined))
(define-macro (##sys#check-list . _) '(##core#undefined))
(define-macro (##sys#check-symbol . _) '(##core#undefined))
(define-macro (##sys#check-string . _) '(##core#undefined))
(define-macro (##sys#check-char . _) '(##core#undefined))
(define-macro (##sys#check-exact . _) '(##core#undefined))
(define-macro (##sys#check-port . _) '(##core#undefined))
(define-macro (##sys#check-number . _) '(##core#undefined))
(define-macro (##sys#check-byte-vector . _) '(##core#undefined)) ) ]
[else] )
(register-feature! 'regex 'pregexp)
(define-constant *pregexp-comment-char* #\;)
(define-constant *pregexp-return-char* #\return)
(define-constant *pregexp-tab-char* #\tab)
(define *pregexp-space-sensitive?* #t)
(define pregexp-reverse!
;the useful reverse! isn't R5RS
(lambda (s)
(let loop ((s s) (r '()))
(if (null? s) r
(let ((d (cdr s)))
(set-cdr! s r)
(loop d s))))))
(define (pregexp-error . args)
(apply ##sys#error "pregexp-error: " args) )
(define pregexp-read-pattern
(lambda (s i n)
(if (>= i n)
(list
(list ':or (list ':seq)) i)
(let loop ((branches '()) (i i))
(if (or (>= i n)
(char=? (string-ref s i) #\)))
(list (cons ':or (pregexp-reverse! branches)) i)
(let ((vv (pregexp-read-branch
s
(if (char=? (string-ref s i) #\|) (+ i 1) i) n)))
(loop (cons (car vv) branches) (cadr vv))))))))
(define pregexp-read-branch
(lambda (s i n)
(let loop ((pieces '()) (i i))
(cond ((>= i n)
(list (cons ':seq (pregexp-reverse! pieces)) i))
((let ((c (string-ref s i)))
(or (char=? c #\|)
(char=? c #\))))
(list (cons ':seq (pregexp-reverse! pieces)) i))
(else (let ((vv (pregexp-read-piece s i n)))
(loop (cons (car vv) pieces) (cadr vv))))))))
(define pregexp-read-piece
(lambda (s i n)
(let ((c (string-ref s i)))
(case c
((#\^) (list ':bos (+ i 1)))
((#\$) (list ':eos (+ i 1)))
((#\.) (pregexp-wrap-quantifier-if-any
(list ':any (+ i 1)) s n))
((#\[) (pregexp-wrap-quantifier-if-any
(case (string-ref s (+ i 1))
((#\^)
(let ((vv (pregexp-read-char-list s (+ i 2) n)))
(list (list ':neg-char (car vv)) (cadr vv))))
(else (pregexp-read-char-list s (+ i 1) n)))
s n))
((#\()
(pregexp-wrap-quantifier-if-any
(pregexp-read-subpattern s (+ i 1) n) s n))
((#\\)
(pregexp-wrap-quantifier-if-any
(cond ((pregexp-read-escaped-number s i n) =>
(lambda (num-i)
(list (list ':backref (car num-i)) (cadr num-i))))
((pregexp-read-escaped-char s i n) =>
(lambda (char-i)
(list (car char-i) (cadr char-i))))
(else (pregexp-error 'pregexp-read-piece "backslash")))
s n))
(else
(if (or *pregexp-space-sensitive?*
(and (not (char-whitespace? c))
(not (char=? c *pregexp-comment-char*))))
(pregexp-wrap-quantifier-if-any
(list c (+ i 1)) s n)
(let loop ((i i) (in-comment? #f))
(if (>= i n) (list ':empty i)
(let ((c (string-ref s i)))
(cond (in-comment?
(loop (+ i 1)
(not (char=? c #\newline))))
((char-whitespace? c)
(loop (+ i 1) #f))
((char=? c *pregexp-comment-char*)
(loop (+ i 1) #t))
(else (list ':empty i))))))))))))
(define pregexp-read-escaped-number
(lambda (s i n)
; s[i] = \
(and (< (+ i 1) n) ;must have at least something following \
(let ((c (string-ref s (+ i 1))))
(and (char-numeric? c)
(let loop ((i (+ i 2)) (r (list c)))
(if (>= i n)
(list (string->number
(list->string (pregexp-reverse! r))) i)
(let ((c (string-ref s i)))
(if (char-numeric? c)
(loop (+ i 1) (cons c r))
(list (string->number
(list->string (pregexp-reverse! r)))
i))))))))))
(define pregexp-read-escaped-char
(lambda (s i n)
; s[i] = \
(and (< (+ i 1) n)
(let ((c (string-ref s (+ i 1))))
(case c
((#\b) (list ':wbdry (+ i 2)))
((#\B) (list ':not-wbdry (+ i 2)))
((#\d) (list ':digit (+ i 2)))
((#\D) (list '(:neg-char :digit) (+ i 2)))
((#\n) (list #\newline (+ i 2)))
((#\r) (list *pregexp-return-char* (+ i 2)))
((#\s) (list ':space (+ i 2)))
((#\S) (list '(:neg-char :space) (+ i 2)))
((#\t) (list *pregexp-tab-char* (+ i 2)))
((#\w) (list ':word (+ i 2)))
((#\W) (list '(:neg-char :word) (+ i 2)))
(else (list c (+ i 2))))))))
(define pregexp-read-posix-char-class
(lambda (s i n)
; lbrack, colon already read
(let ((neg? #f))
(let loop ((i i) (r (list #\:)))
(if (>= i n)
(pregexp-error 'pregexp-read-posix-char-class)
(let ((c (string-ref s i)))
(cond ((char=? c #\^)
(set! neg? #t)
(loop (+ i 1) r))
((char-alphabetic? c)
(loop (+ i 1) (cons c r)))
((char=? c #\:)
(if (or (>= (+ i 1) n)
(not (char=? (string-ref s (+ i 1)) #\])))
(pregexp-error 'pregexp-read-posix-char-class)
(let ((posix-class
(string->symbol
(list->string (pregexp-reverse! r)))))
(list (if neg? (list ':neg-char posix-class)
posix-class)
(+ i 2)))))
(else
(pregexp-error 'pregexp-read-posix-char-class)))))))))
(define pregexp-read-cluster-type
(lambda (s i n)
; s[i-1] = left-paren
(let ((c (string-ref s i)))
(case c
((#\?)
(let ((i (+ i 1)))
(case (string-ref s i)
((#\:) (list '() (+ i 1)))
((#\=) (list '(:lookahead) (+ i 1)))
((#\!) (list '(:neg-lookahead) (+ i 1)))
((#\>) (list '(:no-backtrack) (+ i 1)))
((#\<)
(list (case (string-ref s (+ i 1))
((#\=) '(:lookbehind))
((#\!) '(:neg-lookbehind))
(else (pregexp-error 'pregexp-read-cluster-type)))
(+ i 2)))
(else (let loop ((i i) (r '()) (inv? #f))
(let ((c (string-ref s i)))
(case c
((#\-) (loop (+ i 1) r #t))
((#\i) (loop (+ i 1)
(cons (if inv? ':case-sensitive
':case-insensitive) r) #f))
((#\x)
(set! *pregexp-space-sensitive?* inv?)
(loop (+ i 1) r #f))
((#\:) (list r (+ i 1)))
(else (pregexp-error
'pregexp-read-cluster-type)))))))))
(else (list '(:sub) i))))))
(define pregexp-read-subpattern
(lambda (s i n)
(let* ((remember-space-sensitive? *pregexp-space-sensitive?*)
(ctyp-i (pregexp-read-cluster-type s i n))
(ctyp (car ctyp-i))
(i (cadr ctyp-i))
(vv (pregexp-read-pattern s i n)))
(set! *pregexp-space-sensitive?* remember-space-sensitive?)
(let ((vv-re (car vv))
(vv-i (cadr vv)))
(if (and (< vv-i n)
(char=? (string-ref s vv-i)
#\)))
(list
(let loop ((ctyp ctyp) (re vv-re))
(if (null? ctyp) re
(loop (cdr ctyp)
(list (car ctyp) re))))
(+ vv-i 1))
(pregexp-error 'pregexp-read-subpattern))))))
(define pregexp-wrap-quantifier-if-any
(lambda (vv s n)
(let ((re (car vv)))
(let loop ((i (cadr vv)))
(if (>= i n) vv
(let ((c (string-ref s i)))
(if (and (char-whitespace? c) (not *pregexp-space-sensitive?*))
(loop (+ i 1))
(case c
((#\* #\+ #\? #\{)
(let* ((new-re (list ':between 'minimal?
'at-least 'at-most re))
(new-vv (list new-re 'next-i)))
(case c
((#\*) (set-car! (cddr new-re) 0)
(set-car! (cdddr new-re) #f))
((#\+) (set-car! (cddr new-re) 1)
(set-car! (cdddr new-re) #f))
((#\?) (set-car! (cddr new-re) 0)
(set-car! (cdddr new-re) 1))
((#\{) (let ((pq (pregexp-read-nums s (+ i 1) n)))
(if (not pq)
(pregexp-error
'pregexp-wrap-quantifier-if-any
"left bracket must be followed by
number"))
(set-car! (cddr new-re) (car pq))
(set-car! (cdddr new-re) (cadr pq))
(set! i (caddr pq)))))
(let loop ((i (+ i 1)))
(if (>= i n)
(begin (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))
(let ((c (string-ref s i)))
(cond ((and (char-whitespace? c)
(not *pregexp-space-sensitive?*))
(loop (+ i 1)))
((char=? c #\?)
(set-car! (cdr new-re) #t)
(set-car! (cdr new-vv) (+ i 1)))
(else (set-car! (cdr new-re) #f)
(set-car! (cdr new-vv) i))))))
new-vv))
(else vv)))))))))
;
(define pregexp-read-nums
(lambda (s i n)
; s[i-1] = {
; returns (p q k) where s[k] = }
(let loop ((p '()) (q '()) (k i) (reading 1))
(if (>= k n) (pregexp-error 'pregexp-read-nums))
(let ((c (string-ref s k)))
(cond ((char-numeric? c)
(if (= reading 1)
(loop (cons c p) q (+ k 1) 1)
(loop p (cons c q) (+ k 1) 2)))
((and (char-whitespace? c) (not *pregexp-space-sensitive?*))
(loop p q (+ k 1) reading))
((and (char=? c #\,) (= reading 1))
(loop p q (+ k 1) 2))
((char=? c #\})
(let ((p (string->number (list->string (pregexp-reverse! p))))
(q (string->number (list->string (pregexp-reverse! q)))))
(cond ((and (not p) (= reading 1)) (list 0 #f k))
((= reading 1) (list p p k))
(else (list p q k)))))
(else #f))))))
(define pregexp-invert-char-list
(lambda (vv)
(set-car! (car vv) ':none-of-chars)
vv))
;
(define pregexp-read-char-list
(lambda (s i n)
(let loop ((r '()) (i i))
(if (>= i n)
(pregexp-error 'pregexp-read-char-list
"character class ended too soon")
(let ((c (string-ref s i)))
(case c
((#\]) (if (null? r)
(loop (cons c r) (+ i 1))
(list (cons ':one-of-chars (pregexp-reverse! r))
(+ i 1))))
((#\\)
(let ((char-i (pregexp-read-escaped-char s i n)))
(if char-i (loop (cons (car char-i) r) (cadr char-i))
(pregexp-error 'pregexp-read-char-list "backslash"))))
((#\-) (let ((c-prev (car r)))
(if (char? c-prev)
(loop (cons (list ':char-range c-prev
(string-ref s (+ i 1))) (cdr r))
(+ i 2))
(loop (cons c r) (+ i 1)))))
((#\[) (if (char=? (string-ref s (+ i 1)) #\:)
(let ((posix-char-class-i
(pregexp-read-posix-char-class s (+ i 2) n)))
(loop (cons (car posix-char-class-i) r)
(cadr posix-char-class-i)))
(loop (cons c r) (+ i 1))))
(else (loop (cons c r) (+ i 1)))))))))
;
(define pregexp-string-match
(lambda (s1 s i n sk fk)
(let ((n1 (string-length s1)))
(if (> n1 n) (fk)
(let loop ((j 0) (k i))
(cond ((>= j n1) (sk k))
((>= k n) (fk))
((char=? (string-ref s1 j) (string-ref s k))
(loop (+ j 1) (+ k 1)))
(else (fk))))))))
(define pregexp-char-word?
(lambda (c)
;too restrictive for Scheme but this
;is what \w is in most regexp notations
(or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_))))
(define pregexp-at-word-boundary?
(lambda (s i n)
(or (= i 0) (>= i n)
(let ((c/i (string-ref s i))
(c/i-1 (string-ref s (- i 1))))
(let ((c/i/w? (pregexp-check-if-in-char-class?
c/i ':word))
(c/i-1/w? (pregexp-check-if-in-char-class?
c/i-1 ':word)))
(or (and c/i/w? (not c/i-1/w?))
(and (not c/i/w?) c/i-1/w?)))))))
(define pregexp-check-if-in-char-class?
(lambda (c char-class)
(case char-class
((:any) (not (char=? c #\newline)))
;
((:alnum) (or (char-alphabetic? c) (char-numeric? c)))
((:alpha) (char-alphabetic? c))
((:ascii) (< (char->integer c) 128))
((:blank) (or (char=? c #\space) (char=? c *pregexp-tab-char*)))
((:cntrl) (< (char->integer c) 32))
((:digit) (char-numeric? c))
((:graph) (and (>= (char->integer c) 32)
(not (char-whitespace? c))))
((:lower) (char-lower-case? c))
((:print) (>= (char->integer c) 32))
((:punct) (and (>= (char->integer c) 32)
(not (char-whitespace? c))
(not (char-alphabetic? c))
(not (char-numeric? c))))
((:space) (char-whitespace? c))
((:upper) (char-upper-case? c))
((:word) (or (char-alphabetic? c)
(char-numeric? c)
(char=? c #\_)))
((:xdigit) (or (char-numeric? c)
(char-ci=? c #\a) (char-ci=? c #\b)
(char-ci=? c #\c) (char-ci=? c #\d)
(char-ci=? c #\e) (char-ci=? c #\f)))
(else (pregexp-error 'pregexp-check-if-in-char-class?)))))
(define pregexp-list-ref
(lambda (s i)
;like list-ref but returns #f if index is
;out of bounds
(let loop ((s s) (k 0))
(cond ((null? s) #f)
((= k i) (car s))
(else (loop (cdr s) (+ k 1)))))))
;re is a compiled regexp. It's a list that can't be
;nil. pregexp-match-positions-aux returns a 2-elt list whose
;car is the string-index following the matched
;portion and whose cadr contains the submatches.
;The proc returns false if there's no match.
;Am spelling loop- as loup- because these shouldn't
;be translated into CL loops by scm2cl (although
;they are tail-recursive in Scheme)
(define pregexp-match-positions-aux
(lambda (re s start n i)
(let ((case-sensitive? #t))
(let sub ((re re) (i i) (backrefs '()) (sk list) (fk (lambda () #f)))
;(printf "sub ~s ~s~%" i re)
(cond ((eqv? re ':bos)
(if (= i start) (sk i backrefs) (fk)))
((eqv? re ':eos)
(if (>= i n) (sk i backrefs) (fk)))
((eqv? re ':empty)
(sk i backrefs))
((eqv? re ':wbdry)
(if (pregexp-at-word-boundary? s i n)
(sk i backrefs)
(fk)))
((eqv? re ':not-wbdry)
(if (pregexp-at-word-boundary? s i n)
(fk)
(sk i backrefs)))
((and (char? re) (< i n))
(if ((if case-sensitive? char=? char-ci=?)
(string-ref s i) re)
(sk (+ i 1) backrefs) (fk)))
((and (not (pair? re)) (< i n))
(if (pregexp-check-if-in-char-class?
(string-ref s i) re)
(sk (+ i 1) backrefs) (fk)))
((and (pair? re) (eqv? (car re) ':char-range) (< i n))
(let ((c (string-ref s i)))
(if (let ((c< (if case-sensitive? char<=? char-ci<=?)))
(and (c< (cadr re) c)
(c< c (caddr re))))
(sk (+ i 1) backrefs) (fk))))
((pair? re)
(case (car re)
((:char-range)
(if (>= i n) (fk) (pregexp-error
'pregexp-match-positions-aux)))
((:one-of-chars)
(if (>= i n) (fk)
(let loup-one-of-chars ((chars (cdr re)))
(if (null? chars) (fk)
(sub (car chars) i backrefs sk
(lambda ()
(loup-one-of-chars (cdr chars))))))))
((:neg-char)
(if (>= i n) (fk)
(sub (cadr re) i backrefs
(lambda (i1 backrefs1) (fk))
(lambda () (sk (+ i 1) backrefs)))))
((:seq)
(let loup-seq ((res (cdr re)) (i i) (backrefs backrefs))
(if (null? res) (sk i backrefs)
(sub (car res) i backrefs
(lambda (i1 backrefs1)
(loup-seq (cdr res) i1 backrefs1))
fk))))
((:or)
(let loup-or ((res (cdr re)))
(if (null? res) (fk)
(sub (car res) i backrefs
(lambda (i1 backrefs1)
(or (sk i1 backrefs1)
(loup-or (cdr res))))
(lambda () (loup-or (cdr res)))))))
((:backref)
(let ((backref (pregexp-list-ref backrefs (cadr re))))
(if backref
(pregexp-string-match
(substring s (car backref) (cdr backref))
s i n (lambda (i) (sk i backrefs)) fk)
(sk i backrefs))))
((:sub)
(let* ((sub-backref (cons i i))
(backrefs (append backrefs (list sub-backref))))
(sub (cadr re) i backrefs
(lambda (i1 backrefs1)
(set-cdr! sub-backref i1)
(sk i1 backrefs1)) fk)))
((:lookahead)
(let ((found-it?
(sub (cadr re) i backrefs
list (lambda () #f))))
(if found-it? (sk i backrefs) (fk))))
((:neg-lookahead)
(let ((found-it?
(sub (cadr re) i backrefs
list (lambda () #f))))
(if found-it? (fk) (sk i backrefs))))
((:lookbehind)
(let ((n-actual n)) (set! n i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0 backrefs
list (lambda () #f))))
(set! n n-actual)
(if found-it? (sk i backrefs) (fk)))))
((:neg-lookbehind)
(let ((n-actual n)) (set! n i)
(let ((found-it?
(sub (list ':seq '(:between #f 0 #f :any)
(cadr re) ':eos) 0 backrefs
list (lambda () #f))))
(set! n n-actual)
(if found-it? (fk) (sk i backrefs)))))
((:no-backtrack)
(let ((found-it? (sub (cadr re) i backrefs
list (lambda () #f))))
(if found-it?
(sk (car found-it?) (cadr found-it?))
(fk))))
((:case-sensitive :case-insensitive)
(let ((old case-sensitive?))
(set! case-sensitive?
(eqv? (car re) ':case-sensitive))
(sub (cadr re) i backrefs
(lambda (i1 backrefs1)
(set! case-sensitive? old)
(sk i1 backrefs1))
(lambda ()
(set! case-sensitive? old)
(fk)))))
((:between)
(let* ((maximal? (not (cadr re)))
(p (caddr re)) (q (cadddr re))
(re (car (cddddr re)))
(subpat? (and (pair? re) (eqv? (car re) ':sub))))
(let loup-p ((k 0) (i i) (cbackrefs 'no-match-yet))
(if (< k p)
(sub re i backrefs
(lambda (i1 backrefs1)
(loup-p (+ k 1) i1 backrefs1))
fk)
(let ((q (and q (- q p))))
(let loup-q ((k 0) (i i) (cbackrefs cbackrefs))
(let ((fk (lambda ()
(sk i (if (eqv? cbackrefs
'no-match-yet)
(if subpat?
(append backrefs
(list #f))
backrefs)
cbackrefs)))))
(if (and q (>= k q)) (fk)
(if maximal?
(sub re i backrefs
(lambda (i1 backrefs1)
(or (loup-q (+ k 1) i1 backrefs1)
(fk)))
fk)
(or (fk)
(sub re i backrefs
(lambda (i1 backrefs1)
(loup-q (+ k 1) i1 backrefs1))
fk)))))))))))
(else (pregexp-error 'pregexp-match-positions-aux))))
((>= i n) (fk))
(else (pregexp-error 'pregexp-match-positions-aux)))))))
(define pregexp-replace-aux
(lambda (str ins n backrefs)
(let loop ((i 0) (r ""))
(if (>= i n) r
(let ((c (string-ref ins i)))
(if (char=? c #\\)
(let* ((br-i (pregexp-read-escaped-number ins i n))
(br (if br-i (car br-i)
(if (char=? (string-ref ins (+ i 1)) #\&) 0
#f)))
(i (if br-i (cadr br-i)
(if br (+ i 2)
(+ i 1)))))
(if (not br)
(let ((c2 (string-ref ins i)))
(loop (+ i 1)
(if (char=? c2 #\$) r
(string-append r (string c2)))))
(loop i
(let ((backref (pregexp-list-ref backrefs br)))
(if backref
(string-append r
(substring str (car backref) (cdr backref)))
r)))))
(loop (+ i 1) (string-append r (string c)))))))))
(define pregexp
(lambda (s)
(set! *pregexp-space-sensitive?* #t) ;in case it got corrupted
(list ':sub (car (pregexp-read-pattern s 0 (string-length s))))))
(define regexp pregexp)
(define (regexp? x) (and (pair? x) (eq? ':sub (##sys#slot x 0))))
(define pregexp-match-positions
(lambda (pat str . opt-args)
(let* ((pat (if (string? pat) (pregexp pat) pat))
(start (if (null? opt-args) 0
(let ((start (car opt-args)))
(set! opt-args (cdr opt-args))
start)))
(end (if (null? opt-args) (string-length str)
(car opt-args))))
(let loop ((i start))
(and (<= i end)
(let ((vv (pregexp-match-positions-aux pat str start end i)))
(if vv
(cadr vv)
(loop (+ i 1)))))))))
(define pregexp-match
(lambda (pat str . opt-args)
(let ((ix-prs (apply pregexp-match-positions pat str opt-args)))
(and ix-prs
(map
(lambda (ix-pr)
(and ix-pr
(substring str (car ix-pr) (cdr ix-pr))))
ix-prs)))))
(define pregexp-split
(lambda (pat str)
;split str into substrings, using pat as delimiter
(let ((n (string-length str)))
(let loop ((i 0) (r '()) (picked-up-one-undelimited-char? #f))
(cond ((>= i n) (pregexp-reverse! r))
((pregexp-match-positions pat str i n)
=>
(lambda (y)
(let ((jk (car y)))
(let ((j (car jk)) (k (cdr jk)))
;(printf "j = ~a; k = ~a; i = ~a~n" j k i)
(cond ((= j k)
;(printf "producing ~s~n" (substring str i (+ j 1)))
(loop (+ k 1)
(cons (substring str i (+ j 1)) r) #t))
((and (= j i) picked-up-one-undelimited-char?)
(loop k r #f))
(else
;(printf "producing ~s~n" (substring str i j))
(loop k (cons (substring str i j) r) #f)))))))
(else (loop n (cons (substring str i n) r) #f)))))))
(define pregexp-replace
(lambda (pat str ins)
(let* ((n (string-length str))
(pp (pregexp-match-positions pat str 0 n)))
(if (not pp) str
(let ((ins-len (string-length ins))
(m-i (caar pp))
(m-n (cdar pp)))
(string-append
(substring str 0 m-i)
(pregexp-replace-aux str ins ins-len pp)
(substring str m-n n)))))))
(define pregexp-replace*
(lambda (pat str ins)
(let ((pat (if (string? pat) (pregexp pat) pat))
(n (string-length str))
(ins-len (string-length ins)))
(let loop ((i 0) (r ""))
(let ((pp (pregexp-match-positions pat str i n)))
(if pp
(loop (cdar pp)
(string-append r
(substring str i (caar pp))
(pregexp-replace-aux str ins ins-len pp)))
(string-append r
(substring str i n))))))))
;eof
;;; Chicken API:
(let ([string-append string-append])
(define (prep op rx str start)
(apply op
(if (string? rx)
(string-append "^" rx "$")
`(:seq :bos ,rx :eos) )
str start) )
(define (prep2 op rx str start)
(apply op rx str start) )
(set! string-match (lambda (rx str . start) (prep pregexp-match rx str
start)))
(set! string-match-positions
(lambda (rx str . start)
(let ([r (prep pregexp-match-positions rx str start)])
(and r (map (lambda (p) (list (car p) (cdr p))) r) ) ) ) )
(set! string-search (lambda (rx str . start) (prep2 pregexp-match rx str
start)))
(set! string-search-positions
(lambda (rx str . start)
(let ([r (prep2 pregexp-match-positions rx str start)])
(and r (map (lambda (p) (list (car p) (cdr p))) r) ) ) ) ) )
;;; Split string into fields:
(define string-split-fields
(let ([reverse reverse]
[substring substring]
[string-search-positions string-search-positions] )
(lambda (regexp str . mode-and-start)
(##sys#check-string str 'string-split-fields)
(let* ([argc (length mode-and-start)]
[len (##sys#size str)]
[mode (if (fx> argc 0) (car mode-and-start) #t)]
[start (if (fx> argc 1) (cadr mode-and-start) 0)]
[fini (case mode
[(#:suffix)
(lambda (ms start)
(if (fx< start len)
(##sys#error 'string-split-fields "record does not
end with suffix" str regexp)
(reverse ms) ) ) ]
[(#:infix)
(lambda (ms start)
(if (fx>= start len)
(reverse ms)
(reverse (cons (substring str start len) ms)) ) ) ]
[else (lambda (ms start) (reverse ms)) ] ) ]
[fetch (case mode
[(#:infix #:suffix) (lambda (start from to) (substring
str start from))]
[else (lambda (start from to) (substring str from to))] )
] )
(let loop ([ms '()] [start start])
(let ([m (string-search-positions regexp str start)])
(if m
(let* ([mp (##sys#slot m 0)]
[from (##sys#slot mp 0)]
[to (cadr mp)] )
(if (fx= from to)
(if (fx= to len)
(fini ms start)
(loop (cons (fetch start (fx+ from 1) (fx+ to 2)) ms)
(fx+ to 1)) )
(loop (cons (fetch start from to) ms) to) ) )
(fini ms start) ) ) ) ) ) ) )
;;; Substitute matching strings:
(define string-substitute
(let ([substring substring]
[reverse reverse]
[make-string make-string]
[string-search-positions string-search-positions] )
(lambda (regex subst string . flag)
(##sys#check-string subst 'string-substitute)
(let* ([which (if (pair? flag) (car flag) 1)]
[substlen (##sys#size subst)]
[substlen-1 (fx- substlen 1)]
[result '()]
[total 0] )
(define (push x)
(set! result (cons x result))
(set! total (fx+ total (##sys#size x))) )
(define (substitute matches)
(let loop ([start 0] [index 0])
(if (fx>= index substlen-1)
(push (if (fx= start 0) subst (substring subst start substlen)))
(let ([c (##core#inline "C_subchar" subst index)]
[index+1 (fx+ index 1)] )
(if (char=? c #\\)
(let ([c2 (##core#inline "C_subchar" subst index+1)])
(if (not (char=? #\\ c2))
(let ([mi (list-ref matches (fx- (char->integer c2)
48))])
(push (substring subst start index))
(push (substring string (car mi) (cadr mi)))
(loop (fx+ index 2) index+1) )
(loop start (fx+ index+1 1)) ) )
(loop start index+1) ) ) ) ) )
(define (concatenate strs)
(let ([str (make-string total)])
(let loop ([ss strs] [index 0])
(if (null? ss)
str
(let* ([si (car ss)]
[len (##sys#size si)] )
(##core#inline "C_substring_copy" si str 0 len index)
(loop (cdr ss) (fx+ index len)) ) ) ) ) )
(let loop ([index 0] [count 1])
(let ([matches (string-search-positions regex string index)])
(cond [matches
(let* ([range (car matches)]
[upto (cadr range)] )
(cond [(or (not (fixnum? which)) (fx= count which))
(push (substring string index (car range)))
(substitute matches)
(loop upto #f) ]
[else
(push (substring string index upto))
(loop upto (fx+ count 1)) ] ) ) ]
[else
(push (substring string index (##sys#size string)))
(concatenate (reverse result)) ] ) ) ) ) ) ) )
(define string-substitute*
(let ([string-search-positions string-search-positions])
(lambda (str smap)
(##sys#check-string str 'string-substitute*)
(##sys#check-list smap 'string-substitute*)
(let ([len (##sys#size str)])
(define (collect i from total fs)
(if (fx>= i len)
(##sys#fragments->string
total
(reverse
(if (fx> i from)
(cons (##sys#substring str from i) fs)
fs) ) )
(let loop ([smap smap] [pos len])
(if (null? smap)
(collect pos from (fx+ total (fx- pos i)) fs)
(let* ([p (car smap)]
[sm (car p)]
[st (cdr p)]
[m (string-search-positions sm str i)]
[ma (and m (##sys#slot m 0))] )
(if (and ma (fx= i (##sys#slot ma 0)))
(let ([i2 (##sys#slot (##sys#slot ma 1) 0)])
(when (fx> i from)
(set! fs
(cons (##sys#substring str from i) fs)) )
(collect
i2 i2
(fx+ total (string-length st))
(cons st fs) ) )
(loop (cdr smap)
(if ma
(fxmin pos (##sys#slot ma 0))
pos) ) ) ) ) ) ) )
(collect 0 0 0 '()) ) ) ) )
;;; Some useful things:
(define pattern->regexp
(let ([list->string list->string]
[string->list string->list] )
(lambda (s)
(##sys#check-string s 'pattern->regexp)
(list->string
(let loop ([cs (string->list s)])
(if (null? cs)
'()
(let ([c (car cs)]
[rest (cdr cs)] )
(cond [(char=? c #\*) `(#\. #\* ,@(loop rest))]
[(char=? c #\?) (cons '#\. (loop rest))]
[(or (char-alphabetic? c) (char-numeric? c)) (cons c (loop
rest))]
[else `(#\\ ,c ,@(loop rest))] ) ) ) ) ) ) ) )
(define grep
(let ([string-match string-match])
(lambda (rx lst)
(##sys#check-list lst 'grep)
(let loop ([lst lst])
(if (null? lst)
'()
(let ([x (car lst)]
[r (cdr lst)] )
(if (string-match rx x)
(cons x (loop r))
(loop r) ) ) ) ) ) ) )