guile-sources
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

scheme lexical compiler


From: Lynn Winebarger
Subject: scheme lexical compiler
Date: Wed, 03 Sep 2003 20:01:23 -0500
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.0.1) Gecko/20020830


Put in preparsed regexps and get back one big letrec with all functions
tail-recursive!
Use (regexp->code ,regexp) to compile.

There's an example regexp constructed about halfway through the file.
The first half of the file is the original set of macros I wrote to
convert regexps directly to backtracking closures.  I left it just
in case anyone likes that sort of thing.

There are chezisms.  If you load it into petite chez you'll see what the
output of the examples at the bottom is supposed to be.  I don't know
what will happen in Guile, but it probably won't work on the first pass.

Lynn

;;;  Copyright 2003  Lynn Winebarger
;;;  All Rights Reserved.
;;;  This code is released under the terms of the GPL version 2 or later.
;;;  There is absolutely no warranty!  Use it at your own risk.


;;; RegExp -> epsilon
;;;        | (success)
;;;        | (fail)
;;;        | (catenate RegExp ...)
;;;        | (alternate RegExp ...)
;;;        | (star RegExp ...)
;;;        | (plus RegExp ...)
;;;        | (repeat n RegExp ...)
;;;        | (lookahead RegExp)
;;;        | (action expr0 expr1 ...)
;;;        | (charset negate Literal ...)
;;;        | (charset Literal ...)
;;;        | Literal
;;;
;;; Literal ->   (literal negate lit)
;;;            | (literal lit)
;;;            | (range negate lower upper)
;;;            | (range lower upper)
;;; where string literals specify an exact match
;;;
;;;  Code in an action has access to 5 variables:
;;;     stream, consumed (# of characters), cstack (stack of characters 
consumed),
;;;     sk (success continuation), fk (failure continuation)
;;;  The code generated is all in tail form so the action code should
;;;  either apply one of the supplied continuations _or_ just return into 
whatever
;;;  context called the lexer (i.e. the parser).

(define digit? (lambda (n) (and (integer? n) (>= n 0) (<= n 9))))

;;;; approximately the first half of the file consists of the initial
;;;; macros I wrote to directly expand regexps to nfa code.

(define-syntax syntax-construct-name/template
  (let ([name-builder 
          (lambda args
            (string->symbol
              (apply string-append
                (map (lambda (x)
                       (cond
                         ((symbol? x) (symbol->string x))
                         ((string? x) x)
                         ((char? x) (string x))
                         ((number? x) (number->string x))
                         (else (error 'syntax-construct-name "Can't make a 
string from ~s" x))))
                  args))))])
    (lambda (x)
      (syntax-case x ()
        ((_ tmp (name0 name1 ...) k . args)
         (with-syntax ([new-name (datum->syntax-object (syntax tmp)
                                   (apply name-builder
                                     (map syntax-object->datum (syntax (name0 
name1 ...)))))])
           (syntax
             (k new-name . args))))))))

(define-syntax syntax-infect-names
  (syntax-rules ()
    ((_ tmp names . rest)
     (syntax-infect-names-loop tmp names () . rest))))

(define-syntax syntax-infect-names-loop
  (syntax-rules ()
    ((_ tmp () names . args)
     (syntax-reverse names . args))
    ((_ tmp (name . rest) infected-names . args)
     (syntax-construct-name/template tmp (name) syntax-infect-names-loop-k tmp 
rest infected-names . args))))

(define-syntax syntax-infect-names-loop-k
  (syntax-rules ()
    ((_ new-name tmp rest infected-names . args)
     (syntax-infect-names-loop tmp rest (new-name . infected-names) . args))))

(define-syntax syntax-reverse
  (syntax-rules ()
    ((_ ls . args)
     (syntax-reverse-helper ls () . args))))

(define-syntax syntax-reverse-helper
  (syntax-rules ()
    ((_ () accumulator k . args)
     (k accumulator . args))
    ((_ (first . rest) accumulator . args)
     (syntax-reverse-helper rest (first . accumulator) . args))))

(define-syntax syntax-eval
  (lambda (x)
    (syntax-case x ()
      ((_ const-exp k . args)
       (with-syntax ((constant (datum->syntax-object (syntax _)
                                 (eval (syntax-object->datum (syntax 
const-exp))))))
         (syntax (k constant . args)))))))

(define-syntax lambda@
  (syntax-rules ()
    [(_ () body0 body1 ...) (begin body0 body1 ...)]
    [(_ (formal0 formal1 ...) body0 body1 ...)
     (lambda (formal0)
       (lambda@ (formal1 ...) body0 body1 ...))]))

(define-syntax @    
  (syntax-rules ()
    [(_ arg0) arg0]
    [(_ arg0 arg1 arg2 ...) (@ (arg0 arg1) arg2 ...)]))

(define-syntax lambda@
  (syntax-rules ()
    [(_ . rest) (lambda . rest)]))

(define-syntax @    
  (syntax-rules ()
    [(_ . rest) rest]))

(define-syntax lex::regexp
  (syntax-rules ()
    ((_ re)
     (lambda (s)
       (@ re s 0 '() '()
         (lambda (s n cs d) `(,n ,(list->string (reverse cs)) ,s ,d))
         (lambda () #f))))))

(define-syntax lex::epsilon
  (identifier-syntax (lambda (stream consumed cstack data sk fk) (sk stream 
consumed cstack data))))
                      
(define-syntax lex::alternate
  (syntax-rules ()
    ((_ re)
     re)
    ((_ re1 re2)
     (lambda@ (stream consumed cstack data sk fk)
       (@ re1 stream consumed cstack data
         (lambda (new-stream-1 consumed-1 cstack-1 data-1)
           (@ re2 stream consumed cstack data
             (lambda (new-stream-2 consumed-2 cstack-2 data-2)
               (if (> consumed-2 consumed-1)
                   (sk new-stream-2 consumed-2 cstack-2 data-2)
                   (sk new-stream-1 consumed-1 cstack-1 data-1)))
             (lambda ()
               (sk new-stream-1 consumed-1 cstack-1 data-1))))
         (lambda ()
           (@ re2 stream consumed cstack data sk fk)))))
    ((_ re0 re1 ...)
     (lambda@ (stream consumed cstack data sk fk)
       (@ re0 stream consumed cstack data sk
         (lambda ()
           (@ (lex::alternate re1 ...) stream consumed cstack data sk fk)))))))

(define-syntax lex::catenate
  (syntax-rules ()
    ((_ re)
     re)
    ((_ re1 re2)
     (lambda@ (stream consumed cstack data sk fk)
       (@ re1 stream consumed cstack data
         (lambda (new-stream new-consumed new-cstack new-data)
           (@ re2 new-stream new-consumed new-cstack new-data
             sk fk))
         fk)))
    ((_ re0 re1 ...)
     (lambda@ (stream consumed cstack data sk fk)
       (@ re0 stream consumed cstack data
         (lambda (new-stream new-consumed new-cstack new-data)
           (@ (lex::catenate re1 ...) new-stream new-consumed new-cstack 
new-data sk fk))
         fk)))))

(define-syntax lex::star
  (syntax-rules ()
    ((_ re)
     (letrec ([loop
                (lambda@ (stream consumed cstack data sk fk)
                  (@ re stream consumed cstack data
                    (lambda (new-stream new-consumed new-cstack new-data)
                      (@ loop new-stream new-consumed new-cstack new-data sk 
fk))
                    (lambda ()
                      (sk stream consumed cstack data))))])
       loop))))

(define-syntax lex::plus
  (syntax-rules ()
    ((_ re)
     (lex::catenate re (star re)))))

(define-syntax lex::repeat
  (syntax-rules ()
    ((_ 0 re)
     (lambda@ (stream consumed cstack data sk fk)
       (sk stream consumed cstack data)))
    ((_ 1 re)
     re)
    ((_ n re)
     (syntax-eval (1- n) build-repeat n re))))

(define-syntax build-repeat
  (syntax-rules ()
    ((_ n-1 n re)
     (lambda (stream consumed cstack data sk fk)
       (@ re stream consumed cstack data
         (lambda (new-stream new-consumed new-cstack new-data)
           (@ (lex::repeat n-1 re) new-stream new-consumed new-cstack new-data 
sk fk))
         fk)))))

(define-syntax lex::range
  (syntax-rules (negate)
    ((_ lower upper)
     (syntax-eval (char? lower) range-lower-char?-k lower upper))
    ((_ negate lower upper)
     (syntax-eval (char? lower) range-lower-char?-k lower upper negate))))

(define-syntax range-lower-char?-k
  (syntax-rules ()
    ((_ #t lower upper . rest)
     (syntax-eval (char? upper) range-upper-char?-k-1 lower upper . rest))
    ((_ #f lower upper . rest)
     (syntax-eval (number? lower) range-lower-number?-k lower upper . rest))))

(define-syntax range-upper-char?-k-1
  (syntax-rules (negate)
    ((_ #t lower upper)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda (c new-stream)
           (if (and (char<=? c upper) (char>=? c lower))
               (sk new-stream (1+ consumed) (cons c cstack) data)
               (fk)))
         fk)))
    ((_ #t lower upper negate)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda (c new-stream)
           (if (and (char<=? c upper) (char>=? c lower))
               (fk)
               (sk new-stream (1+ consumed) (cons c cstack) data)))
         fk)))
    ((_ #f lower upper . rest)
     (syntax-eval (digit? upper) convert-upper-k lower upper . rest))))

(define-syntax convert-upper-k
  (syntax-rules ()
    ((_ #t lower upper . rest)
     (syntax-eval (string-ref (number->string upper) 0) converted-upper-k lower 
. rest))))

(define-syntax converted-upper-k
  (syntax-rules (negate)
    ((_ upper lower)
     (lex::range lower upper))
    ((_ upper lower negate)
     (lex::range negate lower upper))))

(define-syntax range-lower-number?-k
  (syntax-rules ()
    ((_ #t lower upper . rest)
     (syntax-eval? (number? upper) range-upper-number?-k lower upper . rest))))

(define-syntax range-upper-number?-k
  (syntax-rules (negate)
    ((_ #t lower upper)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda (c new-stream)
           (let ([cn (char->integer c)])
             (if (and (<= cn upper) (>= cn lower))
               (sk new-stream (1+ consumed) (cons c cstack) data)
               (fk))))
         fk)))
    ((_ #t lower upper negate)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda (c new-stream)
           (let ([cn (char->integer c)])
             (if (and (<= cn upper) (>= cn lower))
               (fk)
               (sk new-stream (1+ consumed) (cons c cstack) data))))
         fk)))
    ((_ #f lower upper . rest)
     (syntax-eval (char? upper) range-upper-char?-k-2 lower upper . rest))))

(define-syntax range-upper-char?-k-2
  (syntax-rules ()
    ((_ #t lower upper . rest)
     (syntax-eval (if (digit? lower) (string-ref (number->string lower) 0) #f) 
range-lower-digit-k lower upper . rest))))

(define-syntax range-lower-digit-k
  (syntax-rules (negate)
    ((_ #f lower upper)
     (syntax-error 'range "upper is a character and lower is not a digit in ~s" 
'(range lower upper)))
    ((_ lower bogus-lower upper)
     (lex::range lower upper))
    ((_ lower bogus-lower upper negate)
     (lex::range negate lower upper))))

(define-syntax lex::literal
  (syntax-rules ()
    ((_ l)
     (syntax-eval (char? l) literal-char?-k l))))

(define-syntax literal-char?-k
  (syntax-rules ()
    ((_ #t l)
     (char-literal l))
    ((_ #f l)
     (syntax-eval (number? l) literal-number?-k l))))

(define-syntax literal-number?-k
  (syntax-rules ()
    ((_ #t l)
     (syntax-eval (integer->char l) char-literal))
    ((_ #f l)
     (syntax-eval (string? l) literal-string?-k l))))

(define-syntax literal-string?-k
  (syntax-rules ()
    ((_ #t l)
     (syntax-eval (string->list l) literal-string-chars-k))))

(define-syntax literal-string-chars-k
  (syntax-rules ()
    ((_ (c ...))
     (lex::catenate (lex::char-literal c) ...))))

(define-syntax lex::char-literal
  (syntax-rules (negate)
    ((_ l)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda@ (c new-stream)
           (if (char=? c l)
               (sk new-stream (1+ consumed) (cons c cstack) data)
               (fk)))
         fk)))
    ((_ negate l)
     (lambda@ (stream consumed cstack data sk fk)
       (stream-read stream
         (lambda@ (c new-stream)
           (if (char=? c l)
               (fk)
               (sk new-stream (1+ consumed) (cons c cstack) data)))
         fk)))))


(define-syntax lex::charset
  (syntax-rules (negate)
    ((_ negate (op . rands) ...)
     (lex::alternate (op negate . rands) ...))
    ((_ . args)
     (lex::alternate . args))))

     
;; ignore and proceed if the supplied regular expression is matched,
;; fail if it is not
(define-syntax lex::lookahead
  (syntax-rules ()
    ((_ re)
     (lambda@ (stream consumed cstack data sk fk)
       (@ re stream consumed cstack data
         (lambda (new-stream new-consumed new-cstack new-data)
           (sk stream consumed cstack data))
         fk)))))

;;;
(define-syntax lex::action
  (lambda (x)
    ;; stupid syntax-rules throws away the operator and the expression might 
not have a pure wrapper
    (syntax-case x ()
      ((_ expr . rest)
       (syntax (syntax-infect-names _ (stream consumed cstack data sk fk) 
action-k expr . rest))))))

(define-syntax action-k
  (syntax-rules ()
    ((_ formals expr . rest)
     (lambda formals expr . rest))))


(define-syntax inert-operators
  (syntax-rules ()
    ((_ op ...)
     (begin
       (define-syntax op
         (syntax-rules ()
           ((_ arg (... ...))
            `(op ,arg (... ...)))))
       ...))))

(inert-operators regexp epsilon catenate alternate star plus repeat lookahead 
charset literal range)
(define-syntax action
  (syntax-rules ()
    ((_ body ...)
     '(action body ...))))

;; functional input streams

(define make-stream
  (lambda (port)
    (cons (file-position port) port)))

(define stream-read
  (lambda (s sk fk)
    (file-position (cdr s) (car s))
    (let ([c (read-char (cdr s))])
      (if (eof-object? c)
          (fk)
          (sk c (make-stream (cdr s)))))))

(define my-string-port-handler
  (lambda (s)
    (let ([l (string-length s)]
          [p 0]
          [eof (read-char (open-input-string ""))])
      (lambda args
        (record-case args
          ((block-read) (port string count)
           (error 'my-string-port-handler "block-read unsupported"))
          ((block-write) (port string count)
           (error 'my-string-port-handler "block-write unsupported"))
          ((char-ready?) (port)
           (if (< p l) #t #f))
          ((clear-input-port) (port)
           (error 'my-string-port-handler "clear-input-port unsupported"))
          ((clear-output-port) (port)
           (error 'my-string-port-handler "clear-output-port unsupported"))
          ((close-port) (port)
           ;; nothing required
           #t)
          ((file-position) (port . position)
           (if (null? position)
               p
               (if (and (<= (car position) l) (>= (car position) 0))
                   (set! p (car position))
                   (error 'my-string-port-handler "~s is out of bounds" (car 
position)))))
          ((file-length) (port)
           l)
          ((flush-output-port) (port)
           (error 'my-string-port-handler "flush-output-port unsupported"))
          ((peek-char) (port)
           (string-ref s p))
          ((port-name) (port)
           "#<my-string-port>")
          ((read-char) (port)
           (if (>= p l)
               eof
               (let ([c (string-ref s p)])
                 (set! p (1+ p))
                 c)))
          ((unread-char) (char port)
           (if (> p 0)
               (set! p (1- p))
               (error 'my-string-port-handler "can't unread-char at beginning 
of string port")))
          ((write-char) (char port)
           (error 'my-string-port-handler "write-char unsupported")))))))

(define make-my-string-port
  (lambda (s)
    (make-input-port (my-string-port-handler s) "")))

(define make-string-stream
  (lambda (s)
    (make-stream (make-my-string-port s))))

(define make-file-stream
  (lambda (name)
    (make-stream (open-input-file name))))

(define-syntax define-regexp
  (syntax-rules ()
    ((_ name defn)
     (define-syntax name (identifier-syntax defn)))))

(define-syntax define-literal-regexps
  (syntax-rules ()
    ((_ (name lit ...) ...)
     (begin
       (define-regexp name (charset (literal lit) ...))
       ...))))

;; the non-alphanumerics on my keyboard
(define-literal-regexps
  (ws #\space #\tab)
  (space #\space)
  (tab #\tab)
  (nl #\newline)
  (bang #\!)
  (at #\@)
  (sharp #\#)
  (dollar #\$)
  (percent #\%)
  (carat #\^)
  (amp #\&)
  (asterick #\*)
  (lparen #\()
  (rparen #\))
  (underscore #\_)
  (minus-sign #\-)
  (equal-sign #\=)
  (plus-sign #\+)
  (backslash #\\)
  (vertical-bar #\|)
  (slash #\/)
  (huh #\?)
  (comma #\,)
  (period #\.)
  (lt #\<)
  (gt #\>)
  (apostrophe #\')
  (backquote #\`)
  (doublequote #\")
  (tilde #\~)
  (colon #\:)
  (semicolon #\;)
  (lsquare #\[)
  (rsquare #\])
  (lcurly #\{)
  (rcurly #\})
  )

(define-regexp alphabetic (charset (range #\a #\z) (range #\A #\Z)))
(define-regexp digit (range #\0 #\9))
(define-regexp alphanum (charset alphabetic digit))
(define-regexp id-follow-specials (charset bang at dollar percent carat amp 
asterick
                                    underscore minus-sign equal-sign plus-sign
                                    backslash slash huh lt gt tilde colon))
(define-regexp id-single-specials (charset asterick minus-sign plus-sign 
equal-sign slash lt gt))
(define-regexp id (alternate
                    (catenate alphabetic
                      (star (charset alphanum id-follow-specials)))
                    id-single-specials))

(define-syntax define-simple-lexer
  (syntax-rules ()
    ((_ lexer-name re-name ...)
     (define lexer-name
       (alternate
         (catenate re-name '(succeed) (action `(re-name ,(list->string (reverse 
cstack)) ,stream ,consumed ,data ,sk ,fk)))
         ...)))))

(define-simple-lexer lexer
  ws nl id sharp backslash vertical-bar comma
  period apostrophe backquote doublequote semicolon
  lsquare rsquare lparen rparen lcurly rcurly
  bang at dollar percent carat amp underscore huh tilde colon)


(define simplest-string-parser
  (lambda (lexer str)
    (let loop ([s (make-string-stream str)]
               [acc '()])
      (let ([result (lexer s 0 '() '()
                      (lambda args args)
                      (lambda () #f))])
        (if result
            (let ([tok (car result)]
                  [tok-string (cadr result)]
                  [new-stream (caddr result)]
                  [consumed (cadddr result)])
              (loop new-stream (cons  `(,tok . ,tok-string) acc)))
            (reverse acc))))))


;;; Here's the interesting stuff


(define merge-following-positions-bindings
  (lambda (b1 b2)
    (let loop ([ls1 b1]
               [ls2 b2]
               [result '()])
      (cond
        [(null? ls1) (append result ls2)]
        [(null? ls2) (append result ls1)]
        [else
          (let ([key (caar ls1)]
                [val (cdar ls1)])
            (let ([pr (assv key ls2)])
              (if pr
                  (loop (cdr ls1) (remq pr ls2)
                    `((,key . ,(sorted-union val (cdr pr))) . ,result))
                  (loop (cdr ls1) ls2
                    `(,(car ls1) . ,result)))))]))))


(define max-char (integer->char 255))
(define min-char (integer->char 0))

(define decrement-char
  (lambda (c)
    (if (char>? c min-char)
        (integer->char (1- (char->integer c)))
        c)))

(define increment-char
  (lambda (c)
    (if (char<? c max-char)
        (integer->char (1+ (char->integer c)))
        c)))

(define fold
  (lambda (op ls)
    (let loop ([result (car ls)]
               [rest (cdr ls)])
      (if (null? rest)
          result
          (loop (op result (car rest)) (cdr rest))))))


;;; cs must contain only simplified literals
(define eliminate-charsets
  (lambda (cs)
    (let loop ([queue cs]
               [stack '()])
      (if (null? queue)
          (reverse stack)
          (record-case (car queue)
            ((charset) cs
             (loop (append cs (cdr queue)) stack))
            ((fail) ignore (loop (cdr queue) stack))
            (else
              (loop (cdr queue) (cons (car queue) stack))))))))

;;; A simplified literal:
;;;   contains no negations
;;;   charsets are non-empty, contain no charsets only simplified literals or 
ranges
;;;   ranges are non-trivial (upper - lower >= 2)

(define simplify-positive-literal
  (lambda (l)
    (record-case l
      ((succeed fail action negate-action) ignore l)
      ((literal) (lit)
       (if (number? lit)
           `(literal ,(integer->char lit))
           l))
      ((range) (lower upper)
       (cond
         [(and (char? lower) (char? upper))
          (cond
            [(char<? upper lower) '(fail)]
            [(char=? upper lower) `(literal ,upper)]
            [else l])]
         [(char? lower)
          (simplify-positive-literal `(range ,lower ,(integer->char upper)))]
         [(char? upper)
          (simplify-positive-literal `(range ,(integer->char lower) ,upper))]
         [else
           (simplify-positive-literal `(range ,(integer->char lower) 
,(integer->char upper)))]))
      ((charset) cs
       (cond
         [(null? cs) '(fail)]
         [(null? (cdr cs)) (car cs)]
         [else
           ;; this function is only called after all elements of cs have been 
through simplify-literal
           ;; and charsets have been eliminated
           (fold union-literals cs)])))))

(define simplify-literal
  (lambda (l)
    (record-case l
      ((succeed fail action negate-action) ignore l)
      ((literal range) (f . rest)
       (simplify-positive-literal
         (if (eq? f 'negate)
             (negate-literal `(,(car l) . ,rest))
             l)))
      ((charset) cs
       (if (null? cs)
           '(fail)
           (simplify-positive-literal
             (if (eq? (car cs) 'negate)
                 (let ([s (eliminate-charsets (map simplify-literal (cdr cs)))])
                   (cond
                     [(null? s) (negate-literal '(fail))]
                     [(null? (cdr s)) (negate-literal (car s))]
                     [else (negate-literal `(charset . ,s))]))
                 `(charset . ,(eliminate-charsets (map simplify-literal 
cs))))))))))


(define wrap-disjoint-literal-list
  (lambda (ls)
    (cond
      [(null? ls) '(fail)]
      [(null? (cdr ls)) (car ls)]
      [else `(charset . ,ls)])))

(define union-charset/simple-lit
  (lambda (cs-in sl)
    (let loop ([cs (cdr cs-in)]
               [tried '()])
      (if (null? cs)
          (if (eq? (car sl) 'succeed)
              `(,@cs-in (succeed))    ;;; succeed as a last resort
              `(charset ,sl . ,(cdr cs-in)))
          (let ([u (union-literals sl (car cs))])
            (if (equal? u (car cs))
                ;; this assumes that union-literals will return (car cs) in the 
same
                ;; order as it went in if (car cs) subsumes u entirely
                cs-in
                (record-case u
                  ((charset) u-cs
                   (if (member sl u-cs)
                       ;; no compaction gained by this union
                       (loop (cdr cs) (cons (car cs) tried))
                       ;; we've found something sl compactifies with, try to 
compact the rest
                       ;;    must not allow recursive charset
                       (let ([possibly-smaller (fold union-literals `(,@u-cs . 
,cs))])
                         (record-case possibly-smaller
                           ((charset) ps-cs
                            ;; ps-cs must be simple and have no adjacency with 
tried
                            (wrap-disjoint-literal-list
                              ;; note that since the lit did not join with 
previous entries,
                              ;; and (cdr cs-in) is already disjoint, we only 
need to try to
                              ;; compactify with the remaining elements of the 
list (i.e. no adjacency)
                              (append (reverse tried) ps-cs)))
                           (else (wrap-disjoint-literal-list
                                   (append (reverse tried) 
`(,possibly-smaller))))))))
                  (else
                    ;; same comments regarding simplicity of result as above
                    (let ([possibly-smaller (fold union-literals `(,u . ,cs))])
                      (record-case possibly-smaller
                        ((charset) ps-cs
                         (wrap-disjoint-literal-list
                           (append (reverse tried) ps-cs)))
                        (else (wrap-disjoint-literal-list
                                (append (reverse tried) 
`(,possibly-smaller))))))))))))))

;; the following 3 operations only maintain simplification, they do
;; not attempt to introduce it.

;;;  When we add actions, there is no way to construct a complement - in 
particular,
;;; when we construct the complement of fail, we don't construct all possible 
actions.
;;; So the interaction between actions and non-actions is delicate
;;; There is a class of literals called negate-action which exists for the
;;; sole purpose of dealing with set differences with actions as elements.
;;;
;;; fail is still the empty set/bottom, though.
;;;   Best way to think about them is as two completely disjoint universes.
;;; To negate "fail", we will use the negate-action with an empty body.
;;; All interesting actions will intersect with that (so that {(action . body)} 
- {} != {})
;;;


;;;  Isn't this horrible?
(define union-literals
  (lambda (l1 l2)
    (record-case l1
      ((succeed) ignore
       (record-case l2
         ((succeed fail) ignore l1)
         ((charset) cs
          (fold union-literals l1 cs))
         (else `(charset ,l2 ,l1))))
      ((fail) ignore l2)
      ((action) body1
       (record-case l2
         ((succeed) ignore
          `(charset ,l1 succeed))
         ((fail) ignore l1)
         ((action) body2
          (if (equal? body1 body2)
              l1
              `(charset ,l1 ,l2)))
         ((charset) cs2
          (union-charset/simple-lit l2 l1))
         (else `(charset ,l1 ,l2))))
      ((negate-action) body1
       (record-case l2
         ((succeed) ignore
          `(charset ,l1 ,l2))
         ((fail) ignore l1)
         ((negate-action) body2
          (if (equal? body1 body2)
              l1
              `(charset ,l1 ,l2)))
         ((charset) cs2
          (union-charset/simple-lit l2 l1))
         (else `(charset ,l1 ,l2))))
      ((literal) (lit1)
       (record-case l2
         ((succeed) ignore `(charset ,l1 ,l2))
         ((fail) ignore l1)
         ((action negate-action) body
          `(charset ,l1 ,l2))
         ((literal) (lit2)
          (cond
            [(eqv? lit1 lit2) l1]
            [(eqv? (char- lit1 lit2) 1)
             `(range ,lit2 ,lit1)]
            [(eqv? (char- lit2 lit1) 1)
             `(range ,lit1 ,lit2)]
            [else `(charset ,l1 ,l2)]))
         ((range) (lower upper)
          (cond
            [(and (char<=? lit1 upper) (char>=? lit1 lower))
             l2]
            [(eqv? (char- lit1 upper) 1)
             `(range ,lower ,lit1)]
            [(eqv? (char- lower lit1) 1)
             `(range ,lit1 ,upper)]
            [else `(charset ,l1 ,l2)]))
         ((charset) cs
          (union-charset/simple-lit l2 l1))))
      ((range) (lower1 upper1)
       (record-case l2
         ((succeed) ignore `(charset ,l1 ,l2))
         ((fail) ignore l1)
         ((action negate-action) body
          `(charset ,l1 ,l2))
         ((literal) (lit2)
          (union-literals l2 l1))
         ((range) (lower2 upper2)
          (cond
            [(char=? lower2 (increment-char upper1)) `(range ,lower1 ,upper2)]
            [(char=? lower1 (increment-char upper2)) `(range ,lower2 ,upper1)]
            [(char>? lower2 upper1) `(charset ,l1 ,l2)]
            [(char>? lower1 upper2) `(charset ,l1 ,l2)]
            [(char<=? upper1 upper2)
             (cond
               [(char<=? lower2 lower1) l2]
               [else `(range ,lower1 ,upper2)])]
            [(char<=? upper2 upper1)
             (cond
               [(char<=? lower1 lower2) l1]
               [else `(range ,lower2 ,upper1)])]))
         ((charset) cs
          (union-charset/simple-lit l2 l1))))
    ((charset) cs1
       (record-case l2
         ((fail) ignore l1)
         ((succeed action negate-action literal range) ignore
          (union-charset/simple-lit l1 l2))
         ((charset) cs2
          ;; correct at each step by simplicity maintenance
          (fold union-literals `(,@cs1 . ,cs2))))))))

(define intersect-literals
  (lambda (l1 l2)
    (record-case l1
      ((succeed) ignore
       (record-case l2
         ((succeed) ignore l1)
         (else '(fail))))
      ((fail) ignore '(fail))
      ((action) body1
       (record-case l2
         ((action) body2
          (if (equal? body1 body2)
              l1
              '(fail)))
         ((negate-action) body2
          (if (not (equal? body1 body2))
              l1
              '(fail)))
         (else '(fail))))
      ((negate-action) body1
       (record-case l2
         ((negate-action) body2
          (if (equal? body1 body2)
              l1
              '(fail)))
         ((action) body2
          (if (not (equal? body1 body2))
              l1
              '(fail)))
         (else '(fail))))
      ((literal) (lit1)
       (record-case l2
         ((succeed fail action negate-action) ignore '(fail))
         ((literal) (lit2)
          (if (eqv? lit1 lit2) l1 '(fail)))
         ((range) (lower upper)
          (if (and (char<=? lit1 upper) (char>=? lit1 lower))
              `(,l1)
              '(fail)))
         ((charset) cs
          (let loop ([cs cs])
            (if (null? cs)
                '(fail)
                (if (equal? (intersect-literals l1 (car cs)) '(fail))
                    (loop (cdr cs))
                    l1))))))
      ((range) (lower1 upper1)
       (record-case l2
         ((succeed fail action negate-action) ignore '(fail))
         ((literal) (lit2)
          (intersect-literals l2 l1))
         ((range) (lower2 upper2)
          (cond
            [(char>? lower2 upper1) '(fail)]
            [(char>? lower1 upper2) '(fail)]
            [(char<=? upper1 upper2)
             (cond
               [(char<=? lower2 lower1) l1]
               [else `(range ,lower2 ,upper1)])]
            [(char<=? upper2 upper1)
             (cond
               [(char<=? lower1 lower2) l2]
               [else `(range ,lower1 ,upper2)])]))
         ((charset) cs2
          ;;  DeMorgan's Law
          (fold union-literals
            (map (lambda (x) (intersect-literals l1 x)) cs2)))))
      ((charset) cs1
       (record-case l2
         ((succeed fail action negate-action) ignore '(fail))
         ((literal) (lit2)
          (intersect-literals l2 l1))
         ((range) (lower2 upper2)
          (intersect-literals l2 l1))
         ((charset) cs2
          ;; DeMorgan's Law + recursion
          (fold union-literals
            (map (lambda (x) (intersect-literals l1 x)) cs2))))))))
                      
(define negate-literal
  (lambda (l)
    (record-case l
      ((succeed) ignore `(charset (range ,min-char ,max-char) (negate-action)))
      ((fail) ignore `(charset (range ,min-char ,max-char) (negate-action) 
succeed))
      ((action) body `(negate-action . ,body))
      ((negate-action) body `(action . ,body))
      ((literal) (l)
       (cond
         [(char=? l min-char)
          `(range ,(increment-char min-char) ,max-char)]
         [(char=? l max-char)
          `(range ,min-char ,(decrement-char max-char))]
         [(char=? l (increment-char min-char))
          `(charset (literal ,min-char) (range ,(increment-char l) ,max-char))]
         [(char=? l (decrement-char max-char))
          `(charset (range ,min-char ,(decrement-char l)) (literal ,max-char))]
         [else
           `(charset (range ,min-char ,(decrement-char l)) (range 
,(increment-char l) ,max-char))]))
      ((range) (lower upper)
       (cond
         [(char=? lower min-char)
          (cond
            [(char=? upper max-char)
             '(fail)]
            [(char=? upper (decrement-char max-char))
             `(literal ,max-char)]
            [else
              `(range ,(increment-char upper) ,max-char)])]
         [(char=? upper max-char)
          (cond
            [(char=? lower (increment-char min-char))
             `(literal ,min-char)]
            [else
              `(range ,min-char ,(decrement-char max-char))])]
         [(char=? lower (increment-char min-char))
          (cond
            [(char=? upper (decrement-char max-char))
             `(charset (literal ,min-char) (literal ,max-char))]
            [else
             `(charset (literal ,min-char) (range ,(increment-char upper) 
,max-char))])]
         [(char=? upper (decrement-char max-char))
          `(charset (range ,min-char ,(decrement-char lower)) (literal 
,max-char))]
         [else
           `(charset (range ,min-char ,(decrement-char lower)) (range 
,(increment-char upper) ,max-char))]))
      ((charset) cs
       (fold intersect-literals (map negate-literal cs))))))

(define satisfy
  (lambda (p ls)
    (let loop ([ls ls]
               [acc '()])
      (if (null? ls)
          (reverse acc)
          (loop (cdr ls)
            (let ([r (p (car ls))])
              (if r
                  (cons r acc)
                  acc)))))))

(define action?
  (lambda (r)
    (record-case r
      ((action negate-action) ignore #t)
      (else #f))))

(define non-action?
  (lambda (r)
    (record-case r
      ((action negate-action) ignore #f)
      (else #t))))

;;; compute the functions nullable, firstpos, lastpos, and followpos
;;;   from red dragon, p. 138 - except all in one fell swoop.
(define generate-positions
  (lambda (r i k)
    (let generate-positions ([r `(catenate (succeed) ,r (succeed))]   ; add 
initial and final success states - necessary!
                             [i i]
                             [p '()]
                             [k k]) ; just to be explicit
      (set! r (record-case r
                ((literal range charset) x
                 (simplify-literal r))
                (else r)))
      (record-case r
        ;; k is of the form (lambda (new-record nullable? firstpos lastpos 
followpos paths position-lits new-i) body ...)
        ((epsilon) () (k '(epsilon) #t '() '() '() '() '() i))
        ((fail) whatever (k '(fail) #f '() '() '() '() '() i))
        ((succeed) whatever
         (k `(succeed ,i) #f `(,i) `(,i) '() `((,i . ,p)) `((,i (succeed))) (1+ 
i)))
        ((literal) (l)
         (k `(literal ,i ,l) #f `(,i) `(,i) '() `((,i . ,p)) `((,i ,r)) (1+ i)))
        ((range) (lower upper)
         (k `(range ,i ,lower ,upper) #f `(,i) `(,i) '() `((,i . ,p)) `((,i 
,r)) (1+ i)))
        ((charset) cs
         (k `(charset ,i . ,cs) #f `(,i) `(,i) '() `((,i . ,p)) `((,i ,r)) (1+ 
i)))
        ((action) body
         (k `(action ,i . ,body) #f `(,i) `(,i) '() `((,i . ,p)) `((,i (action 
,i . ,body))) (1+ i)))
        ((lookahead) (re)
         (let ([identifier (gensym)])
           (generate-positions
             `(catenate (action
                          ;; this side-effect is bad form (filling up symbol 
tables)
                          ;; but a proper answer could be constructed using 
data to pass around bindings
                          (define ,identifer `(,stream ,consumed ,cstack ,data))
                          (sk stream consumed data sk fk))
                (catenate ,re (success))
                (action (apply sk ,identifier)))
             i p k)))
        ((repeat) (n re)
         (if (= n 0)
             (generate-postions '(epsilon) i p k)
             (if (= n 1)
                 (generate-postions re i p k)
                 (generate-postions `(catenate ,re (repeat ,(1- n) ,re)) i p 
k))))
        ((repeat-at-most) (n re)
         (if (= n 0)
             (generate-postions '(epsilon) i p k)
             (if (= n 1)
                 (generate-postions `(alternate (epsilon) ,re) i p k)
                 (generate-postions `(alternate (repeat-at-most ,(1- n) ,re) 
(repeat ,n ,re)) i p k))))
        ((range-repeat) (lower upper re)
         (generate-postions `(catenate (repeat ,lower ,re) (repeat-at-most ,(- 
upper lower))) i p k))
        ((plus) (re)
         (generate-postions `(catenate ,re (star ,re)) i p k))
        ((question) (re)
         (generate-positions `(repeat-at-most 1 ,re) i p k))
        ((alternate) (expr1 . rest)
         (if (null? rest)
             (generate-positions expr1 i p k)
             (let ([expr2 (if (null? (cdr rest))
                              (car rest)
                              `(alternate . ,rest))])
               (generate-positions expr1 i (cons 0 p)
                 (lambda (expr1 nullable1 first-positions-1 last-positions-1 
fp1 paths1 plits1 i)
                   (generate-positions expr2 i (cons 1 p)
                     (lambda (expr2 nullable2 first-positions-2 
last-positions-2 fp2 paths2 plits2 i)
                       (cond
                         [(eq? (car expr1) 'fail)
                          (k expr2 nullable2 first-positions-2 last-positions-2 
fp2 paths2 plits2 i)]
                         [(eq? (car expr2) 'fail)
                          (k expr1 nullable1 first-positions-1 last-positions-1 
fp1 paths1 plits1 i)]
                         [else
                           (let ([nullable (or nullable1 nullable2)]
                                 [first-positions (sorted-union 
first-positions-1 first-positions-2)]
                                 [last-positions  (sorted-union 
last-positions-2 last-positions-1)]
                                 [paths (append paths1 paths2)]
                                 [plits (append plits1 plits2)]
                                 [fp (merge-following-positions-bindings fp1 
fp2)])
                             (k `(alternate ,nullable ,first-positions 
,last-positions ,expr1 ,expr2)
                               nullable first-positions last-positions fp paths 
plits i))]))))))))
        ((catenate) (expr1 . rest)
         (if (null? rest)
             (generate-positions expr1 i p k)
             (let ([expr2 (if (null? (cdr rest))
                              (car rest)
                              `(catenate . ,rest))])
               (generate-positions expr1 i (cons 0 p)
                 (lambda (expr1 nullable1 first-positions-1 last-positions-1 
fp1 paths1 plits1 i)
                   (if (eq? (car expr1) 'fail)
                       (k expr1 nullable1 first-positions-1 last-positions-1 
fp1 paths1 plits1 i)
                       (generate-positions expr2 i (cons 1 p)
                         (lambda (expr2 nullable2 first-positions-2 
last-positions-2 fp2 paths2 plits2 i)
                           (if (eq? (car expr2) 'fail)
                               (k expr2 nullable2 first-positions-2 
last-positions-2 fp2 paths2 plits2 i)
                               (let ([nullable (and nullable1 nullable2)]
                                     [first-positions (if nullable1
                                                          (sorted-union 
first-positions-1 first-positions-2)
                                                          first-positions-1)]
                                     [last-positions (if nullable2
                                                         (sorted-union 
last-positions-2 last-positions-1)
                                                         last-positions-2)]
                                     [fp (merge-following-positions-bindings
                                           (map (lambda (x) (cons x 
first-positions-2)) last-positions-1)
                                           (merge-following-positions-bindings 
fp1 fp2))]
                                     [paths (append paths1 paths2)]
                                     [plits (append plits1 plits2)])
                                 (k `(catenate ,nullable ,first-positions 
,last-positions ,expr1 ,expr2)
                                   nullable first-positions last-positions fp 
paths plits i)))))))))))
        ((star) (expr)
         (generate-positions expr i (cons 0 p)
           (lambda (expr nullable first-positions last-positions fp paths plits 
i)
             (if (eq? (car expr) 'fail)
                 (k expr nullable first-positions last-positions fp paths plits 
i)
                 (k `(star #t ,first-positions ,last-positions ,expr) #t
                   first-positions last-positions
                   (merge-following-positions-bindings
                     (map (lambda (x) (cons x first-positions)) last-positions)
                     fp)
                   paths
                   plits
                   i)))))
         ))))

(define gen-poses
  (lambda (r)
    (generate-positions r 1
      (lambda (expr nullable firstpos lastpos followpos paths plits i)
        `(,expr ,nullable ,firstpos ,lastpos ,followpos ,paths ,plits ,i)))))


;;; Modify the followposls in the following way
;;; For all i such that literal(i) is not an action or i is in the root set
;;;   Let A = { j <- followpos(i) | literal(j) is an action }
;;;      Order A by the index of the action (left to right in original 
expression)
;;;     Let B = followpos(i) - A
;;;         N = { #A new positions }  ; for actions in A
;;;         m = a new position            ; for the transitions in B and the 
union{followpos(i)|i <- A}
;;;            "clone" positions in A by setting literal(N[k]) = literal(A[k]) 
for 1 <= k <= #A
;;;                    and followpos(N[k]) = N[k+1] for 1 <= k < #A
;;;                    and followpos(i) = N[1]
;;;                    and followpos(N[#A]) = B union { followpos(p) | p <- A }
;;;
;;; Interestingly, this construction is dual to the subset construction - 
instead of replacing individual
;;; states with sets of states occupied simultaneously, we replace individual 
actions with the
;;; sequence of actions following any given position.  To unify these 
sequences, not only would the
;;; actual actions have to be the same, but the set of non-action positions in 
the original followpos(i)
;;; would have to agree with the non-action positions in the original 
followpos(j).  In otherwords,
;;; only if followpos(i) = followpos(j).  Note that the order of any particular 
subset of actions is
;;; determined by where they appear in the original expression (to mimic the 
ordering effects of depth-first search),
;;; so in that sense this is also a subset construction.

;;;   ****IMPORTANT NOTE****
;;;   Note that actions should only follow a (success) match if we want greedy 
semantics.  Otherwise,
;;;   actions will be fired too soon.
;;;     Also, the root set must not contain an action for this to work. 
`(catenate (succeed) ,re)
;;;   is a good transformation that makes that true.

(define action<? (lambda (x y) (< (cadr x) (cadr y))))
(define iota (lambda (start n) (let loop ([i (+ start n)] [r '()]) (if (>= i 
start) (loop (1- i) `(,i . ,r)) r))))

(define linearize-actions
  (lambda (followposls plits new-state k)
    (let ([after (lambda (x)
                   (let ([r (assv x followposls)])
                     (if r
                         (cdr r)
                         '())))]
          [literal (lambda (x) (cadr (assv x plits)))])
      (let loop ([ls (satisfy (lambda (pr)
                                (let ([l (literal (car pr))])
                                  (if (non-action? l) pr #f)))
                             followposls)]
                 [lits plits]
                 [new new-state]
                 [acc '()])
        (if (null? ls)
            (k (sort (lambda (x y) (< (car x) (car y))) acc)
              (sort (lambda (x y) (< (car x) (car y))) lits)
              new)
            (let ([i (car (car ls))]
                  [fs (cdr (car ls))])
              (if (null? fs)
                  (loop (cdr ls) lits new (cons (car ls) acc))
                  (let ([A (sort <
                             (satisfy (lambda (x) 
                                        (let ([l (literal x)])
                                          (if (action? l) x #f)))
                               fs))])
                    (if (null? A)
                        (loop (cdr ls) lits new (cons (car ls) acc))
                        (let* ([B (satisfy (lambda (x) 
                                             (let ([l (literal x)])
                                               (if (non-action? l) x #f)))
                                    fs)]
                               [l (length A)]
                               [N (iota new (1- l))]
                               [new (+ new l)]
                               [newlits (map (lambda (n a) `(,n ,(literal a))) 
N A)]
                               [newfps `((,i ,(car N))
                                         (,(1- new) . ,(fold sorted-union (cons 
B (map after A)))) .
                                         ,(map (lambda (n+1) `(,(1- n+1) ,n+1)) 
(cdr N)))])
                          (loop (cdr ls) (append newlits lits) new (append 
newfps acc))))))))))))

(define regexp->linearized-actions 
  (lambda (r)
    (generate-positions r 1
      (lambda (expr nullable firstpos lastpos followpos paths plits i)
        (linearize-actions firstpos followpos plits i
          (lambda args args))))))

(define sorted-union
  (letrec ([loop-through-equals
             (lambda (ls1 ls2)
               (cond
                 [(null? ls1) ls2]
                 [(null? ls1) ls1]
                 [(= (car ls1) (car ls2))
                  (cons (car ls1) (loop-through-equals (cdr ls1) (cdr ls2)))]
                 [(< (car ls1) (car ls2))
                  (insert ls1 ls2)]
                 [else (insert ls2 ls1)]))]
           [insert
             (lambda (ls1 ls2)
               (cond
                 [(null? ls1) ls2]
                 [(< (car ls1) (car ls2))
                  (cons (car ls1) (insert (cdr ls1) ls2))]
                 [else (loop-through-equals ls1 ls2)]))])
    loop-through-equals))

(define build-dfa
  (lambda (root-set follow-poses plits k)
    (let ([fposes (lambda (i)
                    (let ([r (assv i follow-poses)])
                      (if r
                          (cdr r)
                          '())))]
          [char (lambda (i)
                  (cadr (assv i plits)))])
      (let loop ([marked '()]
                 [fresh `(,root-set)]
                 [dfa '()]
                 [label 1])
        (if (null? fresh)
            (k (sort (lambda (x y) (< (car x) (car y))) (translate-targets dfa 
marked))
              (sort (lambda (x y) (< (cdr x) (cdr y))) marked))
            (let uloop ([ls (car fresh)]
                        [fls (map fposes (car fresh))]
                        [transitions '()])
              (if (null? fls)
                  (let ([new-marked `((,(car fresh) . ,label) . ,marked)])
                    (merge-transitions transitions new-marked (cdr fresh)
                      (lambda (transitions new-states)
                        (loop new-marked new-states
                          `((,label . ,transitions) . ,dfa)
                          (1+ label)))))
                  (uloop (cdr ls)
                    (cdr fls)
                    `((,(char (car ls)) .  ,(car fls)) . ,transitions)))))))))

(define translate-targets
  (lambda (dfa marked)
    (let loop ([dls dfa]
               [new-dfa '()])
      (if (null? dls)
          (reverse new-dfa)
          (let inner-loop ([tls (cdar dls)]
                           [new-tls '()])
            (if (null? tls)
                (loop (cdr dls) `((,(caar dls) . ,(reverse new-tls)) . , 
new-dfa))
                (if (null? (cdar tls))  ; should only occur if (caar tls) = 
(succeed)
                    (inner-loop (cdr tls) (cons (car tls) new-tls))
                    (if (number? (cdar tls))  ; check if the target's already a 
number of a subset, rather than a subset
                        (inner-loop (cdr tls) (cons (car tls) new-tls))
                        (inner-loop (cdr tls)
                          `((,(caar tls) . ,(cdr (assoc (cdar tls) marked))) . 
,new-tls))))))))))

;;; this version handles arbitrary simplified literals
(define merge-transitions
  (lambda (tls marked unmarked k)
    (let ([disjoint-transitions (refine-transitions tls)])
      (k disjoint-transitions
        (let find-states ([ls disjoint-transitions]
                          [states unmarked])
          (if (null? ls)
              states
              (find-states (cdr ls)
                (if (or (null? (cdar ls)) (assoc (cdar ls) marked) (member 
(cdar ls) states))
                  states
                  (cons (cdar ls) states)))))))))

(define empty-literal?
  (lambda (r)
    (eq? (car r) 'fail)))

(define refine-transitions
  (lambda (ls)
    (let loop ([current (car ls)]
               [remaining (cdr ls)]
               [stack '()]
               [disjoint '()])
      (cond
        [(null? remaining)
         (if (null? stack)
             (cons current disjoint)
             (let ([q (reverse stack)])
               (loop (car q) (cdr q) '() (cons current disjoint))))]
        [else
          (let ([top (car remaining)]
                [rest (cdr remaining)])
            (let ([m (intersect-literals (car current) (car top))])
              (if (empty-literal? m)
                  (loop current rest (cons top stack) disjoint)
                  (let ([d1 (intersect-literals (negate-literal (car current)) 
(car top))]
                        [d2 (intersect-literals (negate-literal (car top)) (car 
current))])
                    (let ([b1 (if (empty-literal? d2)
                                  #f
                                  (cons d2 (cdr current)))]
                          [b2 (cons m (sorted-union (cdr current) (cdr top)))]
                          [b3  (if (empty-literal? d1)
                                   #f
                                   (cons d1 (cdr top)))])
                      (if b1
                          (loop b1 rest
                            (if b3
                                `(,b2 ,b3 . ,stack)
                                `(,b2 . ,stack))
                            disjoint)
                          (loop b2 rest
                            (if b3
                                `(,b3 . ,stack)
                                stack)
                            disjoint)))))))]))))

(define regexp->dfa
  (lambda (r)
    (generate-positions r 1
      (lambda (expr nullable firstpos lastpos followpos paths plits new)
        (linearize-actions followpos plits new
          (lambda (followpos plits new)
            (build-dfa firstpos followpos plits
              (lambda (dfa sets/states)
                `(,dfa ,sets/states ,new)))))))))

(define extract-actions
  (lambda (r)
    (record-case r
      ((fail succeed literal range) ignore '())
      ((action negate-action) body `(,r))
      ((charset) cs
       (sort (lambda (x y) (< (cadr x) (cadr y)))
         (apply append (map extract-actions cs)))))))

(define extract-nonactions
  (lambda (r)
    (record-case r
      ((fail succeed literal range) ignore `(,r)
      ((action negate-action) body '())
      ((charset) cs
       (apply append (map extract-nonactions cs)))))))

(define state-name
  (lambda (i)
    (string->symbol
      (string-append "state-"
        (number->string i)))))

(define expand-non-action-table
  (lambda (table states code->state new-state k)
    (let recurse ([q table]
                  [states states]
                  [code code->state]
                  [new new-state]
                  [k k])
      (if (null? q)
          (k '(fk) states code new)
          (let ([c (caar q)]
                [target (if (number? (cdar q)) (state-name (cdar q)) #f)])
            ;;; notice the linearity of the dfa occurs where have
            ;;; ,target where the straightforward nfa would have sk
            ;;; and the call to loop where the nfa would have (fk)
            (record-case c
              ((succeed) ignore
               (if (null? (cdr q))
                   (k (if target
                          `(,target stream consumed cstack data sk fk)
                          `(sk stream consumed cstack data))
                     states code new)
                   ;;; the simplification should prevent this from going into 
an infinite recursion
                   (recurse (append (cdr q) `(,(car q))) states code new k)))
              ((fail) ignore
               (recurse (cdr q) states code new k))
              ((literal) (l)
               (recurse (cdr q) states code new
                 (lambda (failure states code new)
                   (if (not (equal? '(fk) failure))
                       (let ([failure-code
;;;                            `(trace-lambda ,new (c new-stream stream 
consumed cstack data sk fk)
                               `(lambda (c new-stream stream consumed cstack 
data sk fk)
                                  ,failure)])
                         (let ([r (assoc failure-code code)])
                           (let ([new-states (if r
                                                 states
                                                 (cons `(,(state-name new) 
,failure-code) states))]
                                 [new-code (if r
                                               code
                                               (cons `(,failure-code . 
,(state-name new)) code))]
                                 [new-new (if r new (1+ new))]
                                 [fail (if r (cdr r) (state-name new))])
                             (k `(if (char=? c ,l)
                                    (,target new-stream (1+ consumed) (cons c 
cstack) data sk
                                      (lambda () (,fail c new-stream stream 
consumed cstack data sk fk)))
                                    (,fail c new-stream stream consumed cstack 
data sk fk))
                               new-states new-code new-new))))
                       (k `(if (char=? c ,l)
                               (,target new-stream (1+ consumed) (cons c 
cstack) data sk fk)
                               (fk))
                         states code new)))))
              ((range) (lower upper)
               (recurse (cdr q) states code new
                 (lambda (failure states code new)
                   (if (not (equal? '(fk) failure))
                       (let ([failure-code
;;;                            `(trace-lambda ,new (c new-stream stream 
consumed cstack data sk fk)
                               `(lambda (c new-stream stream consumed cstack 
data sk fk)
                                  ,failure)])
                         (let ([r (assoc failure-code code)])
                           (let ([new-states (if r
                                                 states
                                                 (cons `(,(state-name new) 
,failure-code) states))]
                                 [new-code (if r
                                               code
                                               (cons `(,failure-code . 
,(state-name new)) code))]
                                 [new-new (if r new (1+ new))]
                                 [fail (if r (cdr r) (state-name new))])
                             (k `(if (and (char<=? c ,upper) (char>=? c ,lower))
                                     (,target new-stream (1+ consumed) (cons c 
cstack) data sk
                                       (lambda () (,fail c new-stream stream 
consumed cstack data sk fk)))
                                     (,fail c new-stream stream consumed cstack 
data sk fk))
                               new-states new-code new-new))))
                       (k `(if (and (char<=? c ,upper) (char>=? c ,lower))
                               (,target new-stream (1+ consumed) (cons c 
cstack) data sk fk)
                               (fk))
                         states code new)))))
              ((charset) cs
               (recurse (append (map (lambda (x) `(,x . ,(cdar q))) cs) (cdr 
q)) states code new k))))))))


;;; I've left a trace-lambda line commented out to make it obvious how to
;;; produce machines that print out their transitions

(define expand-non-action-state
  (lambda (state transition-table states code new k)
    (cond
      [(null? transition-table)
;;;       `(trace-lambda ,(state-name state) (stream consumed cstack data sk 
fk) (fk))]
       `(lambda (stream consumed cstack data sk fk) (fk))]
      [(and (null? (cdr transition-table))
            (equal? '(succeed) (caar transition-table)))
       ;; this is to avoid doing a useless and possibly expensive stream-read
       (let ([target (cdar transition-table)])
         (k
;;;        `(trace-lambda ,(state-name state) (stream consumed cstack data sk 
fk)
           (if (null? target)
               `(lambda (stream consumed cstack data sk fk)
                  (sk stream consumed cstack data))
;                  `(,(state-name target) stream consumed cstack data sk fk)  ; 
reduce this
               (state-name target))
           states code new))]
      [else
        (expand-non-action-table transition-table states code new
          (lambda (test-and-goto states code new)
            (k 
;;;          `(trace-lambda ,(state-name state) (stream consumed cstack data sk 
fk)
              `(lambda  (stream consumed cstack data sk fk)
                 (stream-read stream
                   (lambda (c new-stream)
                     ,test-and-goto)
                   ;; if the last available transition is (success), then we 
succeed
                   ;; when reading the stream fails (because we're only putting 
off
                   ;; success to get the greediest match).
                   ,(let ([r (assoc '(succeed) transition-table)])
                      (if r
                          (if (null? (cdr r))
                              '(lambda () (sk stream consumed cstack data))
                              `(lambda () (,(state-name (cdr r)) stream 
consumed cstack data sk fk)))
                          'fk))))
              states code new)))])))

(define expand-action-state
  (lambda (action i next)
;;; `(trace-lambda ,(state-name i) (stream consumed cstack data sk fk)
    `(lambda (stream consumed cstack data sk fk)
       (let ([action-index ,(cadr action)]
             [sk (lambda (stream consumed cstack data)
                   (,(state-name next) stream consumed cstack data sk fk))])
         . ,(cddr action)))))

;; assumes 1 is the start state and that the dfa is ordered by state number
;; also that states are simple, charsets don't contain actions by the 
linearization
;; construction above, so it's easy to tell which is which.
(define dfa->code
  (lambda (dfa new)
    (let loop ([ls dfa]
               [states '()]
               [code->states '()]
               [new new])
      (if (null? ls)
          `(letrec ,(reverse states) ,(state-name 1))
          (let ([state (caar ls)]
                [table (cdar ls)]
                [name (state-name (caar ls))])
            (if (null? table)
                (let ([code `(lambda (stream consumed cstack data sk fk) (fk))])
                  (loop (cdr ls)  `((,name ,code) . ,states) `((,code . ,name) 
. ,code->states) new))
                (if (and (null? (cdr table)) (action? (caar table)))
                    (let ([code (expand-action-state (caar table) state (cdar 
table))])
                      (loop (cdr ls) `((,name ,code) . ,states) `((,code . 
,name) . ,code->states) new))
                    (expand-non-action-state state table states code->states new
                      (lambda (code states code->states new)
                        (loop (cdr ls)  `((,name ,code) . ,states) `((,code . 
,name) . ,code->states) new))))))))))

(define regexp->code
  (lambda (r)
    (generate-positions r 1
      (lambda (expr nullable firstpos lastpos followpos paths plits new)
        (linearize-actions followpos plits new
          (lambda (followpos plits new)
            (build-dfa firstpos followpos plits
              (lambda (dfa sets/states)
                (dfa->code dfa new)))))))))

(define run-re-string
  (lambda (f s)
    (f (make-string-stream s) 0 '() '()
      (lambda (stream consumed cstack data) `(,(list->string (reverse cstack)) 
,consumed ,stream))
      (lambda () #f))))

(begin
  ;; this is the main example from the Red Dragon book
  (define test-re '(catenate
                     (star (alternate (literal #\a) (literal #\b)))
                     (literal #\a)
                     (literal #\b)
                     (literal #\b)))
  (pretty-print test-re)
  (define test-re-code (regexp->code test-re))
  (pretty-print test-re-code)
  (define test-re-proc (eval test-re-code))
  (pretty-print `(run-re-string test-re-proc "aaabbaabbaabb"))
  (pretty-print (run-re-string test-re-proc "aaabbaabbaabb")))

(define lexer-code (regexp->code lexer))
(define lexer-func (eval lexer-code))

(define test1 '(simplest-string-parser lexer-func "foo bar      `(a b c) { 
what? #\\a"))
(display (format "test1: ~s~n" test1))
(pretty-print (eval test1))

reply via email to

[Prev in Thread] Current Thread [Next in Thread]