;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; f i x - r e . e l ;; ;; Fix ill-conditioned regular expressions. ;; ;; Written by Alan Mackenzie, 2015-02. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Format of the Abstract Syntax Table ;; ;; The AST is an internal representation of the regular expression string. ;; Elements are represented as follows: ;; ;; 1. Ordinary characters are represented as themselves. So "abc" will be ;; (?a ?b ?c) ;; ;; 2. Enclosing parentheses are represented as a list whose car is a symbol ;; whose print-name is the start of the list, and each of whose alternatives ;; is itself a list. So "\$ab\\|cd\$" will be ('\$ (?a ?b) (?c ?d)). Neither ;; "\\|" or "\$" is explicitly represented. Openers like "\$?6:" are ;; handled. ;; ;; 3. The top level of the AST is a special case of 2. whose symbol is '|. ;; Thus the entire regexp "ab\|cd" is represented by ('| (?a ?b) (?c ?d)). ;; ;; 4. Character alternatives have '[ in the car, and the internals of the ;; brackets in the cdr. So "[^])]" becomes ('[ . (?^ ?] ?$). In place of a ;; character there may be an "escape list". Character classes (like ;; [:alnum:]) are handled vaguely properly. The terminating "]" is not ;; explicitly represented. ;; ;; 5. +, *, and ? are represented by conses with the pertinent symbol in the ;; car and the repreated/optional expression in the cdr. So "a+" becomes ;; ('+ . ?a), \$foo\\|bar\$* would be ('* . ('\\( (?f ?o ?o) (?b ?a ?r)))). ;; ;; 6. Backslashed characters are represented by a list whose first element is ;; . So "\_ (length elts) 1) head) (setq elts (cdr elts) elt (car elts)) (setq head (fix-re--common-head head elt))) head)) (defun fix-re--common-head (e0 e1) "E0 and E1 could be any type of element." (let ( ) (cond ((or (atom e0) (atom e1)) (and (equal e0 e1) e0)) ((and (fix-re--is-\( e0) (eq (car e0) (car e1))) (let* ((e0-common (fix-re--common-head-from-alts e0)) (e1-common (fix-re--common-head-from-alts e1))) (cons (car e0) (fix-re--common-head e0-common e1-common)))) ((and (symbolp (car e0)) (eq (car e0) (car e1))) (and (equal e0 e1) e0)) ((not (or (symbolp (car e0)) (symbolp (car e1)))) (let (acc elt) (while (and e0 e1 (setq elt (fix-re--common-head (car e0) (car e1))) (equal elt (car e0))) (push elt acc) (setq elt nil) (setq e0 (cdr e0) e1 (cdr e1))) (if elt (push elt acc)) (nreverse acc))) (t nil)))) (defun fix-re--remove-head (head ptr) "Remove HEAD from front of element which is the cadr of PTR, returning the remains. HEAD is a list of elements which are known to match the head of ELT." (let ((elt (cadr ptr)) ) (while head (cond ((equal (car head) (car elt)) (setq head (cdr head) elt (cdr elt)) (fix-re--chop (cadr ptr) 'car)) ((fix-re--is-$ (car elt)) (mapc (lambda (e) (fix-re--remove-head head e)) (cdar elt)) (setq head nil)) (t (error "fix-re--remove-head: head = %s, ptr = %s" head ptr)))) )) (defun fix-re--RA|RB->R\(A|B$ (ptr ad in-alt) "Transform any $RA\|RB\|RC$ into R$A\|B\|C$, or $R\(?:A\|B\|C$\). FIXME!!! Correct this doc string, since *ptr isn't necessarily an alternatives list. Extract the head common to all the alternatives in the list which is pointed to by PTR and AD, and insert it into the tree structure before that list. If IN-ALT is non-nil, additionally wrap the final expression in $..$. (We need to do this when the enclosing tree element is itself a \( construct. Otherwise, rather than creating a sequence, we would end up with alternatives.) Return non-nil when a transformation was done, else nil." (let ((alt-list (fix-re--ptr-get ptr ad)) ) (when (and (fix-re--is-\( alt-list) (> (length alt-list) 2)) ; i.e. the construct has a \| (let ((head (fix-re--common-head-from-alts alt-list)) (inner-ptr ptr) (inner-ad ad) ) (when head (mapl (lambda (elt-ptr) (when (cdr elt-ptr) (fix-re--remove-head head elt-ptr))) alt-list) (when in-alt (fix-re--wrap-in-\( (car alt-list) ptr ad) (setcar alt-list '\\\(\?:) (setq inner-ptr (cadar ptr)) (setq inner-ad 'car)) (fix-re--splice-list head inner-ptr inner-ad) t ))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun fix-re--R+R*->R+ (ptr ad) "Transform any R+R* to R+. All combinations of +, *, and ? are handled. Return 'shortened, t or nil. ... " (if (eq ad 'cdr) (error "fix-re--R+R*->R+ got 'cdr")) (let ((e0 (if (eq ad 'car) (car ptr) (cadr ptr))) (e1 (if (eq ad 'car) (cadr ptr) (caddr ptr)))) (when (and (consp e0) (consp e1)) (let ((op0 (car e0)) (op1 (car e1)) ;; (link (if (eq ad 'car) (cddr ptr) (cdddr ptr))) op ) (when (and (memq op0 '(+ * \?)) (memq op1 '(+ * \?)) (equal (cdr e0) (cdr e1))) ; Both Rs are the same. (cond ((and (eq op0 '\?) (eq op1 '\?)) ; Cant combine R?R? nil) ((and (eq op0 '+) (eq op1 '+)) ; R+R+ -> RR+ (fix-re--chop-+* ptr 'car) t) (t (setq op (cond ((or (eq op0 '+) (eq op1 '+)) '+) ((or (eq op0 '*) (eq op1 '*)) '*))) (setcar e0 op) (fix-re--chop (if (eq ad 'car) ptr (cdr ptr)) 'cadr) 'shortened)) ))))) (defun fix-re--multi-R+R*->R+ (ptr ad) "Transform R+R*R?.... into R+. PTR/AD point at the first R." (let (res result ) (while (and (>= (length ptr) (if (eq ad 'car) 2 3)) (setq res (fix-re--R+R*->R+ ptr ad)) (if res (setq result t)) (eq res 'shortened))) result)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun fix-re--matches-empty-p (tree) "Return non-nil when the empty string matches TREE." (let (cur ) (cond ((fix-re--is-$ tree) (or (null (cdr tree)) (progn (setq cur (cdr tree)) (while (and cur (not (or (null cur) (fix-re--matches-empty-p (car cur))))) (setq cur (cdr cur))) cur))) ((atom tree) (not tree)) ((memq (car tree) '(* \?))) ((eq (car tree) '+) (fix-re--matches-empty-p (cdr tree))) ((eq (car tree) '\[) nil) ((eq (car tree) ?\$ nil) (t ; Sequntial element. (setq cur tree) (while (and cur (fix-re--matches-empty-p (car cur))) (setq cur (cdr cur))) (not cur))))) (defun fix-re--de-emptify (ptr ad) "Destructively change PTR so that the tree it points to no longer matches the empty string, but matches any non-empty string, and no others, that the original PTR did. Expresssions like ^, $, \<, ...., which \"match the empty string\" at particular places, are left unchanged." (let ((tree (fix-re--ptr-get ptr ad)) cur cur-ptr cur-ad ) (cond ((or (null tree) (atom tree) (memq (car tree) '(?\[)) ) nil) ((fix-re--is-$ tree) (setq cur-ptr tree cur-ad 'car) (while (setq cur (fix-re--ptr-next cur-ptr cur-ad)) (fix-re--de-emptify cur-ptr cur-ad)) ) ((eq (car tree) '+) (fix-re--de-emptify tree 'cdr)) ((eq (car tree) '*) (fix-re--de-emptify tree 'cdr) (setcar tree '+)) ; <------- ((eq (car tree) '\?) (fix-re--de-emptify tree 'cdr) (fix-re--chop-+* ptr ad)) ; <------- ((eq (car tree) ?\$ nil) (t ; Sequential element. (when (fix-re--matches-empty-p tree) ; i.e. all elements match the empty string. (if (cdr tree) ; EF -> (EF@|E@), (where E@ is de-emptified E). (let ((new (list (copy-tree (car tree))))); E EF (fix-re--de-emptify new 'car) ; E@ EF (fix-re--de-emptify tree 'cdr) ; E@ EF@ (fix-re--wrap-list-in-\( '\\\(\?: tree 'car) ; E@ \(address@hidden) (setq tree (car (fix-re--ptr-get ptr ad))) (fix-re--insert-after new tree 'cadr)) ; \(address@hidden|address@hidden) (fix-re--de-emptify tree 'car))))))) (defun fix-re--do-R*ER*-transform (R0-ptr R0-ad empty0-ptr empty1-ptr R1-ptr) "Transformation R*ER* -> R*(address@hidden)? or R*ER+ -> R*(address@hidden|R). Here, empty0/1-ad and R1-ad are always 'cadr, so we omit these from the parameter list. R0/R1-ptr point to the enclosing R*, R+ expressions, empty0/1-ptr to the first and last expressions between them, all of which match the empty string." (let* ((R0 (fix-re--ptr-get R0-ptr R0-ad)) (R0-R (cdr R0)) (R0-+* (car R0)) (R1-+* (car (fix-re--ptr-get R1-ptr 'cadr))) new link ) ;; First "de-emptify" the empty expression sequence. (if (eq empty0-ptr empty1-ptr) ;; Special case: just one empty matcher (fix-re--de-emptify empty0-ptr 'cadr) ;; General case: temporarily chop off the link on the last of ;; the list of empty matchers, so as to de-emptify them as a ;; sequence. (setq link (fix-re--bind-link-nil empty1-ptr 'cadr)) (fix-re--de-emptify empty0-ptr 'cdr) (fix-re--restore-link link empty0-ptr 'cadr) ;; We've now lost R1-ptr. Restore it. The deemptification will have ;; left everything between empty0 and empty1 as a single list element. ;; So what now follows empty0 is R1. ;; We have also lost empty1-ptr, but we don't need it any more. (setq R1-ptr (cdr empty0-ptr))) ; R1-ad remains 'cadr. ;; Now transform according to whether we have R* or R+ or a mixture. ;; Do ER* -> (address@hidden). ;; First set the link on R* to nil, to "terminate" the list. This link ;; will be set on the newly formed $..$ construct. (setq link (fix-re--bind-link-nil R1-ptr 'cadr)) (fix-re--wrap-list-in-$ '\\\(\?: empty0-ptr 'cadr) ;; empty0-ptr/ad now points at the new \(..$. (fix-re--restore-link link empty0-ptr 'cadr) (if (eq R1-+* '*) ;; Apply ? to $..$ to give R*(address@hidden)? (fix-re--+*ify '\? empty0-ptr 'cadr) ;; Change R*(address@hidden) to R*(address@hidden|R) (setq new (copy-tree R0-R)) ; R R*(address@hidden) (fix-re--insert-after new (cadr empty0-ptr) 'cadr) ; R*(address@hidden|R) ))) (defun fix-re--R+ER*->R+\(address@hidden)\? (ptr ad) "Do R+ER* -> R+(address@hidden)? or R+ER+ -> R+(address@hidden|R) on the whole list. PTR/AD point at the first element of the sequential list. Here, E is a non-empty sequence of elements which are matched by the empty string, E@ is the \"de-emptified\" version of E." ;; We must perform the loop rightmost transformations first. To see this, ;; consider R*ER*FR* done leftmost first. The first transformation takes us ;; to R*(address@hidden)?FR*. We're now stuck, as the middle R* is no longer ;; "exposed" to the last R*, and the end expression is still ill-formed. ;; Done rightmost first, R*ER*FR* -> R*ER*(address@hidden)? -> R*(address@hidden)?(address@hidden)?, which ;; is well-formed. (let (res) (let ((ptr ptr) (ad ad)) (when (fix-re--ptr-next ptr ad) (setq res (fix-re--R+ER*->R+\(address@hidden)\? ptr ad)))) (let* ((elt-ptr ptr) (elt-ad ad) (elt (fix-re--ptr-get elt-ptr elt-ad)) R0-R empty0-ptr empty1-ptr R1-ptr ; No need for ..-ad's, since ; these will always be 'cadr. R1-+* ) (or ;; Isy `elt' R+ or R*? (when (and (consp elt) (memq (car elt) '(+ *))) (setq R0-R (cdr elt)) ;; Is the next element one matching the empty string, and which ;; isn't R+ or R*? (setq elt (fix-re--ptr-next elt-ptr elt-ad)) (when (and elt (fix-re--matches-empty-p elt) (not (and (consp elt) (memq (car elt) '(+ *)) (equal (cdr elt) R0-R)))) (setq empty0-ptr elt-ptr ; Remember first empty. -ad is implicitly 'cadr empty1-ptr elt-ptr ) ; Remember last empty. ;; Read the elements which match empty, but aren't R+ or R*. (while (and (setq elt (fix-re--ptr-next elt-ptr elt-ad)) (fix-re--matches-empty-p elt) (not (and (consp elt) (memq (car elt) '(+ *)) (equal (cdr elt) R0-R)))) (setq empty1-ptr elt-ptr)) ;; Have we found the matching R+ or R*? (when (and elt (consp elt) (memq (car elt) '(+ *)) (equal (cdr elt) R0-R)) ;; Yes. We're in business. (fix-re--do-R*ER*-transform ptr ad empty0-ptr empty1-ptr elt-ptr) t))) res)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; R*(R*A|B) -> R*(A|B) (defun fix-re--do-R+$R*A|B$-transform (R-rep alt) "Attempt a R+(R*A|B) -> R+(A|B) transformation. R-REP is a cons representing either R+ or R*. ALT represents a form of the form $..\|..\|...$. " (let* ((R-R (cdr R-rep)) (R-+* (car R-rep)) (ptr alt) (ad 'cadr) ; Point to the second elt. of the list, the first being '\\$ (elt (fix-re--ptr-get ptr ad)) res car-elt elt-+* ) (while elt ; (R*A) (when (and (consp elt) ; This should always be true (setq car-elt (car elt)) ; This is now R*A (consp car-elt) (memq (car car-elt) '(+ *)) (equal (cdr car-elt) R-R)) (setq elt-+* (car car-elt)) (if (eq elt-+* '*) (fix-re--chop elt 'car) (fix-re--chop-+* elt 'car)) (setq res t)) (setq elt (fix-re--ptr-next ptr ad))) res)) (defun fix-re--R+\(R*A|B$->R*$A|B$ (ptr ad) "Do the transition on every pertinent element pairs in the sequence. PTR/AD point to the first element in the sequential list." (let ((elt (fix-re--ptr-get ptr ad)) R-rep res) (while elt (if (and (consp elt) (memq (car elt) '(+ *))) (progn (setq R-rep elt elt (fix-re--ptr-next ptr ad)) (when (fix-re--is-$ elt) (if (fix-re--do-R+\(R*A|B$-transform R-rep elt) (setq res t)) (setq elt (fix-re--ptr-next ptr ad)))) (setq elt (fix-re--ptr-next ptr ad)))) res)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Top level AST transformation routines. (defun fix-re--transform (ptr ad in-alt) "Transform the expression pointed at by PTR/AD. Return non-nil when we did something. IN-ALT is non-zero iff the expression is directly contained in $..$." (let ((e (fix-re--ptr-get ptr ad)) cur res cur-ptr cur-ad ) (when (and (consp e) (not (eq (car e) ?\[)) ; character alternative, nothing to do. (not (eq (car e) ?\e))) ; escape pair/triple, nothing to do. ;; First recurse on all subexpressions (cond ((fix-re--is-$ e) (setq cur e) (while (cdr cur) (fix-re--setres (fix-re--transform cur 'cadr t)) (setq cur (cdr cur)))) ((symbolp (car e)) ; +, *, ?. (fix-re--setres (fix-re--transform e 'cdr nil))) (t ; Sequential list. (fix-re--setres (fix-re--transform e 'car nil)) (setq cur e) (while (cdr cur) (fix-re--setres (fix-re--transform cur 'cadr nil)) (setq cur (cdr cur))))) ;; Now operate on the supplied list. (cond ((fix-re--is-\( e) ;; Extract any common head from inside \(..$. ;; We're sending in a pointer to e to fix-re--RA|RB... (fix-re--setres (fix-re--RA|RB->R$A|B$ ptr ad in-alt)) ) ((symbolp (car e)) ; +, *, ? ) ; Nothing to do here - (t ; Sequential list. (setq cur-ptr e cur-ad 'car) (while (progn (fix-re--setres (fix-re--multi-R+R*->R+ cur-ptr cur-ad)) (fix-re--ptr-next cur-ptr cur-ad))) (setq cur-ptr e cur-ad 'car) (fix-re--setres (fix-re--R+ER*->R+$address@hidden)\? e 'car)) (fix-re--setres (fix-re--R+\(R*A|B$->R*$A|B$ e 'car))))) res)) (defun fix-re--transform-AST (tree) "The top-level transformation function. TREE will have '| as its car. Return the transformed (or newly expanded) tree. " (let ((ptr (list tree))) (fix-re--transform ptr 'car t))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Entry point routines. (defun fix-re-test (re) "Attempt to fix the regexp RE. If no fix is needed, return nil, otherwise return the fixed regexp. Note: this function doesn't touch the cache `fix-re--cache'." (string-match re "") ; Throw an error if re is invalid. (let* ((ast (fix-re--build-AST re))) (and (fix-re--transform-AST ast) (fix-re--dump-AST ast)))) (defcustom fix-re-cache-limit 40 "Maximum number of entries in the cache for `fix-re'." :type 'integer :group 'isearch) (defvar fix-re--cache-overflowed nil "Non-nil when the association list `fix-re--cache' has overflowed its size limit `fix-re--cache-limit'") (defvar fix-re--cache nil "An association list linking input regexps with fixed regexps. The key of each element is the input regexp. The value is nil if the key regexp is OK, otherwise it's the replacement regexp.") (defun fix-re (re) "Check the regexp RE for certain solecisms, and if any are found, fix them. Return the fixed regexp, if a fix was done, otherwise RE. Regexps passed to `fix-re' for the first time are inserted into a cache `fix-re--cache', so that calling `fix-re' repeatedly with an argument is fast. The limit on `fix-re--cache''s size is the configurable option `fix-re-cache-limit'. The typical errors corrected are ...... FIXME!!! If any fixes were needed, return the fixed regexp, otherwise return RE." (let ((elt (assoc re fix-re--cache)) fix) (if elt (progn (unless (eq elt (car fix-re--cache)) (setq fix-re--cache (delq elt fix-re--cache)) (push elt fix-re--cache)) (or (cdr elt) re)) (setq fix (fix-re-test re)) (when (> (length fix-re--cache) (max fix-re-cache-limit 1)) (or fix-re--cache-overflowed (setq fix-re--cache-overflowed re)) (setq fix-re--cache (butlast fix-re--cache))) (push (cons re fix) fix-re--cache) (or fix re))))