bug-gnu-emacs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#36237: Support (rx (and (regexp EXPR) (regexp-quote EXPR)))


From: Noam Postavsky
Subject: bug#36237: Support (rx (and (regexp EXPR) (regexp-quote EXPR)))
Date: Sat, 15 Jun 2019 19:43:30 -0400

X-Debbugs-CC: Kévin Le Gouguec <kevin.legouguec@gmail.com>, Stefan Monnier 
<monnier@iro.umontreal.ca>
Severity: wishlist
Tags: patch

Currently, if you want to construct a regexp which includes runtime
values using rx, there are two options:

- Use the (eval FORM) subform.  But if using the rx macro, FORM is
  evaluated at macroexpansion time, which is awkward.  If using
  rx-to-string, then FORM can't access the lexical environment, which is
  also awkward.
- Build a list at runtime and pass to rx-to-string.  This requires the
  whole rx translation infrastructure at runtime, which is sad.

The patch below allows the rx macro to generate a concat expression
instead of just a plain string.  So the example from
https://debbugs.gnu.org/35564#53 would become

    (let ((start (max 0 (1- pos)))
          (char (string (aref command pos)))) ; need string for `regexp-quote'.
      (and (string-match
            (rx (or (seq (or bos blank)
                         (group-n 1 (regexp-quote char))
                         (or eos blank))
                    (seq ?` (group-n 1 (regexp-quote char)) ?`)))
            command start)
           (= pos (match-beginning 1))))

The rx call in the above macroexpands into:

    (concat "\\(?:\\`\\|[[:blank:]]\\)" "\\(?" "1" ":"
            (regexp-quote char)
            "\\)" "\\(?:\\'\\|[[:blank:]]\\)" "\\|" "`" "\\(?" "1" ":"
            (regexp-quote char)
            "\\)" "`")

Which will be optimal once we apply the patch from #14769 "optimize
`concat's literals".

>From 6b6c6d8997d02236a4e53ccbe1f6a4b362d9b86c Mon Sep 17 00:00:00 2001
From: Noam Postavsky <npostavs@gmail.com>
Date: Fri, 14 Jun 2019 08:43:17 -0400
Subject: [PATCH] Support (rx (and (regexp EXPR) (regexp-quote EXPR)))

* lisp/emacs-lisp/rx.el (rx-regexp): Allow non-string forms.
(rx-constituents): Add regexp-quote constituent, which is like a plain
STRING form, but allows arbitrary lisp expressions.
(rx-regexp-quote): New function.
(rx-compile-to-lisp): New variable.
(rx-subforms): New helper function for handling subforms, including
non-constant case.
(rx-group-if, rx-and, rx-or, rx-=, rx->=, rx-repeat, rx-submatch)
(rx-submatch-n, rx-kleene, rx-atomic-p): Use it to handle non-constant
subforms.
(rx): Document new form, wrap non-constant forms with concat call.
* test/lisp/emacs-lisp/rx-tests.el (rx-tests--match): New macro.
(rx-nonstring-expr, rx-nonstring-expr-non-greedy): New tests.
* etc/NEWS: Announce changes.
---
 etc/NEWS                         |   6 ++
 lisp/emacs-lisp/rx.el            | 189 +++++++++++++++++++++++++--------------
 test/lisp/emacs-lisp/rx-tests.el |  41 +++++++++
 3 files changed, 171 insertions(+), 65 deletions(-)

diff --git a/etc/NEWS b/etc/NEWS
index 723f0a0fb0..bce755a211 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1380,12 +1380,18 @@ when given in a string.  Previously, '(any "\x80-\xff")' would match
 characters U+0080...U+00FF.  Now the expression matches raw bytes in
 the 128...255 range, as expected.
 
+---
 *** The rx 'or' and 'seq' forms no longer require any arguments.
 (or) produces a regexp that never matches anything, while (seq)
 matches the empty string, each being an identity for the operation.
 This also works for their aliases: '|' for 'or'; ':', 'and' and
 'sequence' for 'seq'.
 
+---
+*** 'regexp' and new 'regexp-quote' accept arbitrary lisp as arguments.
+In this case, 'rx' will generate code which produces a regexp string
+at runtime, instead of a constant string.
+
 ** Frames
 
 +++
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index 8ef78fd69e..0b7765322b 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -47,9 +47,11 @@
 
 ;; Rx translates a sexp notation for regular expressions into the
 ;; usual string notation.  The translation can be done at compile-time
-;; by using the `rx' macro.  It can be done at run-time by calling
-;; function `rx-to-string'.  See the documentation of `rx' for a
-;; complete description of the sexp notation.
+;; by using the `rx' macro.  The `regexp' and `regexp-quote' forms accept
+;; non-constant expressions, in which case `rx' will translate to a
+;; `concat' expression.  Translation can be done fully at run-time by
+;; calling function `rx-to-string'.  See the documentation of `rx' for
+;; a complete description of the sexp notation.
 ;;
 ;; Some examples of string regexps and their sexp counterparts:
 ;;
@@ -78,8 +80,8 @@
 ;;         (+ (? ?\n)) (any " \t"))
 ;;
 ;; (concat "^\\(?:" something-else "\\)")
-;; (rx (and line-start (eval something-else))), statically or
-;; (rx-to-string '(and line-start ,something-else)), dynamically.
+;; (rx (and line-start (regexp something-else))), statically or
+;; (rx-to-string `(and line-start ,something-else)), dynamically.
 ;;
 ;; (regexp-opt '(STRING1 STRING2 ...))
 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
@@ -176,6 +178,7 @@ (defvar rx-constituents              ;Not `const' because some modes extend it.
     (not-syntax                . (rx-not-syntax 1 1)) ; sregex
     (category          . (rx-category 1 1 rx-check-category))
     (eval              . (rx-eval 1 1))
+    (regexp-quote      . (rx-regexp-quote 1 1 stringp))
     (regexp            . (rx-regexp 1 1 stringp))
     (regex             . regexp)       ; sregex
     (digit             . "[[:digit:]]")
@@ -302,6 +305,10 @@ (defvar rx-greedy-flag t
   "Non-nil means produce greedy regular expressions for `zero-or-one',
 `zero-or-more', and `one-or-more'.  Dynamically bound.")
 
+(defvar rx-compile-to-lisp nil
+  "Nil means return a regexp as a string.
+Non-nil means we may return a lisp form which produces a
+string (used for `rx' macro).")
 
 (defun rx-info (op head)
   "Return parsing/code generation info for OP.
@@ -344,7 +351,7 @@ (defun rx-check (form)
               (> nargs max-args))
       (error "rx form `%s' accepts at most %d args"
             (car form) max-args))
-    (when (not (null type-pred))
+    (when type-pred
       (dolist (sub-form (cdr form))
        (unless (funcall type-pred sub-form)
          (error "rx form `%s' requires args satisfying `%s'"
@@ -360,19 +367,21 @@ (defun rx-group-if (regexp group)
    ;; for concatenation
    ((eq group ':)
     (if (rx-atomic-p
-        (if (string-match
-             "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
-            (substring regexp 0 (match-beginning 0))
-          regexp))
-       (setq group nil)))
+         (if (and (stringp regexp)
+                  (string-match
+                   "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp))
+             (substring regexp 0 (match-beginning 0))
+           regexp))
+        (setq group nil)))
    ;; for OR
    ((eq group '|) (setq group nil))
    ;; do anyway
    ((eq group t))
    ((rx-atomic-p regexp t) (setq group nil)))
-  (if group
-      (concat "\\(?:" regexp "\\)")
-    regexp))
+  (cond ((and group (stringp regexp))
+         (concat "\\(?:" regexp "\\)"))
+        (group `("\\(?:" ,@regexp "\\)"))
+        (t regexp)))
 
 
 (defvar rx-parent)
@@ -384,7 +393,7 @@ (defun rx-and (form)
 FORM is of the form `(and FORM1 ...)'."
   (rx-check form)
   (rx-group-if
-   (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
+   (rx-subforms (cdr form) ':)
    (and (memq rx-parent '(* t)) rx-parent)))
 
 
@@ -396,7 +405,7 @@ (defun rx-or (form)
     ((null (cdr form)) regexp-unmatchable)
     ((cl-every #'stringp (cdr form))
      (regexp-opt (cdr form) nil t))
-    (t (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")))
+    (t (rx-subforms (cdr form) '| "\\|")))
    (and (memq rx-parent '(: * t)) rx-parent)))
 
 
@@ -669,7 +678,10 @@ (defun rx-= (form)
   (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
     (error "rx `=' requires positive integer first arg"))
-  (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+  (let ((subform (rx-form (nth 2 form) '*)))
+    (if (stringp subform)
+        (format "%s\\{%d\\}" subform (nth 1 form))
+      `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
 
 
 (defun rx->= (form)
@@ -679,7 +691,10 @@ (defun rx->= (form)
   (unless (and (integerp (nth 1 form))
               (> (nth 1 form) 0))
     (error "rx `>=' requires positive integer first arg"))
-  (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+  (let ((subform (rx-form (nth 2 form) '*)))
+    (if (stringp subform)
+        (format "%s\\{%d,\\}" subform (nth 1 form))
+      `(,@subform ,(format "\\{%d,\\}" (nth 1 form))))))
 
 
 (defun rx-** (form)
@@ -700,7 +715,10 @@ (defun rx-repeat (form)
         (unless (and (integerp (nth 1 form))
                      (> (nth 1 form) 0))
           (error "rx `repeat' requires positive integer first arg"))
-        (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
+         (let ((subform (rx-form (nth 2 form) '*)))
+           (if (stringp subform)
+               (format "%s\\{%d\\}" subform (nth 1 form))
+             `(,@subform ,(format "\\{%d\\}" (nth 1 form))))))
        ((or (not (integerp (nth 2 form)))
             (< (nth 2 form) 0)
             (not (integerp (nth 1 form)))
@@ -708,30 +726,26 @@ (defun rx-repeat (form)
             (< (nth 2 form) (nth 1 form)))
         (error "rx `repeat' range error"))
        (t
-        (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
-                (nth 1 form) (nth 2 form)))))
+         (let ((subform (rx-form (nth 3 form) '*)))
+           (if (stringp subform)
+               (format "%s\\{%d,%d\\}" subform (nth 1 form) (nth 2 form))
+             `(,@subform ,(format "\\{%d,%d\\}" (nth 1 form) (nth 2 
form))))))))
 
 
 (defun rx-submatch (form)
   "Parse and produce code from FORM, which is `(submatch ...)'."
-  (concat "\\("
-          (if (= 2 (length form))
-              ;; Only one sub-form.
-              (rx-form (cadr form))
-            ;; Several sub-forms implicitly concatenated.
-            (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
-          "\\)"))
+  (let ((subforms (rx-subforms (cdr form) ':)))
+    (if (stringp subforms)
+        (concat "\\(" subforms "\\)")
+      `("\\(" ,@subforms "\\)"))))
 
 (defun rx-submatch-n (form)
   "Parse and produce code from FORM, which is `(submatch-n N ...)'."
-  (let ((n (nth 1 form)))
-    (concat "\\(?" (number-to-string n) ":"
-           (if (= 3 (length form))
-               ;; Only one sub-form.
-               (rx-form (nth 2 form))
-             ;; Several sub-forms implicitly concatenated.
-             (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
-           "\\)")))
+  (let ((n (nth 1 form))
+        (subforms (rx-subforms (cddr form) ':)))
+    (if (stringp subforms)
+        (concat "\\(?" (number-to-string n) ":" subforms "\\)")
+      `("\\(?" ,(number-to-string n) ":" ,@subforms "\\)"))))
 
 (defun rx-backref (form)
   "Parse and produce code from FORM, which is `(backref N)'."
@@ -759,9 +773,12 @@ (defun rx-kleene (form)
                      (t "?")))
        (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
                  ((memq (car form) '(+ +? 1+ one-or-more))  "+")
-                 (t "?"))))
+                  (t "?")))
+        (subform (rx-form (cadr form) '*)))
     (rx-group-if
-     (concat (rx-form (cadr form) '*) op suffix)
+     (if (stringp subform)
+         (concat subform op suffix)
+       `(,@subform ,(concat op suffix)))
      (and (memq rx-parent '(t *)) rx-parent))))
 
 
@@ -789,15 +806,18 @@ (defun rx-atomic-p (r &optional lax)
 be detected without much effort.  A guarantee of no false
 negatives would require a theoretic specification of the set
 of all atomic regexps."
-  (let ((l (length r)))
-    (cond
-     ((<= l 1))
-     ((= l 2) (= (aref r 0) ?\\))
-     ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
-     ((null lax)
+  (if (and rx-compile-to-lisp
+           (not (stringp r)))
+      nil ;; Runtime value, we must assume non-atomic.
+    (let ((l (length r)))
       (cond
-       ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
-       ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
+       ((<= l 1))
+       ((= l 2) (= (aref r 0) ?\\))
+       ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
+       ((null lax)
+        (cond
+         ((string-match "\\`\\[\\^?]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*]\\'" r))
+         ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r))))))))
 
 
 (defun rx-syntax (form)
@@ -853,9 +873,23 @@ (defun rx-greedy (form)
 
 (defun rx-regexp (form)
   "Parse and produce code from FORM, which is `(regexp STRING)'."
-  (rx-check form)
-  (rx-group-if (cadr form) rx-parent))
-
+  (cond ((stringp form)
+         (rx-group-if (cadr form) rx-parent))
+        (rx-compile-to-lisp
+         ;; Always group non string forms, since we can't be sure they
+         ;; are atomic.
+         (rx-group-if (cdr form) t))
+        (t (rx-check form))))
+
+(defun rx-regexp-quote (form)
+  "Parse and produce code from FORM, which is `(regexp-quote STRING-EXP)'."
+  (cond ((stringp form)
+         ;; This is allowed(?), but makes little sense, you could just
+         ;; use STRING directly.
+         (rx-group-if (regexp-quote (cadr form)) rx-parent))
+        (rx-compile-to-lisp
+         (rx-group-if (list form) rx-parent))
+        (t (rx-check form))))
 
 (defun rx-form (form &optional parent)
   "Parse and produce code for regular expression FORM.
@@ -886,6 +920,27 @@ (defun rx-form (form &optional parent)
      (t
       (error "rx syntax error at `%s'" form)))))
 
+(defun rx-subforms (subforms &optional parent regexp-op)
+  (let ((listify (lambda (x)
+                   (if (listp x) (copy-sequence x)
+                     (list x))))
+        (subregexps (cond ((cdr subforms)
+                           (mapcar (lambda (x) (rx-form x parent)) subforms))
+                          (subforms
+                           ;; Single form, no need for grouping.
+                           (list (rx-form (car subforms))))
+                          ;; Zero forms.
+                          (t ""))))
+    (cond ((or (not rx-compile-to-lisp)
+               (cl-every #'stringp subregexps))
+           (mapconcat #'identity subregexps regexp-op))
+          (regexp-op
+           (nconc (funcall listify (car subregexps))
+                  (cl-mapcan (lambda (x)
+                               (cons regexp-op (funcall listify x)))
+                             (cdr subregexps))))
+          (t (cl-mapcan listify subregexps)))))
+
 
 ;;;###autoload
 (defun rx-to-string (form &optional no-group)
@@ -901,8 +956,12 @@ (defmacro rx (&rest regexps)
 REGEXPS is a non-empty sequence of forms of the sort listed below.
 
 Note that `rx' is a Lisp macro; when used in a Lisp program being
-compiled, the translation is performed by the compiler.
-See `rx-to-string' for how to do such a translation at run-time.
+compiled, the translation is performed by the compiler.  The
+`regexp-quote' and `regexp' accept forms that will evaluate to
+strings, in addition to constant strings.  If REGEXPS include
+such forms, then the result is an expression which returns a
+regexp string, rather than a regexp string directly.  See
+`rx-to-string' for performing translation completely at run-time.
 
 The following are valid subforms of regular expressions in sexp
 notation.
@@ -910,6 +969,10 @@ (defmacro rx (&rest regexps)
 STRING
      matches string STRING literally.
 
+`(regexp-quote STRING)'
+     matches STRING literally, where STRING is any lisp
+     expression that evaluates to a string.
+
 CHAR
      matches character CHAR literally.
 
@@ -1208,12 +1271,16 @@ (defmacro rx (&rest regexps)
 
 `(regexp REGEXP)'
      include REGEXP in string notation in the result."
-  (cond ((null regexps)
-        (error "No regexp"))
-       ((cdr regexps)
-        (rx-to-string `(and ,@regexps) t))
-       (t
-        (rx-to-string (car regexps) t))))
+  (let* ((rx-compile-to-lisp t)
+         (re (cond ((null regexps)
+                    (error "No regexp"))
+                   ((cdr regexps)
+                    (rx-to-string `(and ,@regexps) t))
+                   (t
+                    (rx-to-string (car regexps) t)))))
+    (if (stringp re)
+        re
+      `(concat ,@re))))
 
 
 (pcase-defmacro rx (&rest regexps)
@@ -1275,14 +1342,6 @@ (pcase-defmacro rx (&rest regexps)
                      for var in vars
                      collect `(app (match-string ,i) ,var)))))
 
-;; ;; sregex.el replacement
-
-;; ;;;###autoload (provide 'sregex)
-;; ;;;###autoload (autoload 'sregex "rx")
-;; (defalias 'sregex 'rx-to-string)
-;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
-;; (defalias 'sregexq 'rx)
-
 (provide 'rx)
 
 ;;; rx.el ends here
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 6f392d616d..d457f6919d 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -115,5 +115,46 @@ (ert-deftest rx-seq ()
   ;; Test zero-argument `seq'.
   (should (equal (rx (seq)) "")))
 
+(defmacro rx-tests--match (regexp string &optional match)
+  (macroexp-let2 nil strexp string
+    `(ert-info ((format "Matching %S to %S" ',regexp ,strexp))
+       (should (string-match ,regexp ,strexp))
+       ,@(when match
+           `((should (equal (match-string 0 ,strexp) ,match)))))))
+
+(ert-deftest rx-nonstring-expr ()
+  (let ((bee "b")
+        (vowel "[aeiou]"))
+    (rx-tests--match (rx "a" (regexp-quote bee) "c") "abc")
+    (rx-tests--match (rx "a" (regexp bee) "c") "abc")
+    (rx-tests--match (rx "a" (or (regexp bee) "xy") "c") "abc")
+    (rx-tests--match (rx "a" (or "xy" (regexp bee)) "c") "abc")
+    (should-not (string-match (rx (or (regexp bee) "xy")) ""))
+    (rx-tests--match (rx "a" (= 3 (regexp bee)) "c") "abbbc")
+    (rx-tests--match (rx "x" (= 3 (regexp vowel)) "z") "xeoez")
+    (should-not (string-match (rx "x" (= 3 (regexp vowel)) "z") "xe[]z"))
+    (rx-tests--match (rx "x" (= 3 (regexp-quote vowel)) "z")
+                     "x[aeiou][aeiou][aeiou]z")
+    (rx-tests--match (rx "x" (repeat 1 (regexp vowel)) "z") "xaz")
+    (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xaz")
+    (rx-tests--match (rx "x" (repeat 1 2 (regexp vowel)) "z") "xauz")
+    (rx-tests--match (rx "x" (>= 1 (regexp vowel)) "z") "xaiiz")
+    (rx-tests--match (rx "x" (** 1 2 (regexp vowel)) "z") "xaiz")
+    (rx-tests--match (rx "x" (group (regexp vowel)) "z") "xaz")
+    (rx-tests--match (rx "x" (group-n 1 (regexp vowel)) "z") "xaz")
+    (rx-tests--match (rx "x" (? (regexp vowel)) "z") "xz")))
+
+(ert-deftest rx-nonstring-expr-non-greedy ()
+  "`rx's greediness can't affect runtime regexp parts."
+  (let ((ad-min "[ad]*?")
+        (ad-max "[ad]*")
+        (ad "[ad]"))
+    (rx-tests--match (rx "c" (regexp ad-min) "a") "cdaaada" "cda")
+    (rx-tests--match (rx "c" (regexp ad-max) "a") "cdaaada" "cdaaada")
+    (rx-tests--match (rx "c" (minimal-match (regexp ad-max)) "a") "cdaaada" 
"cdaaada")
+    (rx-tests--match (rx "c" (maximal-match (regexp ad-min)) "a") "cdaaada" 
"cda")
+    (rx-tests--match (rx "c" (minimal-match (0+ (regexp ad))) "a") "cdaaada" 
"cda")
+    (rx-tests--match (rx "c" (maximal-match (0+ (regexp ad))) "a") "cdaaada" 
"cdaaada")))
+
 (provide 'rx-tests)
 ;; rx-tests.el ends here.
-- 
2.11.0


reply via email to

[Prev in Thread] Current Thread [Next in Thread]