LCOV - code coverage report
Current view: top level - lisp/emacs-lisp - rx.el (source / functions) Hit Total Coverage
Test: tramp-tests-after.info Lines: 132 355 37.2 %
Date: 2017-08-30 10:12:24 Functions: 19 45 42.2 %

          Line data    Source code
       1             : ;;; rx.el --- sexp notation for regular expressions
       2             : 
       3             : ;; Copyright (C) 2001-2017 Free Software Foundation, Inc.
       4             : 
       5             : ;; Author: Gerd Moellmann <gerd@gnu.org>
       6             : ;; Maintainer: emacs-devel@gnu.org
       7             : ;; Keywords: strings, regexps, extensions
       8             : 
       9             : ;; This file is part of GNU Emacs.
      10             : 
      11             : ;; GNU Emacs is free software: you can redistribute it and/or modify
      12             : ;; it under the terms of the GNU General Public License as published by
      13             : ;; the Free Software Foundation, either version 3 of the License, or
      14             : ;; (at your option) any later version.
      15             : 
      16             : ;; GNU Emacs is distributed in the hope that it will be useful,
      17             : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
      18             : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      19             : ;; GNU General Public License for more details.
      20             : 
      21             : ;; You should have received a copy of the GNU General Public License
      22             : ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
      23             : 
      24             : ;;; Commentary:
      25             : 
      26             : ;; This is another implementation of sexp-form regular expressions.
      27             : ;; It was unfortunately written without being aware of the Sregex
      28             : ;; package coming with Emacs, but as things stand, Rx completely
      29             : ;; covers all regexp features, which Sregex doesn't, doesn't suffer
      30             : ;; from the bugs mentioned in the commentary section of Sregex, and
      31             : ;; uses a nicer syntax (IMHO, of course :-).
      32             : 
      33             : ;; This significantly extended version of the original, is almost
      34             : ;; compatible with Sregex.  The only incompatibility I (fx) know of is
      35             : ;; that the `repeat' form can't have multiple regexp args.
      36             : 
      37             : ;; Now alternative forms are provided for a degree of compatibility
      38             : ;; with Olin Shivers' attempted definitive SRE notation.  SRE forms
      39             : ;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
      40             : ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
      41             : ;; Some forms are inconsistent with SRE, either for historical reasons
      42             : ;; or because of the implementation -- simple translation into Emacs
      43             : ;; regexp strings.  These include: any, word.  Also, case-sensitivity
      44             : ;; and greediness are controlled by variables external to the regexp,
      45             : ;; and you need to feed the forms to the `posix-' functions to get
      46             : ;; SRE's POSIX semantics.  There are probably more difficulties.
      47             : 
      48             : ;; Rx translates a sexp notation for regular expressions into the
      49             : ;; usual string notation.  The translation can be done at compile-time
      50             : ;; by using the `rx' macro.  It can be done at run-time by calling
      51             : ;; function `rx-to-string'.  See the documentation of `rx' for a
      52             : ;; complete description of the sexp notation.
      53             : ;;
      54             : ;; Some examples of string regexps and their sexp counterparts:
      55             : ;;
      56             : ;; "^[a-z]*"
      57             : ;; (rx (and line-start (0+ (in "a-z"))))
      58             : ;;
      59             : ;; "\n[^ \t]"
      60             : ;; (rx (and "\n" (not blank))), or
      61             : ;; (rx (and "\n" (not (any " \t"))))
      62             : ;;
      63             : ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
      64             : ;; (rx "*** EOOH ***\n")
      65             : ;;
      66             : ;; "\\<\\(catch\\|finally\\)\\>[^_]"
      67             : ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
      68             : ;;          (not (any ?_))))
      69             : ;;
      70             : ;; "[ \t\n]*:\\([^:]+\\|$\\)"
      71             : ;; (rx (and (zero-or-more (in " \t\n")) ":"
      72             : ;;          (submatch (or line-end (one-or-more (not (any ?:)))))))
      73             : ;;
      74             : ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
      75             : ;; (rx (and line-start
      76             : ;;          "content-transfer-encoding:"
      77             : ;;          (+ (? ?\n)) blank
      78             : ;;          "quoted-printable"
      79             : ;;          (+ (? ?\n)) blank))
      80             : ;;
      81             : ;; (concat "^\\(?:" something-else "\\)")
      82             : ;; (rx (and line-start (eval something-else))), statically or
      83             : ;; (rx-to-string '(and line-start ,something-else)), dynamically.
      84             : ;;
      85             : ;; (regexp-opt '(STRING1 STRING2 ...))
      86             : ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
      87             : ;; calls `regexp-opt' as needed.
      88             : ;;
      89             : ;; "^;;\\s-*\n\\|^\n"
      90             : ;; (rx (or (and line-start ";;" (0+ space) ?\n)
      91             : ;;         (and line-start ?\n)))
      92             : ;;
      93             : ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
      94             : ;; (rx (and "$Id: "
      95             : ;;          (1+ (not (in " ")))
      96             : ;;          " "
      97             : ;;          (submatch (1+ (not (in " "))))
      98             : ;;          " "))
      99             : ;;
     100             : ;; "\\\\\\\\\\[\\w+"
     101             : ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
     102             : ;;
     103             : ;; etc.
     104             : 
     105             : ;;; History:
     106             : ;;
     107             : 
     108             : ;;; Code:
     109             : 
     110             : ;; FIXME: support macros.
     111             : 
     112             : (defvar rx-constituents              ;Not `const' because some modes extend it.
     113             :   '((and                . (rx-and 1 nil))
     114             :     (seq                . and)          ; SRE
     115             :     (:                  . and)          ; SRE
     116             :     (sequence           . and)          ; sregex
     117             :     (or                 . (rx-or 1 nil))
     118             :     (|                  . or)           ; SRE
     119             :     (not-newline        . ".")
     120             :     (nonl               . not-newline)  ; SRE
     121             :     (anything           . (rx-anything 0 nil))
     122             :     (any                . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
     123             :     (any                . ".")          ; sregex
     124             :     (in                 . any)
     125             :     (char               . any)          ; sregex
     126             :     (not-char           . (rx-not-char 1 nil rx-check-any)) ; sregex
     127             :     (not                . (rx-not 1 1 rx-check-not))
     128             :     (repeat             . (rx-repeat 2 nil))
     129             :     (=                  . (rx-= 2 nil))    ; SRE
     130             :     (>=                      . (rx->= 2 nil))   ; SRE
     131             :     (**                 . (rx-** 2 nil))   ; SRE
     132             :     (submatch           . (rx-submatch 1 nil)) ; SRE
     133             :     (group              . submatch)     ; sregex
     134             :     (submatch-n         . (rx-submatch-n 2 nil))
     135             :     (group-n            . submatch-n)
     136             :     (zero-or-more       . (rx-kleene 1 nil))
     137             :     (one-or-more        . (rx-kleene 1 nil))
     138             :     (zero-or-one        . (rx-kleene 1 nil))
     139             :     (\?                 . zero-or-one)  ; SRE
     140             :     (\??                . zero-or-one)
     141             :     (*                  . zero-or-more) ; SRE
     142             :     (*?                 . zero-or-more)
     143             :     (0+                 . zero-or-more)
     144             :     (+                  . one-or-more)  ; SRE
     145             :     (+?                 . one-or-more)
     146             :     (1+                 . one-or-more)
     147             :     (optional           . zero-or-one)
     148             :     (opt                . zero-or-one)  ; sregex
     149             :     (minimal-match      . (rx-greedy 1 1))
     150             :     (maximal-match      . (rx-greedy 1 1))
     151             :     (backref            . (rx-backref 1 1 rx-check-backref))
     152             :     (line-start         . "^")
     153             :     (bol                . line-start)   ; SRE
     154             :     (line-end           . "$")
     155             :     (eol                . line-end)     ; SRE
     156             :     (string-start       . "\\`")
     157             :     (bos                . string-start) ; SRE
     158             :     (bot                . string-start) ; sregex
     159             :     (string-end         . "\\'")
     160             :     (eos                . string-end)   ; SRE
     161             :     (eot                . string-end)   ; sregex
     162             :     (buffer-start       . "\\`")
     163             :     (buffer-end         . "\\'")
     164             :     (point              . "\\=")
     165             :     (word-start         . "\\<")
     166             :     (bow                . word-start)   ; SRE
     167             :     (word-end           . "\\>")
     168             :     (eow                . word-end)     ; SRE
     169             :     (word-boundary      . "\\b")
     170             :     (not-word-boundary  . "\\B")      ; sregex
     171             :     (symbol-start       . "\\_<")
     172             :     (symbol-end         . "\\_>")
     173             :     (syntax             . (rx-syntax 1 1))
     174             :     (not-syntax         . (rx-not-syntax 1 1)) ; sregex
     175             :     (category           . (rx-category 1 1 rx-check-category))
     176             :     (eval               . (rx-eval 1 1))
     177             :     (regexp             . (rx-regexp 1 1 stringp))
     178             :     (regex              . regexp)       ; sregex
     179             :     (digit              . "[[:digit:]]")
     180             :     (numeric            . digit)        ; SRE
     181             :     (num                . digit)        ; SRE
     182             :     (control            . "[[:cntrl:]]") ; SRE
     183             :     (cntrl              . control)       ; SRE
     184             :     (hex-digit          . "[[:xdigit:]]") ; SRE
     185             :     (hex                . hex-digit)      ; SRE
     186             :     (xdigit             . hex-digit)      ; SRE
     187             :     (blank              . "[[:blank:]]")  ; SRE
     188             :     (graphic            . "[[:graph:]]")  ; SRE
     189             :     (graph              . graphic)        ; SRE
     190             :     (printing           . "[[:print:]]")  ; SRE
     191             :     (print              . printing)       ; SRE
     192             :     (alphanumeric       . "[[:alnum:]]")  ; SRE
     193             :     (alnum              . alphanumeric)   ; SRE
     194             :     (letter             . "[[:alpha:]]")
     195             :     (alphabetic         . letter)       ; SRE
     196             :     (alpha              . letter)       ; SRE
     197             :     (ascii              . "[[:ascii:]]") ; SRE
     198             :     (nonascii           . "[[:nonascii:]]")
     199             :     (lower              . "[[:lower:]]") ; SRE
     200             :     (lower-case         . lower)         ; SRE
     201             :     (punctuation        . "[[:punct:]]") ; SRE
     202             :     (punct              . punctuation)   ; SRE
     203             :     (space              . "[[:space:]]") ; SRE
     204             :     (whitespace         . space)         ; SRE
     205             :     (white              . space)         ; SRE
     206             :     (upper              . "[[:upper:]]") ; SRE
     207             :     (upper-case         . upper)         ; SRE
     208             :     (word               . "[[:word:]]")        ; inconsistent with SRE
     209             :     (wordchar           . word)          ; sregex
     210             :     (not-wordchar       . "\\W"))
     211             :   "Alist of sexp form regexp constituents.
     212             : Each element of the alist has the form (SYMBOL . DEFN).
     213             : SYMBOL is a valid constituent of sexp regular expressions.
     214             : If DEFN is a string, SYMBOL is translated into DEFN.
     215             : If DEFN is a symbol, use the definition of DEFN, recursively.
     216             : Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
     217             : FUNCTION is used to produce code for SYMBOL.  MIN-ARGS and MAX-ARGS
     218             : are the minimum and maximum number of arguments the function-form
     219             : sexp constituent SYMBOL may have in sexp regular expressions.
     220             : MAX-ARGS nil means no limit.  PREDICATE, if specified, means that
     221             : all arguments must satisfy PREDICATE.")
     222             : 
     223             : 
     224             : (defconst rx-syntax
     225             :   '((whitespace         . ?-)
     226             :     (punctuation        . ?.)
     227             :     (word               . ?w)
     228             :     (symbol             . ?_)
     229             :     (open-parenthesis   . ?\()
     230             :     (close-parenthesis  . ?\))
     231             :     (expression-prefix  . ?\')
     232             :     (string-quote       . ?\")
     233             :     (paired-delimiter   . ?$)
     234             :     (escape             . ?\\)
     235             :     (character-quote    . ?/)
     236             :     (comment-start      . ?<)
     237             :     (comment-end        . ?>)
     238             :     (string-delimiter   . ?|)
     239             :     (comment-delimiter  . ?!))
     240             :   "Alist mapping Rx syntax symbols to syntax characters.
     241             : Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
     242             : symbol in `(syntax SYMBOL)', and CHAR is the syntax character
     243             : corresponding to SYMBOL, as it would be used with \\s or \\S in
     244             : regular expressions.")
     245             : 
     246             : 
     247             : (defconst rx-categories
     248             :   '((consonant                  . ?0)
     249             :     (base-vowel                 . ?1)
     250             :     (upper-diacritical-mark     . ?2)
     251             :     (lower-diacritical-mark     . ?3)
     252             :     (tone-mark                  . ?4)
     253             :     (symbol                     . ?5)
     254             :     (digit                      . ?6)
     255             :     (vowel-modifying-diacritical-mark . ?7)
     256             :     (vowel-sign                 . ?8)
     257             :     (semivowel-lower            . ?9)
     258             :     (not-at-end-of-line         . ?<)
     259             :     (not-at-beginning-of-line   . ?>)
     260             :     (alpha-numeric-two-byte     . ?A)
     261             :     (chinese-two-byte           . ?C)
     262             :     (chinse-two-byte            . ?C) ;; A typo in Emacs 21.1-24.3.
     263             :     (greek-two-byte             . ?G)
     264             :     (japanese-hiragana-two-byte . ?H)
     265             :     (indian-two-byte            . ?I)
     266             :     (japanese-katakana-two-byte . ?K)
     267             :     (korean-hangul-two-byte     . ?N)
     268             :     (cyrillic-two-byte          . ?Y)
     269             :     (combining-diacritic        . ?^)
     270             :     (ascii                      . ?a)
     271             :     (arabic                     . ?b)
     272             :     (chinese                    . ?c)
     273             :     (ethiopic                   . ?e)
     274             :     (greek                      . ?g)
     275             :     (korean                     . ?h)
     276             :     (indian                     . ?i)
     277             :     (japanese                   . ?j)
     278             :     (japanese-katakana          . ?k)
     279             :     (latin                      . ?l)
     280             :     (lao                        . ?o)
     281             :     (tibetan                    . ?q)
     282             :     (japanese-roman             . ?r)
     283             :     (thai                       . ?t)
     284             :     (vietnamese                 . ?v)
     285             :     (hebrew                     . ?w)
     286             :     (cyrillic                   . ?y)
     287             :     (can-break                  . ?|))
     288             :   "Alist mapping symbols to category characters.
     289             : Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
     290             : symbol in `(category SYMBOL)', and CHAR is the category character
     291             : corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
     292             : regular expression strings.")
     293             : 
     294             : 
     295             : (defvar rx-greedy-flag t
     296             :   "Non-nil means produce greedy regular expressions for `zero-or-one',
     297             : `zero-or-more', and `one-or-more'.  Dynamically bound.")
     298             : 
     299             : 
     300             : (defun rx-info (op head)
     301             :   "Return parsing/code generation info for OP.
     302             : If OP is the space character ASCII 32, return info for the symbol `?'.
     303             : If OP is the character `?', return info for the symbol `??'.
     304             : See also `rx-constituents'.
     305             : If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
     306             : a standalone symbol."
     307           8 :   (cond ((eq op ? ) (setq op '\?))
     308           8 :         ((eq op ??) (setq op '\??)))
     309           8 :   (let (old-op)
     310          20 :     (while (and (not (null op)) (symbolp op))
     311          12 :       (setq old-op op)
     312          12 :       (setq op (cdr (assq op rx-constituents)))
     313          12 :       (when (if head (stringp op) (consp op))
     314             :         ;; We found something but of the wrong kind.  Let's look for an
     315             :         ;; alternate definition for the other case.
     316           0 :         (let ((new-op
     317           0 :                (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
     318           0 :                                             rx-constituents))))))
     319           0 :           (if (and new-op (not (if head (stringp new-op) (consp new-op))))
     320          12 :               (setq op new-op))))))
     321           8 :   op)
     322             : 
     323             : 
     324             : (defun rx-check (form)
     325             :   "Check FORM according to its car's parsing info."
     326           4 :   (unless (listp form)
     327           4 :     (error "rx `%s' needs argument(s)" form))
     328           4 :   (let* ((rx (rx-info (car form) 'head))
     329           4 :          (nargs (1- (length form)))
     330           4 :          (min-args (nth 1 rx))
     331           4 :          (max-args (nth 2 rx))
     332           4 :          (type-pred (nth 3 rx)))
     333           4 :     (when (and (not (null min-args))
     334           4 :                (< nargs min-args))
     335           0 :       (error "rx form `%s' requires at least %d args"
     336           4 :              (car form) min-args))
     337           4 :     (when (and (not (null max-args))
     338           4 :                (> nargs max-args))
     339           0 :       (error "rx form `%s' accepts at most %d args"
     340           4 :              (car form) max-args))
     341           4 :     (when (not (null type-pred))
     342           2 :       (dolist (sub-form (cdr form))
     343           2 :         (unless (funcall type-pred sub-form)
     344           0 :           (error "rx form `%s' requires args satisfying `%s'"
     345           4 :                  (car form) type-pred))))))
     346             : 
     347             : 
     348             : (defun rx-group-if (regexp group)
     349             :   "Put shy groups around REGEXP if seemingly necessary when GROUP
     350             : is non-nil."
     351           5 :   (cond
     352             :    ;; for some repetition
     353           5 :    ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
     354             :    ;; for concatenation
     355           5 :    ((eq group ':)
     356           0 :     (if (rx-atomic-p
     357           0 :          (if (string-match
     358           0 :               "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
     359           0 :              (substring regexp 0 (match-beginning 0))
     360           0 :            regexp))
     361           0 :         (setq group nil)))
     362             :    ;; for OR
     363           5 :    ((eq group '|) (setq group nil))
     364             :    ;; do anyway
     365           5 :    ((eq group t))
     366           5 :    ((rx-atomic-p regexp t) (setq group nil)))
     367           5 :   (if group
     368           0 :       (concat "\\(?:" regexp "\\)")
     369           5 :     regexp))
     370             : 
     371             : 
     372             : (defvar rx-parent)
     373             : ;; dynamically bound in some functions.
     374             : 
     375             : 
     376             : (defun rx-and (form)
     377             :   "Parse and produce code from FORM.
     378             : FORM is of the form `(and FORM1 ...)'."
     379           1 :   (rx-check form)
     380           1 :   (rx-group-if
     381           3 :    (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
     382           1 :    (and (memq rx-parent '(* t)) rx-parent)))
     383             : 
     384             : 
     385             : (defun rx-or (form)
     386             :   "Parse and produce code from FORM, which is `(or FORM1 ...)'."
     387           1 :   (rx-check form)
     388           1 :   (rx-group-if
     389           1 :    (if (memq nil (mapcar 'stringp (cdr form)))
     390           3 :        (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
     391           1 :      (regexp-opt (cdr form)))
     392           1 :    (and (memq rx-parent '(: * t)) rx-parent)))
     393             : 
     394             : 
     395             : (defun rx-anything (form)
     396             :   "Match any character."
     397           0 :   (if (consp form)
     398           0 :       (error "rx `anything' syntax error: %s" form))
     399           0 :   (rx-or (list 'or 'not-newline ?\n)))
     400             : 
     401             : 
     402             : (defun rx-any-delete-from-range (char ranges)
     403             :   "Delete by side effect character CHAR from RANGES.
     404             : Only both edges of each range is checked."
     405           0 :   (let (m)
     406           0 :     (cond
     407           0 :      ((memq char ranges) (setq ranges (delq char ranges)))
     408           0 :      ((setq m (assq char ranges))
     409           0 :       (if (eq (1+ char) (cdr m))
     410           0 :           (setcar (memq m ranges) (1+ char))
     411           0 :         (setcar m (1+ char))))
     412           0 :      ((setq m (rassq char ranges))
     413           0 :       (if (eq (1- char) (car m))
     414           0 :           (setcar (memq m ranges) (1- char))
     415           0 :         (setcdr m (1- char)))))
     416           0 :     ranges))
     417             : 
     418             : 
     419             : (defun rx-any-condense-range (args)
     420             :   "Condense by side effect ARGS as range for Rx `any'."
     421           1 :   (let (str
     422             :         l)
     423             :     ;; set STR list of all strings
     424             :     ;; set L list of all ranges
     425           2 :     (mapc (lambda (e) (cond ((stringp e) (push e str))
     426           2 :                             ((numberp e) (push (cons e e) l))
     427           1 :                             (t (push e l))))
     428           1 :           args)
     429             :     ;; condense overlapped ranges in L
     430           1 :     (let ((tail (setq l (sort l #'car-less-than-car)))
     431             :           d)
     432           1 :       (while (setq d (cdr tail))
     433           0 :         (if (>= (cdar tail) (1- (caar d)))
     434           0 :             (progn
     435           0 :               (setcdr (car tail) (max (cdar tail) (cdar d)))
     436           0 :               (setcdr tail (cdr d)))
     437           1 :           (setq tail d))))
     438             :     ;; Separate small ranges to single number, and delete dups.
     439           1 :     (nconc
     440           1 :      (apply #'nconc
     441           1 :             (mapcar (lambda (e)
     442           1 :                       (cond
     443           1 :                        ((= (car e) (cdr e)) (list (car e)))
     444           0 :                        ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
     445           1 :                        ((list e))))
     446           1 :                     l))
     447           1 :      (delete-dups str))))
     448             : 
     449             : 
     450             : (defun rx-check-any-string (str)
     451             :   "Check string argument STR for Rx `any'."
     452           2 :   (let ((i 0)
     453             :         c1 c2 l)
     454           2 :     (if (= 0 (length str))
     455           2 :         (error "String arg for Rx `any' must not be empty"))
     456           2 :     (while (string-match ".-." str i)
     457             :       ;; string before range: convert it to characters
     458           0 :       (if (< i (match-beginning 0))
     459           0 :           (setq l (nconc
     460           0 :                    l
     461           0 :                    (append (substring str i (match-beginning 0)) nil))))
     462             :       ;; range
     463           0 :       (setq i (match-end 0)
     464           0 :             c1 (aref str (match-beginning 0))
     465           0 :             c2 (aref str (1- i)))
     466           0 :       (cond
     467           0 :        ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
     468           2 :        ((= c1 c2) (setq l (nconc l (list c1))))))
     469             :     ;; rest?
     470           2 :     (if (< i (length str))
     471           2 :         (setq l (nconc l (append (substring str i) nil))))
     472           2 :     l))
     473             : 
     474             : 
     475             : (defun rx-check-any (arg)
     476             :    "Check arg ARG for Rx `any'."
     477           2 :    (cond
     478           2 :     ((integerp arg) (list arg))
     479           2 :     ((symbolp arg)
     480           0 :      (let ((translation (condition-case nil
     481           0 :                             (rx-form arg)
     482           0 :                           (error nil))))
     483           0 :        (if (or (null translation)
     484           0 :                (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
     485           0 :            (error "Invalid char class `%s' in Rx `any'" arg))
     486           0 :        (list (substring translation 1 -1)))) ; strip outer brackets
     487           2 :     ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
     488           0 :      (list arg))
     489           2 :     ((stringp arg) (rx-check-any-string arg))
     490           0 :     ((error
     491           2 :       "rx `any' requires string, character, char pair or char class args"))))
     492             : 
     493             : 
     494             : (defun rx-any (form)
     495             :   "Parse and produce code from FORM, which is `(any ARG ...)'.
     496             : ARG is optional."
     497           1 :   (rx-check form)
     498           1 :   (let* ((args (rx-any-condense-range
     499           1 :                 (apply
     500           1 :                  #'nconc
     501           1 :                  (mapcar #'rx-check-any (cdr form)))))
     502             :          m
     503             :          s)
     504           1 :     (cond
     505             :      ;; single close bracket
     506             :      ;;  => "[]...-]" or "[]...--.]"
     507           1 :      ((memq ?\] args)
     508             :       ;; set ] at the beginning
     509           0 :       (setq args (cons ?\] (delq ?\] args)))
     510             :       ;; set - at the end
     511           0 :       (if (or (memq ?- args) (assq ?- args))
     512           0 :           (setq args (nconc (rx-any-delete-from-range ?- args)
     513           0 :                             (list ?-)))))
     514             :      ;; close bracket starts a range
     515             :      ;;  => "[]-....-]" or "[]-.--....]"
     516           1 :      ((setq m (assq ?\] args))
     517             :       ;; bring it to the beginning
     518           0 :       (setq args (cons m (delq m args)))
     519           0 :       (cond ((memq ?- args)
     520             :              ;; to the end
     521           0 :              (setq args (nconc (delq ?- args) (list ?-))))
     522           0 :             ((setq m (assq ?- args))
     523             :              ;; next to the bracket's range, make the second range
     524           0 :              (setcdr args (cons m (delq m (cdr args)))))))
     525             :      ;; bracket in the end range
     526             :      ;;  => "[]...-]"
     527           1 :      ((setq m (rassq ?\] args))
     528             :       ;; set ] at the beginning
     529           0 :       (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
     530             :       ;; set - at the end
     531           0 :       (if (or (memq ?- args) (assq ?- args))
     532           0 :           (setq args (nconc (rx-any-delete-from-range ?- args)
     533           0 :                             (list ?-)))))
     534             :      ;; {no close bracket appears}
     535             :      ;;
     536             :      ;; bring single bar to the beginning
     537           1 :      ((memq ?- args)
     538           0 :       (setq args (cons ?- (delq ?- args))))
     539             :      ;; bar start a range, bring it to the beginning
     540           1 :      ((setq m (assq ?- args))
     541           0 :       (setq args (cons m (delq m args))))
     542             :      ;;
     543             :      ;; hat at the beginning?
     544           1 :      ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
     545           0 :       (setq args (if (cdr args)
     546           0 :                      `(,(cadr args) ,(car args) ,@(cddr args))
     547           0 :                    (nconc (rx-any-delete-from-range ?^ args)
     548           1 :                           (list ?^))))))
     549             :     ;; some 1-char?
     550           1 :     (if (and (null (cdr args)) (numberp (car args))
     551           1 :              (or (= 1 (length
     552           1 :                        (setq s (regexp-quote (string (car args))))))
     553           0 :                  (and (equal (car args) ?^) ;; unnecessary predicate?
     554           1 :                       (null (eq rx-parent '!)))))
     555           1 :         s
     556           0 :       (concat "["
     557           0 :               (mapconcat
     558           0 :                (lambda (e) (cond
     559           0 :                             ((numberp e) (string e))
     560           0 :                             ((consp e)
     561           0 :                              (if (and (= (1+ (car e)) (cdr e))
     562             :                                       ;; rx-any-condense-range should
     563             :                                       ;; prevent this case from happening.
     564           0 :                                       (null (memq (car e) '(?\] ?-)))
     565           0 :                                       (null (memq (cdr e) '(?\] ?-))))
     566           0 :                                  (string (car e) (cdr e))
     567           0 :                                (string (car e) ?- (cdr e))))
     568           0 :                             (e)))
     569           0 :                args
     570           0 :                nil)
     571           1 :               "]"))))
     572             : 
     573             : 
     574             : (defun rx-check-not (arg)
     575             :   "Check arg ARG for Rx `not'."
     576           1 :   (unless (or (and (symbolp arg)
     577           0 :                    (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
     578           0 :                                  (condition-case nil
     579           0 :                                      (rx-form arg)
     580           1 :                                    (error ""))))
     581           1 :               (eq arg 'word-boundary)
     582           1 :               (and (consp arg)
     583           1 :                    (memq (car arg) '(not any in syntax category))))
     584           1 :     (error "rx `not' syntax error: %s" arg))
     585             :   t)
     586             : 
     587             : 
     588             : (defun rx-not (form)
     589             :   "Parse and produce code from FORM.  FORM is `(not ...)'."
     590           1 :   (rx-check form)
     591           1 :   (let ((result (rx-form (cadr form) '!))
     592             :         case-fold-search)
     593           1 :     (cond ((string-match "\\`\\[^" result)
     594           0 :            (cond
     595           0 :             ((equal result "[^]") "[^^]")
     596           0 :             ((and (= (length result) 4) (null (eq rx-parent '!)))
     597           0 :              (regexp-quote (substring result 2 3)))
     598           0 :             ((concat "[" (substring result 2)))))
     599           1 :           ((eq ?\[ (aref result 0))
     600           0 :            (concat "[^" (substring result 1)))
     601           1 :           ((string-match "\\`\\\\[scbw]" result)
     602           0 :            (concat (upcase (substring result 0 2))
     603           0 :                    (substring result 2)))
     604           1 :           ((string-match "\\`\\\\[SCBW]" result)
     605           0 :            (concat (downcase (substring result 0 2))
     606           0 :                    (substring result 2)))
     607             :           (t
     608           1 :            (concat "[^" result "]")))))
     609             : 
     610             : 
     611             : (defun rx-not-char (form)
     612             :   "Parse and produce code from FORM.  FORM is `(not-char ...)'."
     613           0 :   (rx-check form)
     614           0 :   (rx-not `(not (in ,@(cdr form)))))
     615             : 
     616             : 
     617             : (defun rx-not-syntax (form)
     618             :   "Parse and produce code from FORM.  FORM is `(not-syntax SYNTAX)'."
     619           0 :   (rx-check form)
     620           0 :   (rx-not `(not (syntax ,@(cdr form)))))
     621             : 
     622             : 
     623             : (defun rx-trans-forms (form &optional skip)
     624             :   "If FORM's length is greater than two, transform it to length two.
     625             : A form (HEAD REST ...) becomes (HEAD (and REST ...)).
     626             : If SKIP is non-nil, allow that number of items after the head, i.e.
     627             : `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
     628           0 :   (unless skip (setq skip 0))
     629           0 :   (let ((tail (nthcdr (1+ skip) form)))
     630           0 :     (if (= (length tail) 1)
     631           0 :         form
     632           0 :       (let ((form (copy-sequence form)))
     633           0 :         (setcdr (nthcdr skip form) (list (cons 'and tail)))
     634           0 :         form))))
     635             : 
     636             : 
     637             : (defun rx-= (form)
     638             :   "Parse and produce code from FORM `(= N ...)'."
     639           0 :   (rx-check form)
     640           0 :   (setq form (rx-trans-forms form 1))
     641           0 :   (unless (and (integerp (nth 1 form))
     642           0 :                (> (nth 1 form) 0))
     643           0 :     (error "rx `=' requires positive integer first arg"))
     644           0 :   (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
     645             : 
     646             : 
     647             : (defun rx->= (form)
     648             :   "Parse and produce code from FORM `(>= N ...)'."
     649           0 :   (rx-check form)
     650           0 :   (setq form (rx-trans-forms form 1))
     651           0 :   (unless (and (integerp (nth 1 form))
     652           0 :                (> (nth 1 form) 0))
     653           0 :     (error "rx `>=' requires positive integer first arg"))
     654           0 :   (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
     655             : 
     656             : 
     657             : (defun rx-** (form)
     658             :   "Parse and produce code from FORM `(** N M ...)'."
     659           0 :   (rx-check form)
     660           0 :   (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
     661             : 
     662             : 
     663             : (defun rx-repeat (form)
     664             :   "Parse and produce code from FORM.
     665             : FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
     666           0 :   (rx-check form)
     667           0 :   (if (> (length form) 4)
     668           0 :       (setq form (rx-trans-forms form 2)))
     669           0 :   (if (null (nth 2 form))
     670           0 :       (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
     671           0 :   (cond ((= (length form) 3)
     672           0 :          (unless (and (integerp (nth 1 form))
     673           0 :                       (> (nth 1 form) 0))
     674           0 :            (error "rx `repeat' requires positive integer first arg"))
     675           0 :          (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
     676           0 :         ((or (not (integerp (nth 2 form)))
     677           0 :              (< (nth 2 form) 0)
     678           0 :              (not (integerp (nth 1 form)))
     679           0 :              (< (nth 1 form) 0)
     680           0 :              (< (nth 2 form) (nth 1 form)))
     681           0 :          (error "rx `repeat' range error"))
     682             :         (t
     683           0 :          (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
     684           0 :                  (nth 1 form) (nth 2 form)))))
     685             : 
     686             : 
     687             : (defun rx-submatch (form)
     688             :   "Parse and produce code from FORM, which is `(submatch ...)'."
     689           0 :   (concat "\\("
     690           0 :           (if (= 2 (length form))
     691             :               ;; Only one sub-form.
     692           0 :               (rx-form (cadr form))
     693             :             ;; Several sub-forms implicitly concatenated.
     694           0 :             (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
     695           0 :           "\\)"))
     696             : 
     697             : (defun rx-submatch-n (form)
     698             :   "Parse and produce code from FORM, which is `(submatch-n N ...)'."
     699           0 :   (let ((n (nth 1 form)))
     700           0 :     (concat "\\(?" (number-to-string n) ":"
     701           0 :             (if (= 3 (length form))
     702             :                 ;; Only one sub-form.
     703           0 :                 (rx-form (nth 2 form))
     704             :               ;; Several sub-forms implicitly concatenated.
     705           0 :               (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
     706           0 :             "\\)")))
     707             : 
     708             : (defun rx-backref (form)
     709             :   "Parse and produce code from FORM, which is `(backref N)'."
     710           0 :   (rx-check form)
     711           0 :   (format "\\%d" (nth 1 form)))
     712             : 
     713             : (defun rx-check-backref (arg)
     714             :   "Check arg ARG for Rx `backref'."
     715           0 :   (or (and (integerp arg) (>= arg 1) (<= arg 9))
     716           0 :       (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
     717             : 
     718             : (defun rx-kleene (form)
     719             :   "Parse and produce code from FORM.
     720             : FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
     721             : `zero-or-more' etc.  operators.
     722             : If OP is one of `*', `+', `?', produce a greedy regexp.
     723             : If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
     724             : If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
     725             : is non-nil."
     726           0 :   (rx-check form)
     727           0 :   (setq form (rx-trans-forms form))
     728           0 :   (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
     729           0 :                       ((memq (car form) '(*? +? ??)) "?")
     730           0 :                       (rx-greedy-flag "")
     731           0 :                       (t "?")))
     732           0 :         (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
     733           0 :                   ((memq (car form) '(+ +? 1+ one-or-more))  "+")
     734           0 :                   (t "?"))))
     735           0 :     (rx-group-if
     736           0 :      (concat (rx-form (cadr form) '*) op suffix)
     737           0 :      (and (memq rx-parent '(t *)) rx-parent))))
     738             : 
     739             : 
     740             : (defun rx-atomic-p (r &optional lax)
     741             :   "Return non-nil if regexp string R is atomic.
     742             : An atomic regexp R is one such that a suffix operator
     743             : appended to R will apply to all of R.  For example, \"a\"
     744             : \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
     745             : \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
     746             : 
     747             : This function may return false negatives, but it will not
     748             : return false positives.  It is nevertheless useful in
     749             : situations where an efficiency shortcut can be taken only if a
     750             : regexp is atomic.  The function can be improved to detect
     751             : more cases of atomic regexps.  Presently, this function
     752             : detects the following categories of atomic regexp;
     753             : 
     754             :   a group or shy group:  \\(...\\)
     755             :   a character class:     [...]
     756             :   a single character:    a
     757             : 
     758             : On the other hand, false negatives will be returned for
     759             : regexps that are atomic but end in operators, such as
     760             : \"a+\".  I think these are rare.  Probably such cases could
     761             : be detected without much effort.  A guarantee of no false
     762             : negatives would require a theoretic specification of the set
     763             : of all atomic regexps."
     764           5 :   (let ((l (length r)))
     765           5 :     (cond
     766           5 :      ((<= l 1))
     767           5 :      ((= l 2) (= (aref r 0) ?\\))
     768           5 :      ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
     769           4 :      ((null lax)
     770           0 :       (cond
     771           0 :        ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r))
     772           5 :        ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
     773             : 
     774             : 
     775             : (defun rx-syntax (form)
     776             :   "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
     777           0 :   (rx-check form)
     778           0 :   (let* ((sym (cadr form))
     779           0 :          (syntax (cdr (assq sym rx-syntax))))
     780           0 :     (unless syntax
     781             :       ;; Try sregex compatibility.
     782           0 :       (cond
     783           0 :        ((characterp sym) (setq syntax sym))
     784           0 :        ((symbolp sym)
     785           0 :         (let ((name (symbol-name sym)))
     786           0 :           (if (= 1 (length name))
     787           0 :               (setq syntax (aref name 0))))))
     788           0 :       (unless syntax
     789           0 :         (error "Unknown rx syntax `%s'" sym)))
     790           0 :     (format "\\s%c" syntax)))
     791             : 
     792             : 
     793             : (defun rx-check-category (form)
     794             :   "Check the argument FORM of a `(category FORM)'."
     795           0 :   (unless (or (integerp form)
     796           0 :               (cdr (assq form rx-categories)))
     797           0 :     (error "Unknown category `%s'" form))
     798             :   t)
     799             : 
     800             : 
     801             : (defun rx-category (form)
     802             :   "Parse and produce code from FORM, which is `(category SYMBOL)'."
     803           0 :   (rx-check form)
     804           0 :   (let ((char (if (integerp (cadr form))
     805           0 :                   (cadr form)
     806           0 :                 (cdr (assq (cadr form) rx-categories)))))
     807           0 :     (format "\\c%c" char)))
     808             : 
     809             : 
     810             : (defun rx-eval (form)
     811             :   "Parse and produce code from FORM, which is `(eval FORM)'."
     812           0 :   (rx-check form)
     813           0 :   (rx-form (eval (cadr form)) rx-parent))
     814             : 
     815             : 
     816             : (defun rx-greedy (form)
     817             :   "Parse and produce code from FORM.
     818             : If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
     819             : `+', and `?' operators will be used in FORM1.  If FORM is
     820             : `(maximal-match FORM1)', greedy operators will be used."
     821           0 :   (rx-check form)
     822           0 :   (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
     823           0 :     (rx-form (cadr form) rx-parent)))
     824             : 
     825             : 
     826             : (defun rx-regexp (form)
     827             :   "Parse and produce code from FORM, which is `(regexp STRING)'."
     828           0 :   (rx-check form)
     829           0 :   (rx-group-if (cadr form) rx-parent))
     830             : 
     831             : 
     832             : (defun rx-form (form &optional rx-parent)
     833             :   "Parse and produce code for regular expression FORM.
     834             : FORM is a regular expression in sexp form.
     835             : RX-PARENT shows which type of expression calls and controls putting of
     836             : shy groups around the result and some more in other functions."
     837           6 :   (cond
     838           6 :    ((stringp form)
     839           2 :     (rx-group-if (regexp-quote form)
     840           2 :                  (if (and (eq rx-parent '*) (< 1 (length form)))
     841           2 :                      rx-parent)))
     842           4 :    ((integerp form)
     843           0 :     (regexp-quote (char-to-string form)))
     844           4 :    ((symbolp form)
     845           0 :     (let ((info (rx-info form nil)))
     846           0 :       (cond ((stringp info)
     847           0 :              info)
     848           0 :             ((null info)
     849           0 :              (error "Unknown rx form `%s'" form))
     850             :             (t
     851           0 :              (funcall (nth 0 info) form)))))
     852           4 :    ((consp form)
     853           4 :     (let ((info (rx-info (car form) 'head)))
     854           4 :       (unless (consp info)
     855           4 :         (error "Unknown rx form `%s'" (car form)))
     856           4 :       (funcall (nth 0 info) form)))
     857             :    (t
     858           6 :     (error "rx syntax error at `%s'" form))))
     859             : 
     860             : 
     861             : ;;;###autoload
     862             : (defun rx-to-string (form &optional no-group)
     863             :   "Parse and produce code for regular expression FORM.
     864             : FORM is a regular expression in sexp form.
     865             : NO-GROUP non-nil means don't put shy groups around the result."
     866           1 :   (rx-group-if (rx-form form) (null no-group)))
     867             : 
     868             : 
     869             : ;;;###autoload
     870             : (defmacro rx (&rest regexps)
     871             :   "Translate regular expressions REGEXPS in sexp form to a regexp string.
     872             : REGEXPS is a non-empty sequence of forms of the sort listed below.
     873             : 
     874             : Note that `rx' is a Lisp macro; when used in a Lisp program being
     875             : compiled, the translation is performed by the compiler.
     876             : See `rx-to-string' for how to do such a translation at run-time.
     877             : 
     878             : The following are valid subforms of regular expressions in sexp
     879             : notation.
     880             : 
     881             : STRING
     882             :      matches string STRING literally.
     883             : 
     884             : CHAR
     885             :      matches character CHAR literally.
     886             : 
     887             : `not-newline', `nonl'
     888             :      matches any character except a newline.
     889             : 
     890             : `anything'
     891             :      matches any character
     892             : 
     893             : `(any SET ...)'
     894             : `(in SET ...)'
     895             : `(char SET ...)'
     896             :      matches any character in SET ....  SET may be a character or string.
     897             :      Ranges of characters can be specified as `A-Z' in strings.
     898             :      Ranges may also be specified as conses like `(?A . ?Z)'.
     899             : 
     900             :      SET may also be the name of a character class: `digit',
     901             :      `control', `hex-digit', `blank', `graph', `print', `alnum',
     902             :      `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
     903             :      `word', or one of their synonyms.
     904             : 
     905             : `(not (any SET ...))'
     906             :      matches any character not in SET ...
     907             : 
     908             : `line-start', `bol'
     909             :      matches the empty string, but only at the beginning of a line
     910             :      in the text being matched
     911             : 
     912             : `line-end', `eol'
     913             :      is similar to `line-start' but matches only at the end of a line
     914             : 
     915             : `string-start', `bos', `bot'
     916             :      matches the empty string, but only at the beginning of the
     917             :      string being matched against.
     918             : 
     919             : `string-end', `eos', `eot'
     920             :      matches the empty string, but only at the end of the
     921             :      string being matched against.
     922             : 
     923             : `buffer-start'
     924             :      matches the empty string, but only at the beginning of the
     925             :      buffer being matched against.  Actually equivalent to `string-start'.
     926             : 
     927             : `buffer-end'
     928             :      matches the empty string, but only at the end of the
     929             :      buffer being matched against.  Actually equivalent to `string-end'.
     930             : 
     931             : `point'
     932             :      matches the empty string, but only at point.
     933             : 
     934             : `word-start', `bow'
     935             :      matches the empty string, but only at the beginning of a word.
     936             : 
     937             : `word-end', `eow'
     938             :      matches the empty string, but only at the end of a word.
     939             : 
     940             : `word-boundary'
     941             :      matches the empty string, but only at the beginning or end of a
     942             :      word.
     943             : 
     944             : `(not word-boundary)'
     945             : `not-word-boundary'
     946             :      matches the empty string, but not at the beginning or end of a
     947             :      word.
     948             : 
     949             : `symbol-start'
     950             :      matches the empty string, but only at the beginning of a symbol.
     951             : 
     952             : `symbol-end'
     953             :      matches the empty string, but only at the end of a symbol.
     954             : 
     955             : `digit', `numeric', `num'
     956             :      matches 0 through 9.
     957             : 
     958             : `control', `cntrl'
     959             :      matches ASCII control characters.
     960             : 
     961             : `hex-digit', `hex', `xdigit'
     962             :      matches 0 through 9, a through f and A through F.
     963             : 
     964             : `blank'
     965             :      matches space and tab only.
     966             : 
     967             : `graphic', `graph'
     968             :      matches graphic characters--everything except whitespace, ASCII
     969             :      and non-ASCII control characters, surrogates, and codepoints
     970             :      unassigned by Unicode.
     971             : 
     972             : `printing', `print'
     973             :      matches whitespace and graphic characters.
     974             : 
     975             : `alphanumeric', `alnum'
     976             :      matches alphabetic characters and digits.  (For multibyte characters,
     977             :      it matches according to Unicode character properties.)
     978             : 
     979             : `letter', `alphabetic', `alpha'
     980             :      matches alphabetic characters.  (For multibyte characters,
     981             :      it matches according to Unicode character properties.)
     982             : 
     983             : `ascii'
     984             :      matches ASCII (unibyte) characters.
     985             : 
     986             : `nonascii'
     987             :      matches non-ASCII (multibyte) characters.
     988             : 
     989             : `lower', `lower-case'
     990             :      matches anything lower-case.
     991             : 
     992             : `upper', `upper-case'
     993             :      matches anything upper-case.
     994             : 
     995             : `punctuation', `punct'
     996             :      matches punctuation.  (But at present, for multibyte characters,
     997             :      it matches anything that has non-word syntax.)
     998             : 
     999             : `space', `whitespace', `white'
    1000             :      matches anything that has whitespace syntax.
    1001             : 
    1002             : `word', `wordchar'
    1003             :      matches anything that has word syntax.
    1004             : 
    1005             : `not-wordchar'
    1006             :      matches anything that has non-word syntax.
    1007             : 
    1008             : `(syntax SYNTAX)'
    1009             :      matches a character with syntax SYNTAX.  SYNTAX must be one
    1010             :      of the following symbols, or a symbol corresponding to the syntax
    1011             :      character, e.g. `\\.' for `\\s.'.
    1012             : 
    1013             :      `whitespace'               (\\s- in string notation)
    1014             :      `punctuation'              (\\s.)
    1015             :      `word'                     (\\sw)
    1016             :      `symbol'                   (\\s_)
    1017             :      `open-parenthesis'         (\\s()
    1018             :      `close-parenthesis'        (\\s))
    1019             :      `expression-prefix'        (\\s')
    1020             :      `string-quote'             (\\s\")
    1021             :      `paired-delimiter'         (\\s$)
    1022             :      `escape'                   (\\s\\)
    1023             :      `character-quote'          (\\s/)
    1024             :      `comment-start'            (\\s<)
    1025             :      `comment-end'              (\\s>)
    1026             :      `string-delimiter'         (\\s|)
    1027             :      `comment-delimiter'        (\\s!)
    1028             : 
    1029             : `(not (syntax SYNTAX))'
    1030             :      matches a character that doesn't have syntax SYNTAX.
    1031             : 
    1032             : `(category CATEGORY)'
    1033             :      matches a character with category CATEGORY.  CATEGORY must be
    1034             :      either a character to use for C, or one of the following symbols.
    1035             : 
    1036             :      `consonant'                        (\\c0 in string notation)
    1037             :      `base-vowel'                       (\\c1)
    1038             :      `upper-diacritical-mark'           (\\c2)
    1039             :      `lower-diacritical-mark'           (\\c3)
    1040             :      `tone-mark'                        (\\c4)
    1041             :      `symbol'                           (\\c5)
    1042             :      `digit'                            (\\c6)
    1043             :      `vowel-modifying-diacritical-mark' (\\c7)
    1044             :      `vowel-sign'                       (\\c8)
    1045             :      `semivowel-lower'                  (\\c9)
    1046             :      `not-at-end-of-line'               (\\c<)
    1047             :      `not-at-beginning-of-line'         (\\c>)
    1048             :      `alpha-numeric-two-byte'           (\\cA)
    1049             :      `chinese-two-byte'                 (\\cC)
    1050             :      `greek-two-byte'                   (\\cG)
    1051             :      `japanese-hiragana-two-byte'       (\\cH)
    1052             :      `indian-tow-byte'                  (\\cI)
    1053             :      `japanese-katakana-two-byte'       (\\cK)
    1054             :      `korean-hangul-two-byte'           (\\cN)
    1055             :      `cyrillic-two-byte'                (\\cY)
    1056             :      `combining-diacritic'              (\\c^)
    1057             :      `ascii'                            (\\ca)
    1058             :      `arabic'                           (\\cb)
    1059             :      `chinese'                          (\\cc)
    1060             :      `ethiopic'                         (\\ce)
    1061             :      `greek'                            (\\cg)
    1062             :      `korean'                           (\\ch)
    1063             :      `indian'                           (\\ci)
    1064             :      `japanese'                         (\\cj)
    1065             :      `japanese-katakana'                (\\ck)
    1066             :      `latin'                            (\\cl)
    1067             :      `lao'                              (\\co)
    1068             :      `tibetan'                          (\\cq)
    1069             :      `japanese-roman'                   (\\cr)
    1070             :      `thai'                             (\\ct)
    1071             :      `vietnamese'                       (\\cv)
    1072             :      `hebrew'                           (\\cw)
    1073             :      `cyrillic'                         (\\cy)
    1074             :      `can-break'                        (\\c|)
    1075             : 
    1076             : `(not (category CATEGORY))'
    1077             :      matches a character that doesn't have category CATEGORY.
    1078             : 
    1079             : `(and SEXP1 SEXP2 ...)'
    1080             : `(: SEXP1 SEXP2 ...)'
    1081             : `(seq SEXP1 SEXP2 ...)'
    1082             : `(sequence SEXP1 SEXP2 ...)'
    1083             :      matches what SEXP1 matches, followed by what SEXP2 matches, etc.
    1084             : 
    1085             : `(submatch SEXP1 SEXP2 ...)'
    1086             : `(group SEXP1 SEXP2 ...)'
    1087             :      like `and', but makes the match accessible with `match-end',
    1088             :      `match-beginning', and `match-string'.
    1089             : 
    1090             : `(submatch-n N SEXP1 SEXP2 ...)'
    1091             : `(group-n N SEXP1 SEXP2 ...)'
    1092             :      like `group', but make it an explicitly-numbered group with
    1093             :      group number N.
    1094             : 
    1095             : `(or SEXP1 SEXP2 ...)'
    1096             : `(| SEXP1 SEXP2 ...)'
    1097             :      matches anything that matches SEXP1 or SEXP2, etc.  If all
    1098             :      args are strings, use `regexp-opt' to optimize the resulting
    1099             :      regular expression.
    1100             : 
    1101             : `(minimal-match SEXP)'
    1102             :      produce a non-greedy regexp for SEXP.  Normally, regexps matching
    1103             :      zero or more occurrences of something are \"greedy\" in that they
    1104             :      match as much as they can, as long as the overall regexp can
    1105             :      still match.  A non-greedy regexp matches as little as possible.
    1106             : 
    1107             : `(maximal-match SEXP)'
    1108             :      produce a greedy regexp for SEXP.  This is the default.
    1109             : 
    1110             : Below, `SEXP ...' represents a sequence of regexp forms, treated as if
    1111             : enclosed in `(and ...)'.
    1112             : 
    1113             : `(zero-or-more SEXP ...)'
    1114             : `(0+ SEXP ...)'
    1115             :      matches zero or more occurrences of what SEXP ... matches.
    1116             : 
    1117             : `(* SEXP ...)'
    1118             :      like `zero-or-more', but always produces a greedy regexp, independent
    1119             :      of `rx-greedy-flag'.
    1120             : 
    1121             : `(*? SEXP ...)'
    1122             :      like `zero-or-more', but always produces a non-greedy regexp,
    1123             :      independent of `rx-greedy-flag'.
    1124             : 
    1125             : `(one-or-more SEXP ...)'
    1126             : `(1+ SEXP ...)'
    1127             :      matches one or more occurrences of SEXP ...
    1128             : 
    1129             : `(+ SEXP ...)'
    1130             :      like `one-or-more', but always produces a greedy regexp.
    1131             : 
    1132             : `(+? SEXP ...)'
    1133             :      like `one-or-more', but always produces a non-greedy regexp.
    1134             : 
    1135             : `(zero-or-one SEXP ...)'
    1136             : `(optional SEXP ...)'
    1137             : `(opt SEXP ...)'
    1138             :      matches zero or one occurrences of A.
    1139             : 
    1140             : `(? SEXP ...)'
    1141             :      like `zero-or-one', but always produces a greedy regexp.
    1142             : 
    1143             : `(?? SEXP ...)'
    1144             :      like `zero-or-one', but always produces a non-greedy regexp.
    1145             : 
    1146             : `(repeat N SEXP)'
    1147             : `(= N SEXP ...)'
    1148             :      matches N occurrences.
    1149             : 
    1150             : `(>= N SEXP ...)'
    1151             :      matches N or more occurrences.
    1152             : 
    1153             : `(repeat N M SEXP)'
    1154             : `(** N M SEXP ...)'
    1155             :      matches N to M occurrences.
    1156             : 
    1157             : `(backref N)'
    1158             :      matches what was matched previously by submatch N.
    1159             : 
    1160             : `(eval FORM)'
    1161             :      evaluate FORM and insert result.  If result is a string,
    1162             :      `regexp-quote' it.
    1163             : 
    1164             : `(regexp REGEXP)'
    1165             :      include REGEXP in string notation in the result."
    1166             :   (cond ((null regexps)
    1167             :          (error "No regexp"))
    1168             :         ((cdr regexps)
    1169             :          (rx-to-string `(and ,@regexps) t))
    1170             :         (t
    1171             :          (rx-to-string (car regexps) t))))
    1172             : 
    1173             : 
    1174             : (pcase-defmacro rx (&rest regexps)
    1175             :   "Build a `pcase' pattern matching `rx' regexps.
    1176             : The REGEXPS are interpreted as by `rx'.  The pattern matches if
    1177             : the regular expression so constructed matches the object, as if
    1178             : by `string-match'.
    1179             : 
    1180             : In addition to the usual `rx' constructs, REGEXPS can contain the
    1181             : following constructs:
    1182             : 
    1183             :   (let VAR FORM...)  creates a new explicitly numbered submatch
    1184             :                      that matches FORM and binds the match to
    1185             :                      VAR.
    1186             :   (backref VAR)      creates a backreference to the submatch
    1187             :                      introduced by a previous (let VAR ...)
    1188             :                      construct.
    1189             : 
    1190             : The VARs are associated with explicitly numbered submatches
    1191             : starting from 1.  Multiple occurrences of the same VAR refer to
    1192             : the same submatch.
    1193             : 
    1194             : If a case matches, the match data is modified as usual so you can
    1195             : use it in the case body, but you still have to pass the correct
    1196             : string as argument to `match-string'."
    1197           2 :   (let* ((vars ())
    1198             :          (rx-constituents
    1199           2 :           `((let
    1200             :              ,(lambda (form)
    1201           0 :                 (rx-check form)
    1202           0 :                 (let ((var (cadr form)))
    1203           0 :                   (cl-check-type var symbol)
    1204           0 :                   (let ((i (or (cl-position var vars :test #'eq)
    1205           0 :                                (prog1 (length vars)
    1206           0 :                                  (setq vars `(,@vars ,var))))))
    1207           0 :                     (rx-form `(submatch-n ,(1+ i) ,@(cddr form))))))
    1208             :              1 nil)
    1209             :             (backref
    1210             :              ,(lambda (form)
    1211           0 :                 (rx-check form)
    1212           0 :                 (rx-backref
    1213           0 :                  `(backref ,(let ((var (cadr form)))
    1214           0 :                               (if (integerp var) var
    1215           0 :                                 (1+ (cl-position var vars :test #'eq)))))))
    1216             :              1 1
    1217             :              ,(lambda (var)
    1218           0 :                 (cond ((integerp var) (rx-check-backref var))
    1219           0 :                       ((memq var vars) t)
    1220           0 :                       (t (error "rx `backref' variable must be one of %s: %s"
    1221           0 :                                 vars var)))))
    1222           2 :             ,@rx-constituents))
    1223           2 :          (regexp (rx-to-string `(seq ,@regexps) :no-group)))
    1224           0 :     `(and (pred (string-match ,regexp))
    1225           0 :           ,@(cl-loop for i from 1
    1226           0 :                      for var in vars
    1227           2 :                      collect `(app (match-string ,i) ,var)))))
    1228             : 
    1229             : ;; ;; sregex.el replacement
    1230             : 
    1231             : ;; ;;;###autoload (provide 'sregex)
    1232             : ;; ;;;###autoload (autoload 'sregex "rx")
    1233             : ;; (defalias 'sregex 'rx-to-string)
    1234             : ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
    1235             : ;; (defalias 'sregexq 'rx)
    1236             : 
    1237             : (provide 'rx)
    1238             : 
    1239             : ;;; rx.el ends here

Generated by: LCOV version 1.12