Line data Source code
1 : ;;; rx.el --- sexp notation for regular expressions
2 :
3 : ;; Copyright (C) 2001-2017 Free Software Foundation, Inc.
4 :
5 : ;; Author: Gerd Moellmann <gerd@gnu.org>
6 : ;; Maintainer: emacs-devel@gnu.org
7 : ;; Keywords: strings, regexps, extensions
8 :
9 : ;; This file is part of GNU Emacs.
10 :
11 : ;; GNU Emacs is free software: you can redistribute it and/or modify
12 : ;; it under the terms of the GNU General Public License as published by
13 : ;; the Free Software Foundation, either version 3 of the License, or
14 : ;; (at your option) any later version.
15 :
16 : ;; GNU Emacs is distributed in the hope that it will be useful,
17 : ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 : ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 : ;; GNU General Public License for more details.
20 :
21 : ;; You should have received a copy of the GNU General Public License
22 : ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
23 :
24 : ;;; Commentary:
25 :
26 : ;; This is another implementation of sexp-form regular expressions.
27 : ;; It was unfortunately written without being aware of the Sregex
28 : ;; package coming with Emacs, but as things stand, Rx completely
29 : ;; covers all regexp features, which Sregex doesn't, doesn't suffer
30 : ;; from the bugs mentioned in the commentary section of Sregex, and
31 : ;; uses a nicer syntax (IMHO, of course :-).
32 :
33 : ;; This significantly extended version of the original, is almost
34 : ;; compatible with Sregex. The only incompatibility I (fx) know of is
35 : ;; that the `repeat' form can't have multiple regexp args.
36 :
37 : ;; Now alternative forms are provided for a degree of compatibility
38 : ;; with Olin Shivers' attempted definitive SRE notation. SRE forms
39 : ;; not catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
40 : ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
41 : ;; Some forms are inconsistent with SRE, either for historical reasons
42 : ;; or because of the implementation -- simple translation into Emacs
43 : ;; regexp strings. These include: any, word. Also, case-sensitivity
44 : ;; and greediness are controlled by variables external to the regexp,
45 : ;; and you need to feed the forms to the `posix-' functions to get
46 : ;; SRE's POSIX semantics. There are probably more difficulties.
47 :
48 : ;; Rx translates a sexp notation for regular expressions into the
49 : ;; usual string notation. The translation can be done at compile-time
50 : ;; by using the `rx' macro. It can be done at run-time by calling
51 : ;; function `rx-to-string'. See the documentation of `rx' for a
52 : ;; complete description of the sexp notation.
53 : ;;
54 : ;; Some examples of string regexps and their sexp counterparts:
55 : ;;
56 : ;; "^[a-z]*"
57 : ;; (rx (and line-start (0+ (in "a-z"))))
58 : ;;
59 : ;; "\n[^ \t]"
60 : ;; (rx (and "\n" (not blank))), or
61 : ;; (rx (and "\n" (not (any " \t"))))
62 : ;;
63 : ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
64 : ;; (rx "*** EOOH ***\n")
65 : ;;
66 : ;; "\\<\\(catch\\|finally\\)\\>[^_]"
67 : ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
68 : ;; (not (any ?_))))
69 : ;;
70 : ;; "[ \t\n]*:\\([^:]+\\|$\\)"
71 : ;; (rx (and (zero-or-more (in " \t\n")) ":"
72 : ;; (submatch (or line-end (one-or-more (not (any ?:)))))))
73 : ;;
74 : ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
75 : ;; (rx (and line-start
76 : ;; "content-transfer-encoding:"
77 : ;; (+ (? ?\n)) blank
78 : ;; "quoted-printable"
79 : ;; (+ (? ?\n)) blank))
80 : ;;
81 : ;; (concat "^\\(?:" something-else "\\)")
82 : ;; (rx (and line-start (eval something-else))), statically or
83 : ;; (rx-to-string '(and line-start ,something-else)), dynamically.
84 : ;;
85 : ;; (regexp-opt '(STRING1 STRING2 ...))
86 : ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
87 : ;; calls `regexp-opt' as needed.
88 : ;;
89 : ;; "^;;\\s-*\n\\|^\n"
90 : ;; (rx (or (and line-start ";;" (0+ space) ?\n)
91 : ;; (and line-start ?\n)))
92 : ;;
93 : ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
94 : ;; (rx (and "$Id: "
95 : ;; (1+ (not (in " ")))
96 : ;; " "
97 : ;; (submatch (1+ (not (in " "))))
98 : ;; " "))
99 : ;;
100 : ;; "\\\\\\\\\\[\\w+"
101 : ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
102 : ;;
103 : ;; etc.
104 :
105 : ;;; History:
106 : ;;
107 :
108 : ;;; Code:
109 :
110 : ;; FIXME: support macros.
111 :
112 : (defvar rx-constituents ;Not `const' because some modes extend it.
113 : '((and . (rx-and 1 nil))
114 : (seq . and) ; SRE
115 : (: . and) ; SRE
116 : (sequence . and) ; sregex
117 : (or . (rx-or 1 nil))
118 : (| . or) ; SRE
119 : (not-newline . ".")
120 : (nonl . not-newline) ; SRE
121 : (anything . (rx-anything 0 nil))
122 : (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
123 : (any . ".") ; sregex
124 : (in . any)
125 : (char . any) ; sregex
126 : (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
127 : (not . (rx-not 1 1 rx-check-not))
128 : (repeat . (rx-repeat 2 nil))
129 : (= . (rx-= 2 nil)) ; SRE
130 : (>= . (rx->= 2 nil)) ; SRE
131 : (** . (rx-** 2 nil)) ; SRE
132 : (submatch . (rx-submatch 1 nil)) ; SRE
133 : (group . submatch) ; sregex
134 : (submatch-n . (rx-submatch-n 2 nil))
135 : (group-n . submatch-n)
136 : (zero-or-more . (rx-kleene 1 nil))
137 : (one-or-more . (rx-kleene 1 nil))
138 : (zero-or-one . (rx-kleene 1 nil))
139 : (\? . zero-or-one) ; SRE
140 : (\?? . zero-or-one)
141 : (* . zero-or-more) ; SRE
142 : (*? . zero-or-more)
143 : (0+ . zero-or-more)
144 : (+ . one-or-more) ; SRE
145 : (+? . one-or-more)
146 : (1+ . one-or-more)
147 : (optional . zero-or-one)
148 : (opt . zero-or-one) ; sregex
149 : (minimal-match . (rx-greedy 1 1))
150 : (maximal-match . (rx-greedy 1 1))
151 : (backref . (rx-backref 1 1 rx-check-backref))
152 : (line-start . "^")
153 : (bol . line-start) ; SRE
154 : (line-end . "$")
155 : (eol . line-end) ; SRE
156 : (string-start . "\\`")
157 : (bos . string-start) ; SRE
158 : (bot . string-start) ; sregex
159 : (string-end . "\\'")
160 : (eos . string-end) ; SRE
161 : (eot . string-end) ; sregex
162 : (buffer-start . "\\`")
163 : (buffer-end . "\\'")
164 : (point . "\\=")
165 : (word-start . "\\<")
166 : (bow . word-start) ; SRE
167 : (word-end . "\\>")
168 : (eow . word-end) ; SRE
169 : (word-boundary . "\\b")
170 : (not-word-boundary . "\\B") ; sregex
171 : (symbol-start . "\\_<")
172 : (symbol-end . "\\_>")
173 : (syntax . (rx-syntax 1 1))
174 : (not-syntax . (rx-not-syntax 1 1)) ; sregex
175 : (category . (rx-category 1 1 rx-check-category))
176 : (eval . (rx-eval 1 1))
177 : (regexp . (rx-regexp 1 1 stringp))
178 : (regex . regexp) ; sregex
179 : (digit . "[[:digit:]]")
180 : (numeric . digit) ; SRE
181 : (num . digit) ; SRE
182 : (control . "[[:cntrl:]]") ; SRE
183 : (cntrl . control) ; SRE
184 : (hex-digit . "[[:xdigit:]]") ; SRE
185 : (hex . hex-digit) ; SRE
186 : (xdigit . hex-digit) ; SRE
187 : (blank . "[[:blank:]]") ; SRE
188 : (graphic . "[[:graph:]]") ; SRE
189 : (graph . graphic) ; SRE
190 : (printing . "[[:print:]]") ; SRE
191 : (print . printing) ; SRE
192 : (alphanumeric . "[[:alnum:]]") ; SRE
193 : (alnum . alphanumeric) ; SRE
194 : (letter . "[[:alpha:]]")
195 : (alphabetic . letter) ; SRE
196 : (alpha . letter) ; SRE
197 : (ascii . "[[:ascii:]]") ; SRE
198 : (nonascii . "[[:nonascii:]]")
199 : (lower . "[[:lower:]]") ; SRE
200 : (lower-case . lower) ; SRE
201 : (punctuation . "[[:punct:]]") ; SRE
202 : (punct . punctuation) ; SRE
203 : (space . "[[:space:]]") ; SRE
204 : (whitespace . space) ; SRE
205 : (white . space) ; SRE
206 : (upper . "[[:upper:]]") ; SRE
207 : (upper-case . upper) ; SRE
208 : (word . "[[:word:]]") ; inconsistent with SRE
209 : (wordchar . word) ; sregex
210 : (not-wordchar . "\\W"))
211 : "Alist of sexp form regexp constituents.
212 : Each element of the alist has the form (SYMBOL . DEFN).
213 : SYMBOL is a valid constituent of sexp regular expressions.
214 : If DEFN is a string, SYMBOL is translated into DEFN.
215 : If DEFN is a symbol, use the definition of DEFN, recursively.
216 : Otherwise, DEFN must be a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).
217 : FUNCTION is used to produce code for SYMBOL. MIN-ARGS and MAX-ARGS
218 : are the minimum and maximum number of arguments the function-form
219 : sexp constituent SYMBOL may have in sexp regular expressions.
220 : MAX-ARGS nil means no limit. PREDICATE, if specified, means that
221 : all arguments must satisfy PREDICATE.")
222 :
223 :
224 : (defconst rx-syntax
225 : '((whitespace . ?-)
226 : (punctuation . ?.)
227 : (word . ?w)
228 : (symbol . ?_)
229 : (open-parenthesis . ?\()
230 : (close-parenthesis . ?\))
231 : (expression-prefix . ?\')
232 : (string-quote . ?\")
233 : (paired-delimiter . ?$)
234 : (escape . ?\\)
235 : (character-quote . ?/)
236 : (comment-start . ?<)
237 : (comment-end . ?>)
238 : (string-delimiter . ?|)
239 : (comment-delimiter . ?!))
240 : "Alist mapping Rx syntax symbols to syntax characters.
241 : Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
242 : symbol in `(syntax SYMBOL)', and CHAR is the syntax character
243 : corresponding to SYMBOL, as it would be used with \\s or \\S in
244 : regular expressions.")
245 :
246 :
247 : (defconst rx-categories
248 : '((consonant . ?0)
249 : (base-vowel . ?1)
250 : (upper-diacritical-mark . ?2)
251 : (lower-diacritical-mark . ?3)
252 : (tone-mark . ?4)
253 : (symbol . ?5)
254 : (digit . ?6)
255 : (vowel-modifying-diacritical-mark . ?7)
256 : (vowel-sign . ?8)
257 : (semivowel-lower . ?9)
258 : (not-at-end-of-line . ?<)
259 : (not-at-beginning-of-line . ?>)
260 : (alpha-numeric-two-byte . ?A)
261 : (chinese-two-byte . ?C)
262 : (chinse-two-byte . ?C) ;; A typo in Emacs 21.1-24.3.
263 : (greek-two-byte . ?G)
264 : (japanese-hiragana-two-byte . ?H)
265 : (indian-two-byte . ?I)
266 : (japanese-katakana-two-byte . ?K)
267 : (korean-hangul-two-byte . ?N)
268 : (cyrillic-two-byte . ?Y)
269 : (combining-diacritic . ?^)
270 : (ascii . ?a)
271 : (arabic . ?b)
272 : (chinese . ?c)
273 : (ethiopic . ?e)
274 : (greek . ?g)
275 : (korean . ?h)
276 : (indian . ?i)
277 : (japanese . ?j)
278 : (japanese-katakana . ?k)
279 : (latin . ?l)
280 : (lao . ?o)
281 : (tibetan . ?q)
282 : (japanese-roman . ?r)
283 : (thai . ?t)
284 : (vietnamese . ?v)
285 : (hebrew . ?w)
286 : (cyrillic . ?y)
287 : (can-break . ?|))
288 : "Alist mapping symbols to category characters.
289 : Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
290 : symbol in `(category SYMBOL)', and CHAR is the category character
291 : corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
292 : regular expression strings.")
293 :
294 :
295 : (defvar rx-greedy-flag t
296 : "Non-nil means produce greedy regular expressions for `zero-or-one',
297 : `zero-or-more', and `one-or-more'. Dynamically bound.")
298 :
299 :
300 : (defun rx-info (op head)
301 : "Return parsing/code generation info for OP.
302 : If OP is the space character ASCII 32, return info for the symbol `?'.
303 : If OP is the character `?', return info for the symbol `??'.
304 : See also `rx-constituents'.
305 : If HEAD is non-nil, then OP is the head of a sexp, otherwise it's
306 : a standalone symbol."
307 8 : (cond ((eq op ? ) (setq op '\?))
308 8 : ((eq op ??) (setq op '\??)))
309 8 : (let (old-op)
310 20 : (while (and (not (null op)) (symbolp op))
311 12 : (setq old-op op)
312 12 : (setq op (cdr (assq op rx-constituents)))
313 12 : (when (if head (stringp op) (consp op))
314 : ;; We found something but of the wrong kind. Let's look for an
315 : ;; alternate definition for the other case.
316 0 : (let ((new-op
317 0 : (cdr (assq old-op (cdr (memq (assq old-op rx-constituents)
318 0 : rx-constituents))))))
319 0 : (if (and new-op (not (if head (stringp new-op) (consp new-op))))
320 12 : (setq op new-op))))))
321 8 : op)
322 :
323 :
324 : (defun rx-check (form)
325 : "Check FORM according to its car's parsing info."
326 4 : (unless (listp form)
327 4 : (error "rx `%s' needs argument(s)" form))
328 4 : (let* ((rx (rx-info (car form) 'head))
329 4 : (nargs (1- (length form)))
330 4 : (min-args (nth 1 rx))
331 4 : (max-args (nth 2 rx))
332 4 : (type-pred (nth 3 rx)))
333 4 : (when (and (not (null min-args))
334 4 : (< nargs min-args))
335 0 : (error "rx form `%s' requires at least %d args"
336 4 : (car form) min-args))
337 4 : (when (and (not (null max-args))
338 4 : (> nargs max-args))
339 0 : (error "rx form `%s' accepts at most %d args"
340 4 : (car form) max-args))
341 4 : (when (not (null type-pred))
342 2 : (dolist (sub-form (cdr form))
343 2 : (unless (funcall type-pred sub-form)
344 0 : (error "rx form `%s' requires args satisfying `%s'"
345 4 : (car form) type-pred))))))
346 :
347 :
348 : (defun rx-group-if (regexp group)
349 : "Put shy groups around REGEXP if seemingly necessary when GROUP
350 : is non-nil."
351 5 : (cond
352 : ;; for some repetition
353 5 : ((eq group '*) (if (rx-atomic-p regexp) (setq group nil)))
354 : ;; for concatenation
355 5 : ((eq group ':)
356 0 : (if (rx-atomic-p
357 0 : (if (string-match
358 0 : "\\(?:[?*+]\\??\\|\\\\{[0-9]*,?[0-9]*\\\\}\\)\\'" regexp)
359 0 : (substring regexp 0 (match-beginning 0))
360 0 : regexp))
361 0 : (setq group nil)))
362 : ;; for OR
363 5 : ((eq group '|) (setq group nil))
364 : ;; do anyway
365 5 : ((eq group t))
366 5 : ((rx-atomic-p regexp t) (setq group nil)))
367 5 : (if group
368 0 : (concat "\\(?:" regexp "\\)")
369 5 : regexp))
370 :
371 :
372 : (defvar rx-parent)
373 : ;; dynamically bound in some functions.
374 :
375 :
376 : (defun rx-and (form)
377 : "Parse and produce code from FORM.
378 : FORM is of the form `(and FORM1 ...)'."
379 1 : (rx-check form)
380 1 : (rx-group-if
381 3 : (mapconcat (lambda (x) (rx-form x ':)) (cdr form) nil)
382 1 : (and (memq rx-parent '(* t)) rx-parent)))
383 :
384 :
385 : (defun rx-or (form)
386 : "Parse and produce code from FORM, which is `(or FORM1 ...)'."
387 1 : (rx-check form)
388 1 : (rx-group-if
389 1 : (if (memq nil (mapcar 'stringp (cdr form)))
390 3 : (mapconcat (lambda (x) (rx-form x '|)) (cdr form) "\\|")
391 1 : (regexp-opt (cdr form)))
392 1 : (and (memq rx-parent '(: * t)) rx-parent)))
393 :
394 :
395 : (defun rx-anything (form)
396 : "Match any character."
397 0 : (if (consp form)
398 0 : (error "rx `anything' syntax error: %s" form))
399 0 : (rx-or (list 'or 'not-newline ?\n)))
400 :
401 :
402 : (defun rx-any-delete-from-range (char ranges)
403 : "Delete by side effect character CHAR from RANGES.
404 : Only both edges of each range is checked."
405 0 : (let (m)
406 0 : (cond
407 0 : ((memq char ranges) (setq ranges (delq char ranges)))
408 0 : ((setq m (assq char ranges))
409 0 : (if (eq (1+ char) (cdr m))
410 0 : (setcar (memq m ranges) (1+ char))
411 0 : (setcar m (1+ char))))
412 0 : ((setq m (rassq char ranges))
413 0 : (if (eq (1- char) (car m))
414 0 : (setcar (memq m ranges) (1- char))
415 0 : (setcdr m (1- char)))))
416 0 : ranges))
417 :
418 :
419 : (defun rx-any-condense-range (args)
420 : "Condense by side effect ARGS as range for Rx `any'."
421 1 : (let (str
422 : l)
423 : ;; set STR list of all strings
424 : ;; set L list of all ranges
425 2 : (mapc (lambda (e) (cond ((stringp e) (push e str))
426 2 : ((numberp e) (push (cons e e) l))
427 1 : (t (push e l))))
428 1 : args)
429 : ;; condense overlapped ranges in L
430 1 : (let ((tail (setq l (sort l #'car-less-than-car)))
431 : d)
432 1 : (while (setq d (cdr tail))
433 0 : (if (>= (cdar tail) (1- (caar d)))
434 0 : (progn
435 0 : (setcdr (car tail) (max (cdar tail) (cdar d)))
436 0 : (setcdr tail (cdr d)))
437 1 : (setq tail d))))
438 : ;; Separate small ranges to single number, and delete dups.
439 1 : (nconc
440 1 : (apply #'nconc
441 1 : (mapcar (lambda (e)
442 1 : (cond
443 1 : ((= (car e) (cdr e)) (list (car e)))
444 0 : ((= (1+ (car e)) (cdr e)) (list (car e) (cdr e)))
445 1 : ((list e))))
446 1 : l))
447 1 : (delete-dups str))))
448 :
449 :
450 : (defun rx-check-any-string (str)
451 : "Check string argument STR for Rx `any'."
452 2 : (let ((i 0)
453 : c1 c2 l)
454 2 : (if (= 0 (length str))
455 2 : (error "String arg for Rx `any' must not be empty"))
456 2 : (while (string-match ".-." str i)
457 : ;; string before range: convert it to characters
458 0 : (if (< i (match-beginning 0))
459 0 : (setq l (nconc
460 0 : l
461 0 : (append (substring str i (match-beginning 0)) nil))))
462 : ;; range
463 0 : (setq i (match-end 0)
464 0 : c1 (aref str (match-beginning 0))
465 0 : c2 (aref str (1- i)))
466 0 : (cond
467 0 : ((< c1 c2) (setq l (nconc l (list (cons c1 c2)))))
468 2 : ((= c1 c2) (setq l (nconc l (list c1))))))
469 : ;; rest?
470 2 : (if (< i (length str))
471 2 : (setq l (nconc l (append (substring str i) nil))))
472 2 : l))
473 :
474 :
475 : (defun rx-check-any (arg)
476 : "Check arg ARG for Rx `any'."
477 2 : (cond
478 2 : ((integerp arg) (list arg))
479 2 : ((symbolp arg)
480 0 : (let ((translation (condition-case nil
481 0 : (rx-form arg)
482 0 : (error nil))))
483 0 : (if (or (null translation)
484 0 : (null (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'" translation)))
485 0 : (error "Invalid char class `%s' in Rx `any'" arg))
486 0 : (list (substring translation 1 -1)))) ; strip outer brackets
487 2 : ((and (integerp (car-safe arg)) (integerp (cdr-safe arg)))
488 0 : (list arg))
489 2 : ((stringp arg) (rx-check-any-string arg))
490 0 : ((error
491 2 : "rx `any' requires string, character, char pair or char class args"))))
492 :
493 :
494 : (defun rx-any (form)
495 : "Parse and produce code from FORM, which is `(any ARG ...)'.
496 : ARG is optional."
497 1 : (rx-check form)
498 1 : (let* ((args (rx-any-condense-range
499 1 : (apply
500 1 : #'nconc
501 1 : (mapcar #'rx-check-any (cdr form)))))
502 : m
503 : s)
504 1 : (cond
505 : ;; single close bracket
506 : ;; => "[]...-]" or "[]...--.]"
507 1 : ((memq ?\] args)
508 : ;; set ] at the beginning
509 0 : (setq args (cons ?\] (delq ?\] args)))
510 : ;; set - at the end
511 0 : (if (or (memq ?- args) (assq ?- args))
512 0 : (setq args (nconc (rx-any-delete-from-range ?- args)
513 0 : (list ?-)))))
514 : ;; close bracket starts a range
515 : ;; => "[]-....-]" or "[]-.--....]"
516 1 : ((setq m (assq ?\] args))
517 : ;; bring it to the beginning
518 0 : (setq args (cons m (delq m args)))
519 0 : (cond ((memq ?- args)
520 : ;; to the end
521 0 : (setq args (nconc (delq ?- args) (list ?-))))
522 0 : ((setq m (assq ?- args))
523 : ;; next to the bracket's range, make the second range
524 0 : (setcdr args (cons m (delq m (cdr args)))))))
525 : ;; bracket in the end range
526 : ;; => "[]...-]"
527 1 : ((setq m (rassq ?\] args))
528 : ;; set ] at the beginning
529 0 : (setq args (cons ?\] (rx-any-delete-from-range ?\] args)))
530 : ;; set - at the end
531 0 : (if (or (memq ?- args) (assq ?- args))
532 0 : (setq args (nconc (rx-any-delete-from-range ?- args)
533 0 : (list ?-)))))
534 : ;; {no close bracket appears}
535 : ;;
536 : ;; bring single bar to the beginning
537 1 : ((memq ?- args)
538 0 : (setq args (cons ?- (delq ?- args))))
539 : ;; bar start a range, bring it to the beginning
540 1 : ((setq m (assq ?- args))
541 0 : (setq args (cons m (delq m args))))
542 : ;;
543 : ;; hat at the beginning?
544 1 : ((or (eq (car args) ?^) (eq (car-safe (car args)) ?^))
545 0 : (setq args (if (cdr args)
546 0 : `(,(cadr args) ,(car args) ,@(cddr args))
547 0 : (nconc (rx-any-delete-from-range ?^ args)
548 1 : (list ?^))))))
549 : ;; some 1-char?
550 1 : (if (and (null (cdr args)) (numberp (car args))
551 1 : (or (= 1 (length
552 1 : (setq s (regexp-quote (string (car args))))))
553 0 : (and (equal (car args) ?^) ;; unnecessary predicate?
554 1 : (null (eq rx-parent '!)))))
555 1 : s
556 0 : (concat "["
557 0 : (mapconcat
558 0 : (lambda (e) (cond
559 0 : ((numberp e) (string e))
560 0 : ((consp e)
561 0 : (if (and (= (1+ (car e)) (cdr e))
562 : ;; rx-any-condense-range should
563 : ;; prevent this case from happening.
564 0 : (null (memq (car e) '(?\] ?-)))
565 0 : (null (memq (cdr e) '(?\] ?-))))
566 0 : (string (car e) (cdr e))
567 0 : (string (car e) ?- (cdr e))))
568 0 : (e)))
569 0 : args
570 0 : nil)
571 1 : "]"))))
572 :
573 :
574 : (defun rx-check-not (arg)
575 : "Check arg ARG for Rx `not'."
576 1 : (unless (or (and (symbolp arg)
577 0 : (string-match "\\`\\[\\[:[-a-z]+:\\]\\]\\'"
578 0 : (condition-case nil
579 0 : (rx-form arg)
580 1 : (error ""))))
581 1 : (eq arg 'word-boundary)
582 1 : (and (consp arg)
583 1 : (memq (car arg) '(not any in syntax category))))
584 1 : (error "rx `not' syntax error: %s" arg))
585 : t)
586 :
587 :
588 : (defun rx-not (form)
589 : "Parse and produce code from FORM. FORM is `(not ...)'."
590 1 : (rx-check form)
591 1 : (let ((result (rx-form (cadr form) '!))
592 : case-fold-search)
593 1 : (cond ((string-match "\\`\\[^" result)
594 0 : (cond
595 0 : ((equal result "[^]") "[^^]")
596 0 : ((and (= (length result) 4) (null (eq rx-parent '!)))
597 0 : (regexp-quote (substring result 2 3)))
598 0 : ((concat "[" (substring result 2)))))
599 1 : ((eq ?\[ (aref result 0))
600 0 : (concat "[^" (substring result 1)))
601 1 : ((string-match "\\`\\\\[scbw]" result)
602 0 : (concat (upcase (substring result 0 2))
603 0 : (substring result 2)))
604 1 : ((string-match "\\`\\\\[SCBW]" result)
605 0 : (concat (downcase (substring result 0 2))
606 0 : (substring result 2)))
607 : (t
608 1 : (concat "[^" result "]")))))
609 :
610 :
611 : (defun rx-not-char (form)
612 : "Parse and produce code from FORM. FORM is `(not-char ...)'."
613 0 : (rx-check form)
614 0 : (rx-not `(not (in ,@(cdr form)))))
615 :
616 :
617 : (defun rx-not-syntax (form)
618 : "Parse and produce code from FORM. FORM is `(not-syntax SYNTAX)'."
619 0 : (rx-check form)
620 0 : (rx-not `(not (syntax ,@(cdr form)))))
621 :
622 :
623 : (defun rx-trans-forms (form &optional skip)
624 : "If FORM's length is greater than two, transform it to length two.
625 : A form (HEAD REST ...) becomes (HEAD (and REST ...)).
626 : If SKIP is non-nil, allow that number of items after the head, i.e.
627 : `(= N REST ...)' becomes `(= N (and REST ...))' if SKIP is 1."
628 0 : (unless skip (setq skip 0))
629 0 : (let ((tail (nthcdr (1+ skip) form)))
630 0 : (if (= (length tail) 1)
631 0 : form
632 0 : (let ((form (copy-sequence form)))
633 0 : (setcdr (nthcdr skip form) (list (cons 'and tail)))
634 0 : form))))
635 :
636 :
637 : (defun rx-= (form)
638 : "Parse and produce code from FORM `(= N ...)'."
639 0 : (rx-check form)
640 0 : (setq form (rx-trans-forms form 1))
641 0 : (unless (and (integerp (nth 1 form))
642 0 : (> (nth 1 form) 0))
643 0 : (error "rx `=' requires positive integer first arg"))
644 0 : (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
645 :
646 :
647 : (defun rx->= (form)
648 : "Parse and produce code from FORM `(>= N ...)'."
649 0 : (rx-check form)
650 0 : (setq form (rx-trans-forms form 1))
651 0 : (unless (and (integerp (nth 1 form))
652 0 : (> (nth 1 form) 0))
653 0 : (error "rx `>=' requires positive integer first arg"))
654 0 : (format "%s\\{%d,\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
655 :
656 :
657 : (defun rx-** (form)
658 : "Parse and produce code from FORM `(** N M ...)'."
659 0 : (rx-check form)
660 0 : (rx-form (cons 'repeat (cdr (rx-trans-forms form 2))) '*))
661 :
662 :
663 : (defun rx-repeat (form)
664 : "Parse and produce code from FORM.
665 : FORM is either `(repeat N FORM1)' or `(repeat N M FORMS...)'."
666 0 : (rx-check form)
667 0 : (if (> (length form) 4)
668 0 : (setq form (rx-trans-forms form 2)))
669 0 : (if (null (nth 2 form))
670 0 : (setq form (cons (nth 0 form) (cons (nth 1 form) (nthcdr 3 form)))))
671 0 : (cond ((= (length form) 3)
672 0 : (unless (and (integerp (nth 1 form))
673 0 : (> (nth 1 form) 0))
674 0 : (error "rx `repeat' requires positive integer first arg"))
675 0 : (format "%s\\{%d\\}" (rx-form (nth 2 form) '*) (nth 1 form)))
676 0 : ((or (not (integerp (nth 2 form)))
677 0 : (< (nth 2 form) 0)
678 0 : (not (integerp (nth 1 form)))
679 0 : (< (nth 1 form) 0)
680 0 : (< (nth 2 form) (nth 1 form)))
681 0 : (error "rx `repeat' range error"))
682 : (t
683 0 : (format "%s\\{%d,%d\\}" (rx-form (nth 3 form) '*)
684 0 : (nth 1 form) (nth 2 form)))))
685 :
686 :
687 : (defun rx-submatch (form)
688 : "Parse and produce code from FORM, which is `(submatch ...)'."
689 0 : (concat "\\("
690 0 : (if (= 2 (length form))
691 : ;; Only one sub-form.
692 0 : (rx-form (cadr form))
693 : ;; Several sub-forms implicitly concatenated.
694 0 : (mapconcat (lambda (re) (rx-form re ':)) (cdr form) nil))
695 0 : "\\)"))
696 :
697 : (defun rx-submatch-n (form)
698 : "Parse and produce code from FORM, which is `(submatch-n N ...)'."
699 0 : (let ((n (nth 1 form)))
700 0 : (concat "\\(?" (number-to-string n) ":"
701 0 : (if (= 3 (length form))
702 : ;; Only one sub-form.
703 0 : (rx-form (nth 2 form))
704 : ;; Several sub-forms implicitly concatenated.
705 0 : (mapconcat (lambda (re) (rx-form re ':)) (cddr form) nil))
706 0 : "\\)")))
707 :
708 : (defun rx-backref (form)
709 : "Parse and produce code from FORM, which is `(backref N)'."
710 0 : (rx-check form)
711 0 : (format "\\%d" (nth 1 form)))
712 :
713 : (defun rx-check-backref (arg)
714 : "Check arg ARG for Rx `backref'."
715 0 : (or (and (integerp arg) (>= arg 1) (<= arg 9))
716 0 : (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
717 :
718 : (defun rx-kleene (form)
719 : "Parse and produce code from FORM.
720 : FORM is `(OP FORM1)', where OP is one of the `zero-or-one',
721 : `zero-or-more' etc. operators.
722 : If OP is one of `*', `+', `?', produce a greedy regexp.
723 : If OP is one of `*?', `+?', `??', produce a non-greedy regexp.
724 : If OP is anything else, produce a greedy regexp if `rx-greedy-flag'
725 : is non-nil."
726 0 : (rx-check form)
727 0 : (setq form (rx-trans-forms form))
728 0 : (let ((suffix (cond ((memq (car form) '(* + ?\s)) "")
729 0 : ((memq (car form) '(*? +? ??)) "?")
730 0 : (rx-greedy-flag "")
731 0 : (t "?")))
732 0 : (op (cond ((memq (car form) '(* *? 0+ zero-or-more)) "*")
733 0 : ((memq (car form) '(+ +? 1+ one-or-more)) "+")
734 0 : (t "?"))))
735 0 : (rx-group-if
736 0 : (concat (rx-form (cadr form) '*) op suffix)
737 0 : (and (memq rx-parent '(t *)) rx-parent))))
738 :
739 :
740 : (defun rx-atomic-p (r &optional lax)
741 : "Return non-nil if regexp string R is atomic.
742 : An atomic regexp R is one such that a suffix operator
743 : appended to R will apply to all of R. For example, \"a\"
744 : \"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
745 : \"[ab]c\", and \"ab\\|ab*c\" are not atomic.
746 :
747 : This function may return false negatives, but it will not
748 : return false positives. It is nevertheless useful in
749 : situations where an efficiency shortcut can be taken only if a
750 : regexp is atomic. The function can be improved to detect
751 : more cases of atomic regexps. Presently, this function
752 : detects the following categories of atomic regexp;
753 :
754 : a group or shy group: \\(...\\)
755 : a character class: [...]
756 : a single character: a
757 :
758 : On the other hand, false negatives will be returned for
759 : regexps that are atomic but end in operators, such as
760 : \"a+\". I think these are rare. Probably such cases could
761 : be detected without much effort. A guarantee of no false
762 : negatives would require a theoretic specification of the set
763 : of all atomic regexps."
764 5 : (let ((l (length r)))
765 5 : (cond
766 5 : ((<= l 1))
767 5 : ((= l 2) (= (aref r 0) ?\\))
768 5 : ((= l 3) (string-match "\\`\\(?:\\\\[cCsS_]\\|\\[[^^]\\]\\)" r))
769 4 : ((null lax)
770 0 : (cond
771 0 : ((string-match "\\`\\[^?\]?\\(?:\\[:[a-z]+:]\\|[^]]\\)*\\]\\'" r))
772 5 : ((string-match "\\`\\\\(\\(?:[^\\]\\|\\\\[^)]\\)*\\\\)\\'" r)))))))
773 :
774 :
775 : (defun rx-syntax (form)
776 : "Parse and produce code from FORM, which is `(syntax SYMBOL)'."
777 0 : (rx-check form)
778 0 : (let* ((sym (cadr form))
779 0 : (syntax (cdr (assq sym rx-syntax))))
780 0 : (unless syntax
781 : ;; Try sregex compatibility.
782 0 : (cond
783 0 : ((characterp sym) (setq syntax sym))
784 0 : ((symbolp sym)
785 0 : (let ((name (symbol-name sym)))
786 0 : (if (= 1 (length name))
787 0 : (setq syntax (aref name 0))))))
788 0 : (unless syntax
789 0 : (error "Unknown rx syntax `%s'" sym)))
790 0 : (format "\\s%c" syntax)))
791 :
792 :
793 : (defun rx-check-category (form)
794 : "Check the argument FORM of a `(category FORM)'."
795 0 : (unless (or (integerp form)
796 0 : (cdr (assq form rx-categories)))
797 0 : (error "Unknown category `%s'" form))
798 : t)
799 :
800 :
801 : (defun rx-category (form)
802 : "Parse and produce code from FORM, which is `(category SYMBOL)'."
803 0 : (rx-check form)
804 0 : (let ((char (if (integerp (cadr form))
805 0 : (cadr form)
806 0 : (cdr (assq (cadr form) rx-categories)))))
807 0 : (format "\\c%c" char)))
808 :
809 :
810 : (defun rx-eval (form)
811 : "Parse and produce code from FORM, which is `(eval FORM)'."
812 0 : (rx-check form)
813 0 : (rx-form (eval (cadr form)) rx-parent))
814 :
815 :
816 : (defun rx-greedy (form)
817 : "Parse and produce code from FORM.
818 : If FORM is `(minimal-match FORM1)', non-greedy versions of `*',
819 : `+', and `?' operators will be used in FORM1. If FORM is
820 : `(maximal-match FORM1)', greedy operators will be used."
821 0 : (rx-check form)
822 0 : (let ((rx-greedy-flag (eq (car form) 'maximal-match)))
823 0 : (rx-form (cadr form) rx-parent)))
824 :
825 :
826 : (defun rx-regexp (form)
827 : "Parse and produce code from FORM, which is `(regexp STRING)'."
828 0 : (rx-check form)
829 0 : (rx-group-if (cadr form) rx-parent))
830 :
831 :
832 : (defun rx-form (form &optional rx-parent)
833 : "Parse and produce code for regular expression FORM.
834 : FORM is a regular expression in sexp form.
835 : RX-PARENT shows which type of expression calls and controls putting of
836 : shy groups around the result and some more in other functions."
837 6 : (cond
838 6 : ((stringp form)
839 2 : (rx-group-if (regexp-quote form)
840 2 : (if (and (eq rx-parent '*) (< 1 (length form)))
841 2 : rx-parent)))
842 4 : ((integerp form)
843 0 : (regexp-quote (char-to-string form)))
844 4 : ((symbolp form)
845 0 : (let ((info (rx-info form nil)))
846 0 : (cond ((stringp info)
847 0 : info)
848 0 : ((null info)
849 0 : (error "Unknown rx form `%s'" form))
850 : (t
851 0 : (funcall (nth 0 info) form)))))
852 4 : ((consp form)
853 4 : (let ((info (rx-info (car form) 'head)))
854 4 : (unless (consp info)
855 4 : (error "Unknown rx form `%s'" (car form)))
856 4 : (funcall (nth 0 info) form)))
857 : (t
858 6 : (error "rx syntax error at `%s'" form))))
859 :
860 :
861 : ;;;###autoload
862 : (defun rx-to-string (form &optional no-group)
863 : "Parse and produce code for regular expression FORM.
864 : FORM is a regular expression in sexp form.
865 : NO-GROUP non-nil means don't put shy groups around the result."
866 1 : (rx-group-if (rx-form form) (null no-group)))
867 :
868 :
869 : ;;;###autoload
870 : (defmacro rx (&rest regexps)
871 : "Translate regular expressions REGEXPS in sexp form to a regexp string.
872 : REGEXPS is a non-empty sequence of forms of the sort listed below.
873 :
874 : Note that `rx' is a Lisp macro; when used in a Lisp program being
875 : compiled, the translation is performed by the compiler.
876 : See `rx-to-string' for how to do such a translation at run-time.
877 :
878 : The following are valid subforms of regular expressions in sexp
879 : notation.
880 :
881 : STRING
882 : matches string STRING literally.
883 :
884 : CHAR
885 : matches character CHAR literally.
886 :
887 : `not-newline', `nonl'
888 : matches any character except a newline.
889 :
890 : `anything'
891 : matches any character
892 :
893 : `(any SET ...)'
894 : `(in SET ...)'
895 : `(char SET ...)'
896 : matches any character in SET .... SET may be a character or string.
897 : Ranges of characters can be specified as `A-Z' in strings.
898 : Ranges may also be specified as conses like `(?A . ?Z)'.
899 :
900 : SET may also be the name of a character class: `digit',
901 : `control', `hex-digit', `blank', `graph', `print', `alnum',
902 : `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
903 : `word', or one of their synonyms.
904 :
905 : `(not (any SET ...))'
906 : matches any character not in SET ...
907 :
908 : `line-start', `bol'
909 : matches the empty string, but only at the beginning of a line
910 : in the text being matched
911 :
912 : `line-end', `eol'
913 : is similar to `line-start' but matches only at the end of a line
914 :
915 : `string-start', `bos', `bot'
916 : matches the empty string, but only at the beginning of the
917 : string being matched against.
918 :
919 : `string-end', `eos', `eot'
920 : matches the empty string, but only at the end of the
921 : string being matched against.
922 :
923 : `buffer-start'
924 : matches the empty string, but only at the beginning of the
925 : buffer being matched against. Actually equivalent to `string-start'.
926 :
927 : `buffer-end'
928 : matches the empty string, but only at the end of the
929 : buffer being matched against. Actually equivalent to `string-end'.
930 :
931 : `point'
932 : matches the empty string, but only at point.
933 :
934 : `word-start', `bow'
935 : matches the empty string, but only at the beginning of a word.
936 :
937 : `word-end', `eow'
938 : matches the empty string, but only at the end of a word.
939 :
940 : `word-boundary'
941 : matches the empty string, but only at the beginning or end of a
942 : word.
943 :
944 : `(not word-boundary)'
945 : `not-word-boundary'
946 : matches the empty string, but not at the beginning or end of a
947 : word.
948 :
949 : `symbol-start'
950 : matches the empty string, but only at the beginning of a symbol.
951 :
952 : `symbol-end'
953 : matches the empty string, but only at the end of a symbol.
954 :
955 : `digit', `numeric', `num'
956 : matches 0 through 9.
957 :
958 : `control', `cntrl'
959 : matches ASCII control characters.
960 :
961 : `hex-digit', `hex', `xdigit'
962 : matches 0 through 9, a through f and A through F.
963 :
964 : `blank'
965 : matches space and tab only.
966 :
967 : `graphic', `graph'
968 : matches graphic characters--everything except whitespace, ASCII
969 : and non-ASCII control characters, surrogates, and codepoints
970 : unassigned by Unicode.
971 :
972 : `printing', `print'
973 : matches whitespace and graphic characters.
974 :
975 : `alphanumeric', `alnum'
976 : matches alphabetic characters and digits. (For multibyte characters,
977 : it matches according to Unicode character properties.)
978 :
979 : `letter', `alphabetic', `alpha'
980 : matches alphabetic characters. (For multibyte characters,
981 : it matches according to Unicode character properties.)
982 :
983 : `ascii'
984 : matches ASCII (unibyte) characters.
985 :
986 : `nonascii'
987 : matches non-ASCII (multibyte) characters.
988 :
989 : `lower', `lower-case'
990 : matches anything lower-case.
991 :
992 : `upper', `upper-case'
993 : matches anything upper-case.
994 :
995 : `punctuation', `punct'
996 : matches punctuation. (But at present, for multibyte characters,
997 : it matches anything that has non-word syntax.)
998 :
999 : `space', `whitespace', `white'
1000 : matches anything that has whitespace syntax.
1001 :
1002 : `word', `wordchar'
1003 : matches anything that has word syntax.
1004 :
1005 : `not-wordchar'
1006 : matches anything that has non-word syntax.
1007 :
1008 : `(syntax SYNTAX)'
1009 : matches a character with syntax SYNTAX. SYNTAX must be one
1010 : of the following symbols, or a symbol corresponding to the syntax
1011 : character, e.g. `\\.' for `\\s.'.
1012 :
1013 : `whitespace' (\\s- in string notation)
1014 : `punctuation' (\\s.)
1015 : `word' (\\sw)
1016 : `symbol' (\\s_)
1017 : `open-parenthesis' (\\s()
1018 : `close-parenthesis' (\\s))
1019 : `expression-prefix' (\\s')
1020 : `string-quote' (\\s\")
1021 : `paired-delimiter' (\\s$)
1022 : `escape' (\\s\\)
1023 : `character-quote' (\\s/)
1024 : `comment-start' (\\s<)
1025 : `comment-end' (\\s>)
1026 : `string-delimiter' (\\s|)
1027 : `comment-delimiter' (\\s!)
1028 :
1029 : `(not (syntax SYNTAX))'
1030 : matches a character that doesn't have syntax SYNTAX.
1031 :
1032 : `(category CATEGORY)'
1033 : matches a character with category CATEGORY. CATEGORY must be
1034 : either a character to use for C, or one of the following symbols.
1035 :
1036 : `consonant' (\\c0 in string notation)
1037 : `base-vowel' (\\c1)
1038 : `upper-diacritical-mark' (\\c2)
1039 : `lower-diacritical-mark' (\\c3)
1040 : `tone-mark' (\\c4)
1041 : `symbol' (\\c5)
1042 : `digit' (\\c6)
1043 : `vowel-modifying-diacritical-mark' (\\c7)
1044 : `vowel-sign' (\\c8)
1045 : `semivowel-lower' (\\c9)
1046 : `not-at-end-of-line' (\\c<)
1047 : `not-at-beginning-of-line' (\\c>)
1048 : `alpha-numeric-two-byte' (\\cA)
1049 : `chinese-two-byte' (\\cC)
1050 : `greek-two-byte' (\\cG)
1051 : `japanese-hiragana-two-byte' (\\cH)
1052 : `indian-tow-byte' (\\cI)
1053 : `japanese-katakana-two-byte' (\\cK)
1054 : `korean-hangul-two-byte' (\\cN)
1055 : `cyrillic-two-byte' (\\cY)
1056 : `combining-diacritic' (\\c^)
1057 : `ascii' (\\ca)
1058 : `arabic' (\\cb)
1059 : `chinese' (\\cc)
1060 : `ethiopic' (\\ce)
1061 : `greek' (\\cg)
1062 : `korean' (\\ch)
1063 : `indian' (\\ci)
1064 : `japanese' (\\cj)
1065 : `japanese-katakana' (\\ck)
1066 : `latin' (\\cl)
1067 : `lao' (\\co)
1068 : `tibetan' (\\cq)
1069 : `japanese-roman' (\\cr)
1070 : `thai' (\\ct)
1071 : `vietnamese' (\\cv)
1072 : `hebrew' (\\cw)
1073 : `cyrillic' (\\cy)
1074 : `can-break' (\\c|)
1075 :
1076 : `(not (category CATEGORY))'
1077 : matches a character that doesn't have category CATEGORY.
1078 :
1079 : `(and SEXP1 SEXP2 ...)'
1080 : `(: SEXP1 SEXP2 ...)'
1081 : `(seq SEXP1 SEXP2 ...)'
1082 : `(sequence SEXP1 SEXP2 ...)'
1083 : matches what SEXP1 matches, followed by what SEXP2 matches, etc.
1084 :
1085 : `(submatch SEXP1 SEXP2 ...)'
1086 : `(group SEXP1 SEXP2 ...)'
1087 : like `and', but makes the match accessible with `match-end',
1088 : `match-beginning', and `match-string'.
1089 :
1090 : `(submatch-n N SEXP1 SEXP2 ...)'
1091 : `(group-n N SEXP1 SEXP2 ...)'
1092 : like `group', but make it an explicitly-numbered group with
1093 : group number N.
1094 :
1095 : `(or SEXP1 SEXP2 ...)'
1096 : `(| SEXP1 SEXP2 ...)'
1097 : matches anything that matches SEXP1 or SEXP2, etc. If all
1098 : args are strings, use `regexp-opt' to optimize the resulting
1099 : regular expression.
1100 :
1101 : `(minimal-match SEXP)'
1102 : produce a non-greedy regexp for SEXP. Normally, regexps matching
1103 : zero or more occurrences of something are \"greedy\" in that they
1104 : match as much as they can, as long as the overall regexp can
1105 : still match. A non-greedy regexp matches as little as possible.
1106 :
1107 : `(maximal-match SEXP)'
1108 : produce a greedy regexp for SEXP. This is the default.
1109 :
1110 : Below, `SEXP ...' represents a sequence of regexp forms, treated as if
1111 : enclosed in `(and ...)'.
1112 :
1113 : `(zero-or-more SEXP ...)'
1114 : `(0+ SEXP ...)'
1115 : matches zero or more occurrences of what SEXP ... matches.
1116 :
1117 : `(* SEXP ...)'
1118 : like `zero-or-more', but always produces a greedy regexp, independent
1119 : of `rx-greedy-flag'.
1120 :
1121 : `(*? SEXP ...)'
1122 : like `zero-or-more', but always produces a non-greedy regexp,
1123 : independent of `rx-greedy-flag'.
1124 :
1125 : `(one-or-more SEXP ...)'
1126 : `(1+ SEXP ...)'
1127 : matches one or more occurrences of SEXP ...
1128 :
1129 : `(+ SEXP ...)'
1130 : like `one-or-more', but always produces a greedy regexp.
1131 :
1132 : `(+? SEXP ...)'
1133 : like `one-or-more', but always produces a non-greedy regexp.
1134 :
1135 : `(zero-or-one SEXP ...)'
1136 : `(optional SEXP ...)'
1137 : `(opt SEXP ...)'
1138 : matches zero or one occurrences of A.
1139 :
1140 : `(? SEXP ...)'
1141 : like `zero-or-one', but always produces a greedy regexp.
1142 :
1143 : `(?? SEXP ...)'
1144 : like `zero-or-one', but always produces a non-greedy regexp.
1145 :
1146 : `(repeat N SEXP)'
1147 : `(= N SEXP ...)'
1148 : matches N occurrences.
1149 :
1150 : `(>= N SEXP ...)'
1151 : matches N or more occurrences.
1152 :
1153 : `(repeat N M SEXP)'
1154 : `(** N M SEXP ...)'
1155 : matches N to M occurrences.
1156 :
1157 : `(backref N)'
1158 : matches what was matched previously by submatch N.
1159 :
1160 : `(eval FORM)'
1161 : evaluate FORM and insert result. If result is a string,
1162 : `regexp-quote' it.
1163 :
1164 : `(regexp REGEXP)'
1165 : include REGEXP in string notation in the result."
1166 : (cond ((null regexps)
1167 : (error "No regexp"))
1168 : ((cdr regexps)
1169 : (rx-to-string `(and ,@regexps) t))
1170 : (t
1171 : (rx-to-string (car regexps) t))))
1172 :
1173 :
1174 : (pcase-defmacro rx (&rest regexps)
1175 : "Build a `pcase' pattern matching `rx' regexps.
1176 : The REGEXPS are interpreted as by `rx'. The pattern matches if
1177 : the regular expression so constructed matches the object, as if
1178 : by `string-match'.
1179 :
1180 : In addition to the usual `rx' constructs, REGEXPS can contain the
1181 : following constructs:
1182 :
1183 : (let VAR FORM...) creates a new explicitly numbered submatch
1184 : that matches FORM and binds the match to
1185 : VAR.
1186 : (backref VAR) creates a backreference to the submatch
1187 : introduced by a previous (let VAR ...)
1188 : construct.
1189 :
1190 : The VARs are associated with explicitly numbered submatches
1191 : starting from 1. Multiple occurrences of the same VAR refer to
1192 : the same submatch.
1193 :
1194 : If a case matches, the match data is modified as usual so you can
1195 : use it in the case body, but you still have to pass the correct
1196 : string as argument to `match-string'."
1197 2 : (let* ((vars ())
1198 : (rx-constituents
1199 2 : `((let
1200 : ,(lambda (form)
1201 0 : (rx-check form)
1202 0 : (let ((var (cadr form)))
1203 0 : (cl-check-type var symbol)
1204 0 : (let ((i (or (cl-position var vars :test #'eq)
1205 0 : (prog1 (length vars)
1206 0 : (setq vars `(,@vars ,var))))))
1207 0 : (rx-form `(submatch-n ,(1+ i) ,@(cddr form))))))
1208 : 1 nil)
1209 : (backref
1210 : ,(lambda (form)
1211 0 : (rx-check form)
1212 0 : (rx-backref
1213 0 : `(backref ,(let ((var (cadr form)))
1214 0 : (if (integerp var) var
1215 0 : (1+ (cl-position var vars :test #'eq)))))))
1216 : 1 1
1217 : ,(lambda (var)
1218 0 : (cond ((integerp var) (rx-check-backref var))
1219 0 : ((memq var vars) t)
1220 0 : (t (error "rx `backref' variable must be one of %s: %s"
1221 0 : vars var)))))
1222 2 : ,@rx-constituents))
1223 2 : (regexp (rx-to-string `(seq ,@regexps) :no-group)))
1224 0 : `(and (pred (string-match ,regexp))
1225 0 : ,@(cl-loop for i from 1
1226 0 : for var in vars
1227 2 : collect `(app (match-string ,i) ,var)))))
1228 :
1229 : ;; ;; sregex.el replacement
1230 :
1231 : ;; ;;;###autoload (provide 'sregex)
1232 : ;; ;;;###autoload (autoload 'sregex "rx")
1233 : ;; (defalias 'sregex 'rx-to-string)
1234 : ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
1235 : ;; (defalias 'sregexq 'rx)
1236 :
1237 : (provide 'rx)
1238 :
1239 : ;;; rx.el ends here
|