[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#73188: [PATCH] PEG: Add support for `not-in-range` and [^...]
From: |
Ekaitz Zarraga |
Subject: |
bug#73188: [PATCH] PEG: Add support for `not-in-range` and [^...] |
Date: |
Fri, 11 Oct 2024 14:31:40 +0200 |
Modern PEG supports inversed class like `[^a-z]` that would get any
character not in the `a-z` range. This commit adds support for that and
also for a new `not-in-range` PEG pattern for scheme.
* module/ice-9/peg/codegen.scm (cg-not-in-range): New function.
* module/ice-9/peg/string-peg.scm: Add support for `[^...]`
* test-suite/tests/peg.test: Test it.
* doc/ref/api-peg.texi: Document accordingly.
---
doc/ref/api-peg.texi | 8 +++++++
module/ice-9/peg/codegen.scm | 22 +++++++++++++++++++
module/ice-9/peg/string-peg.scm | 39 +++++++++++++++++++++++++++++----
test-suite/tests/peg.test | 6 ++++-
4 files changed, 70 insertions(+), 5 deletions(-)
diff --git a/doc/ref/api-peg.texi b/doc/ref/api-peg.texi
index d34ddc64c..f2707442c 100644
--- a/doc/ref/api-peg.texi
+++ b/doc/ref/api-peg.texi
@@ -143,6 +143,14 @@ Parses any character falling between @var{a} and @var{z}.
@code{(range #\a #\z)}
@end deftp
+@deftp {PEG Pattern} {inversed range of characters} a z
+Parses any character not falling between @var{a} and @var{z}.
+
+@code{"[^a-z]"}
+
+@code{(not-in-range #\a #\z)}
+@end deftp
+
Example:
@example
diff --git a/module/ice-9/peg/codegen.scm b/module/ice-9/peg/codegen.scm
index d80c3e849..82367ef55 100644
--- a/module/ice-9/peg/codegen.scm
+++ b/module/ice-9/peg/codegen.scm
@@ -140,6 +140,27 @@ return EXP."
((none) #`(list (1+ pos) '()))
(else (error "bad accum" accum))))))))))
+;; Generates code for matching a range of characters not between start and end.
+;; E.g.: (cg-not-in-range syntax #\a #\z 'body)
+(define (cg-not-in-range pat accum)
+ (syntax-case pat ()
+ ((start end)
+ (if (not (and (char? (syntax->datum #'start))
+ (char? (syntax->datum #'end))))
+ (error "range PEG should have characters after it; instead got"
+ #'start #'end))
+ #`(lambda (str len pos)
+ (and (< pos len)
+ (let ((c (string-ref str pos)))
+ (and (or (char<? c start) (char>? c end))
+ #,(case accum
+ ((all) #`(list (1+ pos)
+ (list 'cg-not-in-range (string c))))
+ ((name) #`(list (1+ pos) 'cg-not-in-range))
+ ((body) #`(list (1+ pos) (string c)))
+ ((none) #`(list (1+ pos) '()))
+ (else (error "bad accum" accum))))))))))
+
;; Generate code to match a pattern and do nothing with the result
(define (cg-ignore pat accum)
(syntax-case pat ()
@@ -304,6 +325,7 @@ return EXP."
(assq-set! peg-compiler-alist symbol function)))
(add-peg-compiler! 'range cg-range)
+(add-peg-compiler! 'not-in-range cg-not-in-range)
(add-peg-compiler! 'ignore cg-ignore)
(add-peg-compiler! 'capture cg-capture)
(add-peg-compiler! 'and cg-and)
diff --git a/module/ice-9/peg/string-peg.scm b/module/ice-9/peg/string-peg.scm
index 05755693a..f88d2f7d8 100644
--- a/module/ice-9/peg/string-peg.scm
+++ b/module/ice-9/peg/string-peg.scm
@@ -49,7 +49,7 @@ Prefix <-- (AND / NOT)? Suffix
Suffix <-- Primary (QUESTION / STAR / PLUS)?
Primary <-- Identifier !LEFTARROW
/ OPEN Expression CLOSE
- / Literal / Class / DOT
+ / Literal / Class / NotInClass / DOT
# Lexical syntax
Identifier <-- IdentStart IdentCont* Spacing
@@ -59,7 +59,8 @@ IdentCont <- IdentStart / [0-9]
Literal <-- SQUOTE (!SQUOTE Char)* SQUOTE Spacing
/ DQUOTE (!DQUOTE Char)* DQUOTE Spacing
-Class <-- '[' (!']' Range)* ']' Spacing
+NotInClass <-- '[' NOTIN (!']' Range)* ']' Spacing
+Class <-- '[' !NOTIN (!']' Range)* ']' Spacing
Range <-- Char '-' Char / Char
Char <-- '\\\\' [nrt'\"\\[\\]\\\\]
/ '\\\\' [0-7][0-7][0-7]
@@ -72,6 +73,7 @@ SQUOTE <-- [']
DQUOTE <-- [\"]
OPENBRACKET < '['
CLOSEBRACKET < ']'
+NOTIN < '^'
SLASH < '/' Spacing
AND <-- '&' Spacing
NOT <-- '!' Spacing
@@ -116,6 +118,7 @@ EndOfFile < !.
(and OPEN Expression CLOSE)
Literal
Class
+ NotInClass
DOT))
(define-sexp-parser Identifier all
(and IdentStart (* IdentCont) Spacing))
@@ -127,7 +130,11 @@ EndOfFile < !.
(or (and SQUOTE (* (and (not-followed-by SQUOTE) Char)) SQUOTE Spacing)
(and DQUOTE (* (and (not-followed-by DQUOTE) Char)) DQUOTE Spacing)))
(define-sexp-parser Class all
- (and OPENBRACKET (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET
Spacing))
+ (and OPENBRACKET (not-followed-by NOTIN)
+ (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
+(define-sexp-parser NotInClass all
+ (and OPENBRACKET NOTIN
+ (* (and (not-followed-by CLOSEBRACKET) Range)) CLOSEBRACKET Spacing))
(define-sexp-parser Range all
(or (and Char DASH Char) Char))
(define-sexp-parser Char all
@@ -137,6 +144,8 @@ EndOfFile < !.
(and (not-followed-by "\\") peg-any)))
(define-sexp-parser LEFTARROW body
(and (or "<--" "<-" "<") Spacing)) ; NOTE: <-- and < are extensions
+(define-sexp-parser NOTIN none
+ (and "^"))
(define-sexp-parser SLASH none
(and "/" Spacing))
(define-sexp-parser AND all
@@ -271,6 +280,7 @@ EndOfFile < !.
('Identifier (Identifier->defn value for-syntax))
('Expression (Expression->defn value for-syntax))
('Literal (Literal->defn value for-syntax))
+ ('NotInClass (NotInClass->defn value for-syntax))
('Class (Class->defn value for-syntax)))))
;; (Identifier "hello")
@@ -283,13 +293,34 @@ EndOfFile < !.
(define (Literal->defn lst for-syntax)
(apply string (map (lambda (x) (Char->defn x for-syntax)) (cdr lst))))
-;; TODO: empty Class can happen: `[]`, but what does it represent?
+;; (NotInClass ...)
+;; `-> (and ...)
+(define (NotInClass->defn lst for-syntax)
+ #`(and #,@(map (lambda (x) (NotInRange->defn x for-syntax))
+ (cdr lst))))
+
;; (Class ...)
;; `-> (or ...)
(define (Class->defn lst for-syntax)
#`(or #,@(map (lambda (x) (Range->defn x for-syntax))
(cdr lst))))
+;; For one character:
+;; (NotInRange (Char "a"))
+;; `-> (not-in-range #\a #\a)
+;; Or for a range:
+;; (NotInRange (Char "a") (Char "b"))
+;; `-> (not-in-range #\a #\b)
+(define (NotInRange->defn lst for-syntax)
+ (cond
+ ((= 2 (length lst))
+ (let ((ch (Char->defn (second lst) for-syntax)))
+ #`(not-in-range #,ch #,ch)))
+ ((= 3 (length lst))
+ #`(not-in-range
+ #,(Char->defn (second lst) for-syntax)
+ #,(Char->defn (third lst) for-syntax)))))
+
;; For one character:
;; (Range (Char "a"))
;; `-> "a"
diff --git a/test-suite/tests/peg.test b/test-suite/tests/peg.test
index 556145e72..965e1c12f 100644
--- a/test-suite/tests/peg.test
+++ b/test-suite/tests/peg.test
@@ -38,6 +38,7 @@
(Identifier Identifier)
(Literal Literal)
(Class Class)
+ (NotInClass NotInClass)
(Range Range)
(Char Char)
(LEFTARROW LEFTARROW)
@@ -85,7 +86,7 @@
End <-- '*)'
C <- Begin N* End
N <- C / (!Begin !End Z)
-Z <- .")
+Z <- [^X-Z]") ;; Forbid some characters to test not-in-range
;; A short /etc/passwd file.
(define *etc-passwd*
@@ -125,6 +126,9 @@ SLASH < '/'")
(match-pattern C "(*blah*)")
(make-prec 0 8 "(*blah*)"
'((Begin "(*") "blah" (End "*)")))))
+ (pass-if
+ "simple comment with forbidden char"
+ (not (match-pattern C "(*blYh*)")))
(pass-if
"simple comment padded"
(equal?
--
2.46.0
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- bug#73188: [PATCH] PEG: Add support for `not-in-range` and [^...],
Ekaitz Zarraga <=