>From 6c3ead6bd5c61801915dcedbb8dd17622610a899 Mon Sep 17 00:00:00 2001 From: Noam Postavsky Date: Sat, 2 Dec 2017 19:01:54 -0500 Subject: [PATCH] Raise limit of regexp repetition (Bug#24914) * src/regex.h (RE_DUP_MAX): Raise limit to 2^16-1. * etc/NEWS: Announce it. * doc/lispref/searching.texi (Regexp Backslash): Document it. * test/src/regex-tests.el (regex-repeat-limit): Test it. * src/regex.h (reg_errcode_t): Add REG_ESIZEBR code. * src/regex.c (re_error_msgid): Add corresponding entry. (GET_INTERVAL_COUNT): Return it instead of the more generic REG_EBADBR when encountering a repetition greater than RE_DUP_MAX. * lisp/isearch.el (isearch-search): Don't convert errors starting with "Invalid" into "incomplete". Such errors are not incomplete, in the sense that they cannot be corrected by appending more characters to the end of the regexp. The affected error messages are: - REG_BADPAT "Invalid regular expression" - \\(?X:\\) where X is not a legal group number - \\_X where X is not < or > - REG_ECOLLATE "Invalid collation character" - There is no code to throw this. - REG_ECTYPE "Invalid character class name" - [[:foo:] where foo is not a valid class name - REG_ESUBREG "Invalid back reference" - \N where N is referenced before matching group N - REG_BADBR "Invalid content of \\{\\}" - \\{N,M\\} where N < 0, M < N, M or N larger than max - \\{NX where X is not a digit or backslash - \\{N\\X where X is not a } - REG_ERANGE "Invalid range end" - There is no code to throw this. - REG_BADRPT "Invalid preceding regular expression" - We never throw this. It would usually indicate a "*" with no preceding regexp text, but Emacs allows that to match a literal "*". 
--- doc/lispref/searching.texi | 10 +++++++++- etc/NEWS | 8 ++++++++ lisp/isearch.el | 2 +- src/regex.c | 5 +++-- src/regex.h | 9 ++++++--- test/src/regex-tests.el | 6 ++++++ 6 files changed, 33 insertions(+), 7 deletions(-) diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi index 755fa554bb..ab52cf2802 100644 --- a/doc/lispref/searching.texi +++ b/doc/lispref/searching.texi @@ -639,7 +639,15 @@ Regexp Backslash is a more general postfix operator that specifies repetition with a minimum of @var{m} repeats and a maximum of @var{n} repeats. If @var{m} is omitted, the minimum is 0; if @var{n} is omitted, there is no -maximum. +maximum. For both forms, @var{m} and @var{n}, if specified, may be no +larger than +@ifnottex +2**16 @minus{} 1 +@end ifnottex +@tex +@math{2^{16}-1} +@end tex +. For example, @samp{c[ad]\@{1,2\@}r} matches the strings @samp{car}, @samp{cdr}, @samp{caar}, @samp{cadr}, @samp{cdar}, and @samp{cddr}, and diff --git a/etc/NEWS b/etc/NEWS index 64b53d88c8..c7efc53f6a 100644 --- a/etc/NEWS +++ b/etc/NEWS @@ -509,6 +509,14 @@ instead. ** The new user option 'arabic-shaper-ZWNJ-handling' controls how to handle ZWNJ in Arabic text rendering. ++++ +** The limit on repetitions in regexps has been raised to 2^16-1. +It was previously undocumented and limited to 2^15-1. 
For example, +the following regular expression was previously invalid, but is now +accepted: + + x\{32768\} + * Editing Changes in Emacs 26.1 diff --git a/lisp/isearch.el b/lisp/isearch.el index 13fa97ea71..093185a096 100644 --- a/lisp/isearch.el +++ b/lisp/isearch.el @@ -2851,7 +2851,7 @@ isearch-search (setq isearch-error (car (cdr lossage))) (cond ((string-match - "\\`Premature \\|\\`Unmatched \\|\\`Invalid " + "\\`Premature \\|\\`Unmatched " isearch-error) (setq isearch-error "incomplete input")) ((and (not isearch-regexp) diff --git a/src/regex.c b/src/regex.c index 330f2f78a8..ab74f457d4 100644 --- a/src/regex.c +++ b/src/regex.c @@ -1200,7 +1200,8 @@ WEAK_ALIAS (__re_set_syntax, re_set_syntax) gettext_noop ("Premature end of regular expression"), /* REG_EEND */ gettext_noop ("Regular expression too big"), /* REG_ESIZE */ gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */ - gettext_noop ("Range striding over charsets") /* REG_ERANGEX */ + gettext_noop ("Range striding over charsets"), /* REG_ERANGEX */ + gettext_noop ("Invalid content of \\{\\}, repetitions too big") /* REG_ESIZEBR */ }; /* Whether to allocate memory during matching. */ @@ -1921,7 +1922,7 @@ while (REMAINING_AVAIL_SLOTS <= space) { \ if (num < 0) \ num = 0; \ if (RE_DUP_MAX / 10 - (RE_DUP_MAX % 10 < c - '0') < num) \ - FREE_STACK_RETURN (REG_BADBR); \ + FREE_STACK_RETURN (REG_ESIZEBR); \ num = num * 10 + c - '0'; \ if (p == pend) \ FREE_STACK_RETURN (REG_EBRACE); \ diff --git a/src/regex.h b/src/regex.h index 9fa8356011..4c8632d6aa 100644 --- a/src/regex.h +++ b/src/regex.h @@ -270,8 +270,10 @@ #ifdef RE_DUP_MAX # undef RE_DUP_MAX #endif -/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows. */ -#define RE_DUP_MAX (0x7fff) +/* Repeat counts are stored in opcodes as 2 byte integers. This was + previously limited to 7fff because the parsing code uses signed + ints. But Emacs only runs on platforms with at least 32 bit ints anyway. 
*/ +#define RE_DUP_MAX (0xffff) /* POSIX `cflags' bits (i.e., information for `regcomp'). */ @@ -337,7 +339,8 @@ REG_EEND, /* Premature end. */ REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */ REG_ERPAREN, /* Unmatched ) or \); not returned from regcomp. */ - REG_ERANGEX /* Range striding over charsets. */ + REG_ERANGEX, /* Range striding over charsets. */ + REG_ESIZEBR /* n or m too big in \{n,m\} */ } reg_errcode_t; /* This data structure represents a compiled pattern. Before calling diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el index b1f1ea71ce..872d16a085 100644 --- a/test/src/regex-tests.el +++ b/test/src/regex-tests.el @@ -677,4 +677,10 @@ regex-tests-TESTS This evaluates the TESTS test cases from glibc." (should-not (regex-tests-TESTS))) +(ert-deftest regex-repeat-limit () + "Test the #xFFFF repeat limit." + (should (string-match "\\`x\\{65535\\}" (make-string 65535 ?x))) + (should-not (string-match "\\`x\\{65535\\}" (make-string 65534 ?x))) + (should-error (string-match "\\`x\\{65536\\}" "X") :type 'invalid-regexp)) + ;;; regex-tests.el ends here -- 2.11.0