>From b9a691aa9b7aaa43e07841f11095d779b210448d Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 21 Apr 2014 10:51:16 -0700 Subject: [PATCH] grep: -P now rejects invalid input sequences in UTF-8 locales See and . * NEWS: Document this. * src/pcresearch.c (Pexecute): Do not use PCRE_NO_UTF8_CHECK, as this leads to undefined behavior when the input is not UTF-8. * tests/pcre-infloop, tests/pcre-invalid-utf8-input: Exit status is now 2, not 1, when grep -P is given invalid UTF-8 data in a UTF-8 locale. --- NEWS | 4 ++++ src/pcresearch.c | 17 ++++------------- tests/pcre-infloop | 2 +- tests/pcre-invalid-utf8-input | 5 ++--- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index fbb782b..2d3e12a 100644 --- a/NEWS +++ b/NEWS @@ -14,6 +14,10 @@ GNU grep NEWS -*- outline -*- grep -f no longer mishandles patterns containing NUL bytes. [bug introduced in grep-2.11] + grep -P now reports an error and exits when given invalid UTF-8 data. + Previously it was unreliable, and sometimes crashed or looped. + [bug introduced in grep-2.16] + grep -P now works with -w and -x and backreferences. Before, echo aa|grep -Pw '(.)\1' would fail to match, yet echo aa|grep -Pw '(.)\2' would match. diff --git a/src/pcresearch.c b/src/pcresearch.c index a5e953f..9f63f37 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -52,19 +52,14 @@ Pcompile (char const *pattern, size_t size) int e; char const *ep; char *re = xnmalloc (4, size + 7); - int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); + int flags = (PCRE_MULTILINE + | (match_icase ? PCRE_CASELESS : 0) + | (using_utf8 () ? PCRE_UTF8 : 0)); char const *patlim = pattern + size; char *n = re; char const *p; char const *pnul; - if (using_utf8 ()) - { - /* Enable PCRE's UTF-8 matching. Note also the use of - PCRE_NO_UTF8_CHECK when calling pcre_extra, below. */ - flags |= PCRE_UTF8; - } - /* FIXME: Remove these restrictions. */ if (memchr (pattern, '\n', size)) error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); @@ -154,10 +149,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size, e == PCRE_ERROR_NOMATCH && line_next < buf + size; start_ofs -= line_next - line_buf) { - /* Disable the check that would make an invalid byte - seqence *in the input* trigger a failure. */ - int options = PCRE_NO_UTF8_CHECK; - line_buf = line_next; line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf); if (line_end == NULL) @@ -172,7 +163,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size, error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); e = pcre_exec (cre, extra, line_buf, line_end - line_buf, - start_ofs < 0 ? 0 : start_ofs, options, + start_ofs < 0 ? 0 : start_ofs, 0, sub, sizeof sub / sizeof *sub); } diff --git a/tests/pcre-infloop b/tests/pcre-infloop index 57b67ae..febf356 100755 --- a/tests/pcre-infloop +++ b/tests/pcre-infloop @@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_ fail=0 LC_ALL=en_US.utf8 timeout 3 grep -P 'a.?..b' in -test $? = 1 || fail_ "libpcre's match function appears to infloop" +test $? = 2 || fail_ "libpcre's match function appears to infloop" Exit $fail diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input index ccf3caf..913e8ee 100755 --- a/tests/pcre-invalid-utf8-input +++ b/tests/pcre-invalid-utf8-input @@ -15,8 +15,7 @@ fail=0 printf 'j\202\nj\n' > in || framework_failure_ -LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1 -compare in out || fail=1 -compare /dev/null err || fail=1 +LC_ALL=en_US.UTF-8 grep -P j in +test $? -eq 2 || fail=1 Exit $fail -- 1.9.0