From b9a691aa9b7aaa43e07841f11095d779b210448d Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Mon, 21 Apr 2014 10:51:16 -0700
Subject: [PATCH] grep: -P now rejects invalid input sequences in UTF-8 locales
See the two bug reports cited by the original commit (their URLs were lost when this patch was extracted).
* NEWS: Document this.
* src/pcresearch.c (Pexecute): Do not use PCRE_NO_UTF8_CHECK,
as this leads to undefined behavior when the input is not UTF-8.
* tests/pcre-infloop, tests/pcre-invalid-utf8-input:
Exit status is now 2, not 1, when grep -P is given invalid UTF-8
data in a UTF-8 locale.
---
NEWS | 4 ++++
src/pcresearch.c | 17 ++++-------------
tests/pcre-infloop | 2 +-
tests/pcre-invalid-utf8-input | 5 ++---
4 files changed, 11 insertions(+), 17 deletions(-)
diff --git a/NEWS b/NEWS
index fbb782b..2d3e12a 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,10 @@ GNU grep NEWS -*- outline -*-
grep -f no longer mishandles patterns containing NUL bytes.
[bug introduced in grep-2.11]
+ grep -P now reports an error and exits when given invalid UTF-8 data.
+ Previously it was unreliable, and sometimes crashed or looped.
+ [bug introduced in grep-2.16]
+
grep -P now works with -w and -x and backreferences. Before,
echo aa|grep -Pw '(.)\1' would fail to match, yet
echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a5e953f..9f63f37 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -52,19 +52,14 @@ Pcompile (char const *pattern, size_t size)
int e;
char const *ep;
char *re = xnmalloc (4, size + 7);
- int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+ int flags = (PCRE_MULTILINE
+ | (match_icase ? PCRE_CASELESS : 0)
+ | (using_utf8 () ? PCRE_UTF8 : 0));
char const *patlim = pattern + size;
char *n = re;
char const *p;
char const *pnul;
- if (using_utf8 ())
- {
- /* Enable PCRE's UTF-8 matching. Note also the use of
- PCRE_NO_UTF8_CHECK when calling pcre_extra, below. */
- flags |= PCRE_UTF8;
- }
-
/* FIXME: Remove these restrictions. */
if (memchr (pattern, '\n', size))
error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
@@ -154,10 +149,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
e == PCRE_ERROR_NOMATCH && line_next < buf + size;
start_ofs -= line_next - line_buf)
{
- /* Disable the check that would make an invalid byte
- seqence *in the input* trigger a failure. */
- int options = PCRE_NO_UTF8_CHECK;
-
line_buf = line_next;
line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
if (line_end == NULL)
@@ -172,7 +163,7 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
- start_ofs < 0 ? 0 : start_ofs, options,
+ start_ofs < 0 ? 0 : start_ofs, 0,
sub, sizeof sub / sizeof *sub);
}
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 57b67ae..febf356 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
fail=0
LC_ALL=en_US.utf8 timeout 3 grep -P 'a.?..b' in
-test $? = 1 || fail_ "libpcre's match function appears to infloop"
+test $? = 2 || fail_ "libpcre's match function appears to infloop"
Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index ccf3caf..913e8ee 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -15,8 +15,7 @@ fail=0
printf 'j\202\nj\n' > in || framework_failure_
-LC_ALL=en_US.UTF-8 grep -P j in > out 2>&1 || fail=1
-compare in out || fail=1
-compare /dev/null err || fail=1
+LC_ALL=en_US.UTF-8 grep -P j in
+test $? -eq 2 || fail=1
Exit $fail
--
1.9.0