>From f11f0c9351fdd2bd65efdb469754096d1a237d61 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Thu, 27 Feb 2014 09:26:23 -0800
Subject: [PATCH] grep: fix multiple bugs with bracket expressions
* NEWS: Document this.
* src/dfa.c (using_simple_locale): New function.
(parse_bracket_exp): Handle bracket expressions like [a-[.z.]]
correctly. Don't assume that dfaexec handles expressions like
[^a-z] correctly, as they can match multiple characters in some
locales.
* tests/posix-bracket: New file.
* tests/Makefile.am (TESTS): Add it.
---
NEWS | 4 ++
src/dfa.c | 129 +++++++++++++++++++++++++++++-----------------------
tests/Makefile.am | 1 +
tests/posix-bracket | 33 ++++++++++++++
4 files changed, 110 insertions(+), 57 deletions(-)
create mode 100755 tests/posix-bracket
diff --git a/NEWS b/NEWS
index 657f3d1..6cfcaba 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,10 @@ GNU grep NEWS -*- outline -*-
** Bug fixes
+ grep no longer mishandles patterns like [a-[.z.]], and no longer
+ mishandles patterns like [^a] in locales that have multicharacter
+ collating sequences so that [^a] can match a string of two characters.
+
grep -P now works with -w and -x and backreferences. Before,
echo aa|grep -Pw '(.)\1' would fail to match, yet
echo aa|grep -Pw '(.)\2' would match.
diff --git a/src/dfa.c b/src/dfa.c
index 8906ed3..65ab5d6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -182,7 +182,8 @@ enum
EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches
the empty string. */
- BACKREF, /* BACKREF is generated by \<digit>; it
+ BACKREF, /* BACKREF is generated by \<digit>
+ or by any other construct that
is not completely handled. If the scanner
detects a transition on backref, it returns
a kind of "semi-success" indicating that
@@ -769,6 +770,45 @@ using_utf8 (void)
return utf8;
}
+/* Return true if the current locale is known to be a unibyte locale
+ without multicharacter collating sequences and where range
+ comparisons simply use the native encoding. These locales can be
+ processed more efficiently. */
+
+static bool
+using_simple_locale (void)
+{
+ /* True if the native character set is known to be compatible with
+ the C locale. The following test isn't perfect, but it's good
+ enough in practice, as only ASCII and EBCDIC are in common use
+ and this test correctly accepts ASCII and rejects EBCDIC. */
+ enum { native_c_charset = /* Compile-time constant: 1 or 0. */
+ ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
+ && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
+ && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
+ && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
+ && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
+ && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
+ && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
+ && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
+ && '}' == 125 && '~' == 126)
+ };
+
+ if (! native_c_charset || MB_CUR_MAX > 1) /* Charset test failed, or multibyte locale. */
+ return false;
+ else
+ {
+ static int unibyte_c = -1; /* Cached locale-name test; negative means not yet computed. */
+ if (unibyte_c < 0) /* First call that reaches here: compute and remember. */
+ {
+ char *locale = setlocale (LC_ALL, 0); /* Null locale argument: query only, do not change. */
+ unibyte_c = (locale && (STREQ (locale, "C")
+ || STREQ (locale, "POSIX"))); /* Only the names "C" and "POSIX" qualify. */
+ }
+ return unibyte_c; /* Cached value is reused, so a later locale change is not noticed here. */
+ }
+}
+
/* Lexical analyzer. All the dross that deals with the obnoxious
GNU Regex syntax bits is located here. The poor, suffering
reader is referred to the GNU Regex documentation for the
@@ -917,6 +957,10 @@ parse_bracket_exp (void)
int c, c1, c2;
charclass ccl;
+ /* True if this is a bracket expression that dfaexec is known to
+ process correctly. */
+ bool known_bracket_exp = true;
+
/* Used to warn about [:space:].
Bit 0 = first character is a colon.
Bit 1 = last character is a colon.
@@ -958,6 +1002,7 @@ parse_bracket_exp (void)
{
FETCH_WC (c, wc, _("unbalanced ["));
invert = 1;
+ known_bracket_exp = using_simple_locale ();
}
else
invert = 0;
@@ -972,16 +1017,14 @@ parse_bracket_exp (void)
we just treat it as a bunch of ordinary characters. We can do
this because we assume regex has checked for syntax errors before
dfa is ever called. */
- if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))
+ if (c == '[')
{
#define MAX_BRACKET_STRING_LEN 32
char str[MAX_BRACKET_STRING_LEN + 1];
FETCH_WC (c1, wc1, _("unbalanced ["));
- /* If pattern contains '[[:', '[[.', or '[[='. */
- if (c1 == ':'
- /* TODO: handle '[[.' and '[[=' also for MB_CUR_MAX == 1. */
- || (MB_CUR_MAX > 1 && (c1 == '.' || c1 == '=')))
+ if ((c1 == ':' && syntax_bits & RE_CHAR_CLASSES)
+ || c1 == '.' || c1 == '=')
{
size_t len = 0;
for (;;)
@@ -1000,7 +1043,10 @@ parse_bracket_exp (void)
/* Fetch bracket. */
FETCH_WC (c, wc, _("unbalanced ["));
if (c1 == ':')
- /* build character class. */
+ /* Build character class. POSIX allows character
+ classes to match multicharacter collating elements,
+ but the regex code does not support that, so do not
+ worry about that possibility. */
{
char const *class
= (case_fold && (STREQ (str, "upper")
@@ -1024,28 +1070,9 @@ parse_bracket_exp (void)
if (pred->func (c2))
setbit_case_fold_c (c2, ccl);
}
+ else
+ known_bracket_exp = false;
- else if (MBS_SUPPORT && (c1 == '=' || c1 == '.'))
- {
- char *elem = xmemdup (str, len + 1);
-
- if (c1 == '=')
- /* build equivalence class. */
- {
- REALLOC_IF_NECESSARY (work_mbc->equivs,
- equivs_al, work_mbc->nequivs + 1);
- work_mbc->equivs[work_mbc->nequivs++] = elem;
- }
-
- if (c1 == '.')
- /* build collating element. */
- {
- REALLOC_IF_NECESSARY (work_mbc->coll_elems,
- coll_elems_al,
- work_mbc->ncoll_elems + 1);
- work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
- }
- }
colon_warning_state |= 8;
/* Fetch new lookahead character. */
@@ -1067,6 +1094,16 @@ parse_bracket_exp (void)
/* build range characters. */
{
FETCH_WC (c2, wc2, _("unbalanced ["));
+
+ /* A bracket expression like [a-[.aa.]] matches an unknown set.
+ Treat it like [-a[.aa.]] while parsing it, and
+ remember that the set is unknown. */
+ if (c2 == '[' && *lexptr == '.')
+ {
+ known_bracket_exp = false;
+ c2 = ']';
+ }
+
if (c2 == ']')
{
/* In the case [x-], the - is an ordinary hyphen,
@@ -1104,36 +1141,11 @@ parse_bracket_exp (void)
work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
}
}
+ else if (using_simple_locale ())
+ for (; c <= c2; c++)
+ setbit_case_fold_c (c, ccl);
else
- {
- /* Defer to the system regex library about the meaning
- of range expressions. */
- struct re_pattern_buffer re = { 0 };
- char const *compile_msg;
-#if 199901 <= __STDC_VERSION__
- char pattern[] = { '[', '\\', c, '-', '\\', c2, ']' };
-#else
- char pattern[] = { '[', '\\', 0, '-', '\\', 0, ']' };
- pattern[2] = c;
- pattern[5] = c2;
-#endif
- re_set_syntax (syntax_bits | RE_BACKSLASH_ESCAPE_IN_LISTS);
- compile_msg = re_compile_pattern (pattern, sizeof pattern, &re);
- if (compile_msg)
- dfaerror (compile_msg);
- for (c = 0; c < NOTCHAR; c++)
- {
- char subject = c;
- switch (re_match (&re, &subject, 1, 0, NULL))
- {
- case 1: setbit (c, ccl); break;
- case -1: break;
- default: xalloc_die ();
- }
- }
- regfree (&re);
- re_set_syntax (syntax_bits);
- }
+ known_bracket_exp = false;
colon_warning_state |= 8;
FETCH_WC (c1, wc1, _("unbalanced ["));
@@ -1171,6 +1183,9 @@ parse_bracket_exp (void)
if (colon_warning_state == 7)
dfawarn (_("character class syntax is [[:space:]], not [:space:]"));
+ if (! known_bracket_exp)
+ return BACKREF;
+
if (MB_CUR_MAX > 1)
{
static charclass zeroclass;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 742a580..972ffc5 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -86,6 +86,7 @@ TESTS = \
pcre-w \
pcre-wx-backref \
pcre-z \
+ posix-bracket \
prefix-of-multibyte \
r-dot \
repetition-overflow \
diff --git a/tests/posix-bracket b/tests/posix-bracket
new file mode 100755
index 0000000..d9d1d84
--- /dev/null
+++ b/tests/posix-bracket
@@ -0,0 +1,33 @@
+#!/bin/sh
+# Check various bracket expressions in the POSIX locale.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+LC_ALL=C  # Run every grep below in the C locale.
+export LC_ALL
+
+fail=0  # Set to 1 by any failed check; passed to Exit at the end.
+
+echo a >in || framework_failure_  # Input file is the single line "a".
+for bracketed in '[.a.]' '[.a.]-a' 'a-[.a.]' '[.a.]-[.a.]' \
+ '[=a=]' '[:alpha:]'; do
+ grep "[$bracketed]" in >out || fail=1  # The bracket expression must match "a" ...
+ compare in out || fail=1  # ... and the output must equal the input.
+ grep "[^$bracketed]" in >out && fail=1  # The negated expression must not match "a" ...
+ compare /dev/null out || fail=1  # ... so the output must be empty.
+done
+Exit $fail
--
1.8.5.3