bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

bug#22382: [PATCH] grep: -x now supersedes -w more consistently


From: Paul Eggert
Subject: bug#22382: [PATCH] grep: -x now supersedes -w more consistently
Date: Fri, 15 Jan 2016 23:06:22 -0800

* NEWS, doc/grep.texi (Matching Control): Mention this.
* src/dfasearch.c (EGexecute):
* src/pcresearch.c (Pcompile):
Don't get confused by -w if -x is also present.
* src/pcresearch.c (Pcompile): Remove misleading comment about
non-UTF-8 multibyte locales, as PCRE doesn't support them.
Calculate buffer sizes more carefully; the old method
allocated a buffer slightly too big, seemingly due to luck.
* tests/backref-word, tests/pcre: Add tests for this bug.
---
 NEWS               |  5 ++++-
 doc/grep.texi      |  1 +
 src/dfasearch.c    |  6 +++---
 src/pcresearch.c   | 24 +++++++++++++-----------
 tests/backref-word |  4 ++++
 tests/pcre         |  5 ++++-
 6 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/NEWS b/NEWS
index a0f6bbb..9de7fcb 100644
--- a/NEWS
+++ b/NEWS
@@ -32,9 +32,12 @@ GNU grep NEWS                                    -*- outline -*-
   This partly reverts the --exclude-related change in 2.22.
   [bug introduced in grep-2.22]
 
-  --line-buffer is no longer ineffective when combined with -l
+  --line-buffer is no longer ineffective when combined with -l.
   [bug introduced in grep-2.5]
 
+  -xw is now equivalent to -x more consistently, with -P and with backrefs.
+  [bug only partially fixed in grep-2.19]
+
 
 * Noteworthy changes in release 2.22 (2015-11-01) [stable]
 
diff --git a/doc/grep.texi b/doc/grep.texi
index 76769b9..8883b27 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -233,6 +233,7 @@ Similarly,
 it must be either at the end of the line
 or followed by a non-word constituent character.
 Word-constituent characters are letters, digits, and the underscore.
+This option has no effect if @option{-x} is also specified.
 
 @item -x
 @itemx --line-regexp
diff --git a/src/dfasearch.c b/src/dfasearch.c
index a330eac..e04a2df 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -363,14 +363,14 @@ EGexecute (char *buf, size_t size, size_t *match_size,
                   len = end - ptr;
                   goto assess_pattern_match;
                 }
-              /* If -w, check if the match aligns with word boundaries.
-                 We do this iteratively because:
+              /* If -w and not -x, check whether the match aligns with
+                 word boundaries.  Do this iteratively because:
                  (a) the line may contain more than one occurrence of the
                  pattern, and
                  (b) Several alternatives in the pattern might be valid at a
                  given point, and we may need to consider a shorter one to
                  find a word boundary.  */
-              if (match_words)
+              if (!match_lines && match_words)
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 1fae94d..3fee67a 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -98,7 +98,13 @@ Pcompile (char const *pattern, size_t size)
 #else
   int e;
   char const *ep;
-  char *re = xnmalloc (4, size + 7);
+  static char const wprefix[] = "(?<!\\w)(?:";
+  static char const wsuffix[] = ")(?!\\w)";
+  static char const xprefix[] = "^(?:";
+  static char const xsuffix[] = ")$";
+  int fix_len_max = MAX (sizeof wprefix - 1 + sizeof wsuffix - 1,
+                         sizeof xprefix - 1 + sizeof xsuffix - 1);
+  char *re = xnmalloc (4, size + (fix_len_max + 4 - 1) / 4);
   int flags = (PCRE_MULTILINE
                | (match_icase ? PCRE_CASELESS : 0));
   char const *patlim = pattern + size;
@@ -120,20 +126,16 @@ Pcompile (char const *pattern, size_t size)
     error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
   *n = '\0';
-  if (match_lines)
-    strcpy (n, "^(?:");
   if (match_words)
-    strcpy (n, "(?<!\\w)(?:");
+    strcpy (n, wprefix);
+  if (match_lines)
+    strcpy (n, xprefix);
   n += strlen (n);
 
   /* The PCRE interface doesn't allow NUL bytes in the pattern, so
      replace each NUL byte in the pattern with the four characters
      "\000", removing a preceding backslash if there are an odd
-     number of backslashes before the NUL.
-
-     FIXME: This method does not work with some multibyte character
-     encodings, notably Shift-JIS, where a multibyte character can end
-     in a backslash byte.  */
+     number of backslashes before the NUL.  */
   for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
     {
       memcpy (n, p, pnul - p);
@@ -149,9 +151,9 @@ Pcompile (char const *pattern, size_t size)
   n += patlim - p;
   *n = '\0';
   if (match_words)
-    strcpy (n, ")(?!\\w)");
+    strcpy (n, wsuffix);
   if (match_lines)
-    strcpy (n, ")$");
+    strcpy (n, xsuffix);
 
   cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
   if (!cre)
diff --git a/tests/backref-word b/tests/backref-word
index 557c6d8..e5b5486 100755
--- a/tests/backref-word
+++ b/tests/backref-word
@@ -9,6 +9,10 @@ for LOC in en_US.UTF-8 zh_CN $LOCALE_FR_UTF8; do
   out=out1-$LOC
   LC_ALL=$LOC grep -w '\(foo\) \1' exp1 > $out || fail=1
   compare exp1 $out || fail=1
+
+  LC_ALL=$LOC grep -wx '\(foo\) \1' exp1 > $out
+  test $? -eq 1 || fail=1
+  compare /dev/null $out || fail=1
 done
 
 Exit $fail
diff --git a/tests/pcre b/tests/pcre
index a9dfb4b..92e788e 100755
--- a/tests/pcre
+++ b/tests/pcre
@@ -1,5 +1,5 @@
 #! /bin/sh
-# Ensure that with -P, \s matches a newline.
+# Simple PCRE tests.
 #
 # Copyright (C) 2001, 2006, 2009-2016 Free Software Foundation, Inc.
 #
@@ -15,4 +15,7 @@ fail=0
 echo | grep -P '\s*$' || fail=1
 echo | grep -zP '\s$' || fail=1
 
+echo '.ab' | grep -Pwx ab
+test $? -eq 1 || fail=1
+
 Exit $fail
-- 
2.5.0






reply via email to

[Prev in Thread] Current Thread [Next in Thread]