grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.12-12-g582cdfa


From: Paul Eggert
Subject: grep branch, master, updated. v2.12-12-g582cdfa
Date: Tue, 15 May 2012 16:28:04 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  582cdfacf297181c2c5ffec83fd8a3c0f6562fc6 (commit)
      from  6c4a43ab84bc20d39128c390d14fba36f668ad98 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=582cdfacf297181c2c5ffec83fd8a3c0f6562fc6


commit 582cdfacf297181c2c5ffec83fd8a3c0f6562fc6
Author: Paul Eggert <address@hidden>
Date:   Tue May 15 09:26:02 2012 -0700

    grep: sparse files are now considered binary
    
    * NEWS: Document this.
    * doc/grep.texi (File and Directory Selection): Likewise.
    * bootstrap.conf (gnulib_modules): Add stat-size.
    * src/main.c: Include stat-size.h.
    (usable_st_size): New function, mostly stolen from coreutils.
    (fillbuf): Use it.
    (file_is_binary): New function, which looks for holes too.
    (grep): Use it.
    * tests/Makefile.am (TESTS): Add big-hole.
    * tests/big-hole: New file.

diff --git a/NEWS b/NEWS
index 1497b92..f515e84 100644
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,11 @@ GNU grep NEWS                                    -*- outline -*-
   PATTERN *" again reads all *.c and *.h files except for system.h.
   [bug introduced in grep-2.6]
 
+** New features
+
+  'grep' without -z now treats a sparse file as binary, if it can
+  easily determine that the file is sparse.
+
 ** Dropped features
 
   Bootstrapping with Makefile.boot has been broken since grep 2.6,
@@ -45,7 +50,6 @@ GNU grep NEWS                                    -*- outline -*-
   use -R if you prefer the old behavior of following all symlinks and
   defaulting to reading all devices.
 
-
 * Noteworthy changes in release 2.11 (2012-03-02) [stable]
 
 ** Bug fixes
diff --git a/bootstrap.conf b/bootstrap.conf
index a7853c9..57749b4 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -69,6 +69,7 @@ realloc-gnu
 regex
 same-inode
 ssize_t
+stat-size
 stddef
 stdlib
 stpcpy
diff --git a/doc/grep.texi b/doc/grep.texi
index 3b52a19..0e519dd 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -580,7 +580,8 @@ this is equivalent to the @samp{--binary-files=text} option.
 @item --binary-files=@var{type}
 @opindex --binary-files
 @cindex binary files
-If the first few bytes of a file indicate that the file contains binary data,
+If a file's allocation metadata or its first few bytes
+indicate that the file contains binary data,
 assume that the file is of type @var{type}.
 By default, @var{type} is @samp{binary},
 and @command{grep} normally outputs either
@@ -722,8 +723,8 @@ better performance.
 @cindex binary files, MS-DOS/MS-Windows
 Treat the file(s) as binary.
 By default, under MS-DOS and MS-Windows,
-@command{grep} guesses the file type
-by looking at the contents of the first 32kB read from the file.
+@command{grep} guesses whether a file is text or binary
+as described for the @option{--binary-files} option.
 If @command{grep} decides the file is a text file,
 it strips the @code{CR} characters from the original file contents
 (to make regular expressions with @code{^} and @code{$} work correctly).
diff --git a/src/main.c b/src/main.c
index bc9177e..10fbfac 100644
--- a/src/main.c
+++ b/src/main.c
@@ -44,6 +44,7 @@
 #include "progname.h"
 #include "propername.h"
 #include "quote.h"
+#include "stat-size.h"
 #include "version-etc.h"
 #include "xalloc.h"
 #include "xstrtol.h"
@@ -406,6 +407,14 @@ is_device_mode (mode_t m)
   return S_ISCHR (m) || S_ISBLK (m) || S_ISSOCK (m) || S_ISFIFO (m);
 }
 
+/* Return nonzero if ST->st_size is defined.  Assume the file is not a
+   symbolic link.  */
+static int
+usable_st_size (struct stat const *st)
+{
+  return S_ISREG (st->st_mode) || S_TYPEISSHM (st) || S_TYPEISTMO (st);
+}
+
 /* Functions we'll use to search. */
 static compile_fp_t compile;
 static execute_fp_t execute;
@@ -428,6 +437,70 @@ clean_up_stdout (void)
     close_stdout ();
 }
 
+/* Return 1 if a file is known to be binary for the purpose of 'grep'.
+   BUF, of size BUFSIZE, is the initial buffer read from the file with
+   descriptor FD and status ST.  */
+static int
+file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st)
+{
+  #ifndef HAVE_STRUCT_STAT_ST_BLOCKS
+  enum { HAVE_STRUCT_STAT_ST_BLOCKS = 0 };
+  #endif
+  #ifndef SEEK_HOLE
+  enum { SEEK_HOLE = SEEK_END };
+  #endif
+
+  /* If -z, test only whether the initial buffer contains '\200';
+     knowing about holes won't help.  */
+  if (! eolbyte)
+    return memchr (buf, '\200', bufsize) != 0;
+
+  /* If the initial buffer contains a null byte, guess that the file
+     is binary.  */
+  if (memchr (buf, '\0', bufsize))
+    return 1;
+
+  /* If the file has holes, it must contain a null byte somewhere.  */
+  if ((HAVE_STRUCT_STAT_ST_BLOCKS || SEEK_HOLE != SEEK_END)
+      && usable_st_size (st))
+    {
+      off_t cur = bufsize;
+      if (O_BINARY || fd == STDIN_FILENO)
+        {
+          cur = lseek (fd, 0, SEEK_CUR);
+          if (cur < 0)
+            return 0;
+        }
+
+      /* If the file has fewer blocks than would be needed to
+         represent its data, then it must have at least one hole.  */
+      if (HAVE_STRUCT_STAT_ST_BLOCKS)
+        {
+          off_t nonzeros_needed = st->st_size - cur + bufsize;
+          off_t full_blocks = nonzeros_needed / ST_NBLOCKSIZE;
+          int partial_block = 0 < nonzeros_needed % ST_NBLOCKSIZE;
+          if (ST_NBLOCKS (*st) < full_blocks + partial_block)
+            return 1;
+        }
+
+      /* Look for a hole after the current location.  */
+      if (SEEK_HOLE != SEEK_END)
+        {
+          off_t hole_start = lseek (fd, cur, SEEK_HOLE);
+          if (0 <= hole_start)
+            {
+              if (lseek (fd, cur, SEEK_SET) < 0)
+                suppressible_error (filename, errno);
+              if (hole_start < st->st_size)
+                return 1;
+            }
+        }
+    }
+
+  /* Guess that the file does not contain binary data.  */
+  return 0;
+}
+
 /* Convert STR to a nonnegative integer, storing the result in *OUT.
    STR must be a valid context length argument; report an error if it
    isn't.  Silently ceiling *OUT at the maximum value, as that is
@@ -559,7 +632,7 @@ fillbuf (size_t save, struct stat const *st)
          is large.  However, do not use the original file size as a
          heuristic if we've already read past the file end, as most
          likely the file is growing.  */
-      if (S_ISREG (st->st_mode))
+      if (usable_st_size (st))
         {
           off_t to_be_read = st->st_size - bufoffset;
           off_t maxsize_off = save + to_be_read;
@@ -1133,7 +1206,7 @@ grep (int fd, struct stat const *st)
 
   not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
                || binary_files == WITHOUT_MATCH_BINARY_FILES)
-              && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
+              && file_is_binary (bufbeg, buflim - bufbeg, fd, st));
   if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
     return 0;
   done_on_match += not_text;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index d0d622b..7be788c 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -38,6 +38,7 @@ TESTS =                                               \
   backref                                      \
   backref-multibyte-slow                       \
   backref-word                                 \
+  big-hole                                     \
   big-match                                    \
   bogus-wctob                                  \
   bre                                          \
diff --git a/tests/big-hole b/tests/big-hole
new file mode 100755
index 0000000..47e36e1
--- /dev/null
+++ b/tests/big-hole
@@ -0,0 +1,31 @@
+#!/bin/sh
+# Check that grep --binary-file=without-match quickly skips files with holes.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+expensive_
+
+# Try to make this test not THAT expensive, on typical hosts.
+virtual_memory_KiB=10240
+if echo x | (ulimit -v $virtual_memory_KiB && grep x) >/dev/null 2>&1; then
+  ulimit -v $virtual_memory_KiB
+fi
+
+# Create a file that starts with at least a buffer's worth of text,
+# but has a big hole later.
+ten='1 2 3 4 5 6 7 8 9 10'
+x='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
+(for i in $ten; do
+   for j in $ten; do
+     for k in $ten; do
+       echo $x
+     done
+   done
+ done
+ echo x | dd bs=1024k seek=8000000
+) >8T-or-so || skip_ 'cannot create big sparse file'
+
+grep --binary-file=without-match x 8T-or-so >/dev/null
+test $? -eq 1 || fail=1
+
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS              |    6 +++-
 bootstrap.conf    |    1 +
 doc/grep.texi     |    7 +++--
 src/main.c        |   77 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 tests/Makefile.am |    1 +
 tests/big-hole    |   31 +++++++++++++++++++++
 6 files changed, 117 insertions(+), 6 deletions(-)
 create mode 100755 tests/big-hole


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]