From f784a73a01b823109d660aa8d256535623e98971 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 14 Sep 2014 13:49:18 -0700 Subject: [PROPOSED PATCH 4/6] grep: treat a file as binary if its prefix contains encoding errors * NEWS: * doc/grep.texi (File and Directory Selection): Document this. * src/grep.c (buffer_encoding, buffer_textbin): New functions. (file_textbin): Rename from file_is_binary. Now returns 3-way value. All callers changed. (file_textbin, grep): Check the input more carefully for text vs binary data. (contains_encoding_error): Remove; use replaced by buffer_encoding. * tests/backref-multibyte-slow: * tests/high-bit-range: * tests/invalid-multibyte-infloop: Use -a, since the input is now considered to be binary. * tests/invalid-multibyte-infloop: Add a check for new behavior. --- NEWS | 4 ++ doc/grep.texi | 3 +- src/grep.c | 126 +++++++++++++++++++++++++++------------- tests/backref-multibyte-slow | 2 +- tests/high-bit-range | 2 +- tests/invalid-multibyte-infloop | 14 ++++- 6 files changed, 106 insertions(+), 45 deletions(-) diff --git a/NEWS b/NEWS index 36bb48f..9377d7d 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,10 @@ GNU grep NEWS -*- outline -*- Performance has improved for very long strings in patterns. + If a file contains data improperly encoded for the current locale, + and this is discovered before any of the file's contents are output, + grep now treats the file as binary. + grep -P no longer reports an error and exits when given invalid UTF-8 data. Instead, it considers the data to be non-matching. diff --git a/doc/grep.texi b/doc/grep.texi index c8e4acd..14bd69e 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -592,7 +592,8 @@ this is equivalent to the @samp{--binary-files=text} option. 
@item --binary-files=@var{type} @opindex --binary-files @cindex binary files -If a file's allocation metadata or its first few bytes +If a file's allocation metadata, +or if its data read before a line is selected for output, indicate that the file contains binary data, assume that the file is of type @var{type}. By default, @var{type} is @samp{binary}, diff --git a/src/grep.c b/src/grep.c index 1e0cc6d..ccba1b6 100644 --- a/src/grep.c +++ b/src/grep.c @@ -437,50 +437,74 @@ clean_up_stdout (void) close_stdout (); } -/* Return true if a file is known to be binary for the purpose of 'grep'. +/* Return 1 if BUF (of size SIZE) contains text, -1 if it contains + binary data, and 0 if the answer depends on what comes immediately + after BUF. */ +static int +buffer_textbin (char const *buf, size_t size) +{ + mbstate_t mbs = { 0 }; + size_t charlen; + char badbyte = eolbyte ? '\0' : '\200'; + char const *p; + + for (p = buf; p < buf + size; p += charlen) + { + if (*p == badbyte) + return -1; + charlen = mbrlen (p, buf + size - p, &mbs); + if ((size_t) -2 <= charlen) + return charlen == (size_t) -2 ? 0 : -1; + charlen += !charlen; + } + + return 1; +} + +/* Return 1 if a file is known to be text for the purpose of 'grep'. + Return -1 if it is known to be binary, 0 if unknown. BUF, of size BUFSIZE, is the initial buffer read from the file with descriptor FD and status ST. */ -static bool -file_is_binary (char const *buf, size_t bufsize, int fd, struct stat const *st) +static int +file_textbin (char const *buf, size_t bufsize, int fd, struct stat const *st) { #ifndef SEEK_HOLE enum { SEEK_HOLE = SEEK_END }; #endif - /* If -z, test only whether the initial buffer contains '\200'; - knowing about holes won't help. */ - if (! eolbyte) - return memchr (buf, '\200', bufsize) != 0; + int textbin = buffer_textbin (buf, bufsize); + if (textbin < 0) + return textbin; - /* If the initial buffer contains a null byte, guess that the file - is binary. 
*/ - if (memchr (buf, '\0', bufsize)) - return true; - - /* If the file has holes, it must contain a null byte somewhere. */ - if (SEEK_HOLE != SEEK_END && usable_st_size (st)) + if (usable_st_size (st)) { - off_t cur = bufsize; - if (O_BINARY || fd == STDIN_FILENO) - { - cur = lseek (fd, 0, SEEK_CUR); - if (cur < 0) - return false; - } + if (st->st_size <= bufsize) + return 2 * textbin - 1; - /* Look for a hole after the current location. */ - off_t hole_start = lseek (fd, cur, SEEK_HOLE); - if (0 <= hole_start) + /* If the file has holes, it must contain a null byte somewhere. */ + if (SEEK_HOLE != SEEK_END && eolbyte) { - if (lseek (fd, cur, SEEK_SET) < 0) - suppressible_error (filename, errno); - if (hole_start < st->st_size) - return true; + off_t cur = bufsize; + if (O_BINARY || fd == STDIN_FILENO) + { + cur = lseek (fd, 0, SEEK_CUR); + if (cur < 0) + return 0; + } + + /* Look for a hole after the current location. */ + off_t hole_start = lseek (fd, cur, SEEK_HOLE); + if (0 <= hole_start) + { + if (lseek (fd, cur, SEEK_SET) < 0) + suppressible_error (filename, errno); + if (hole_start < st->st_size) + return -1; + } } } - /* Guess that the file does not contain binary data. */ - return false; + return 0; } /* Convert STR to a nonnegative integer, storing the result in *OUT. 
@@ -1100,7 +1124,7 @@ static intmax_t grep (int fd, struct stat const *st) { intmax_t nlines, i; - bool not_text; + int textbin; size_t residue, save; char oldc; char *beg; @@ -1129,13 +1153,18 @@ grep (int fd, struct stat const *st) return 0; } - not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet) - || binary_files == WITHOUT_MATCH_BINARY_FILES) - && file_is_binary (bufbeg, buflim - bufbeg, fd, st)); - if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES) - return 0; - done_on_match |= not_text; - out_quiet |= not_text; + if (binary_files == TEXT_BINARY_FILES) + textbin = 1; + else + { + textbin = file_textbin (bufbeg, buflim - bufbeg, fd, st); + if (textbin < 0) + { + if (binary_files == WITHOUT_MATCH_BINARY_FILES) + return 0; + done_on_match = out_quiet = true; + } + } for (;;) { @@ -1187,8 +1216,13 @@ grep (int fd, struct stat const *st) } /* Detect whether leading context is adjacent to previous output. */ - if (beg != lastout) - lastout = 0; + if (lastout) + { + if (!textbin) + textbin = 1; + if (beg != lastout) + lastout = 0; + } /* Handle some details and read more data to scan. */ save = residue + lim - beg; @@ -1201,6 +1235,16 @@ grep (int fd, struct stat const *st) suppressible_error (filename, errno); goto finish_grep; } + + /* If the file's textbin has not been determined yet, assume + it's binary if the next input buffer suggests so. */ + if (! 
textbin && buffer_textbin (bufbeg, buflim - bufbeg) < 0) + { + textbin = -1; + if (binary_files == WITHOUT_MATCH_BINARY_FILES) + return 0; + done_on_match = out_quiet = true; + } } if (residue) { @@ -1214,7 +1258,7 @@ grep (int fd, struct stat const *st) finish_grep: done_on_match = done_on_match_0; out_quiet = out_quiet_0; - if ((not_text & ~out_quiet) && nlines != 0) + if (textbin < 0 && !out_quiet && nlines != 0) printf (_("Binary file %s matches\n"), filename); return nlines; } diff --git a/tests/backref-multibyte-slow b/tests/backref-multibyte-slow index ffebb6b..d447a4a 100755 --- a/tests/backref-multibyte-slow +++ b/tests/backref-multibyte-slow @@ -21,7 +21,7 @@ max_seconds=$(LC_ALL=C perl -le 'use Time::HiRes qw(time); my $s = time(); for LOC in en_US.UTF-8; do out=out-$LOC - LC_ALL=$LOC timeout ${max_seconds}s grep -E '^([a-z]).\1$' in > $out 2>&1 + LC_ALL=$LOC timeout ${max_seconds}s grep -aE '^([a-z]).\1$' in > $out 2>&1 test $? = 0 || fail=1 compare $out in || fail=1 done diff --git a/tests/high-bit-range b/tests/high-bit-range index 74b6e65..76c3310 100755 --- a/tests/high-bit-range +++ b/tests/high-bit-range @@ -21,7 +21,7 @@ fail=0 printf '\201\n' > in || framework_failure_ -grep "$(printf '[\201]')" in > out || fail=1 +grep -a "$(printf '[\201]')" in > out || fail=1 compare out in || fail=1 diff --git a/tests/invalid-multibyte-infloop b/tests/invalid-multibyte-infloop index b28bc53..d7c6165 100755 --- a/tests/invalid-multibyte-infloop +++ b/tests/invalid-multibyte-infloop @@ -14,7 +14,7 @@ encode AA > input fail=0 # Before 2.15, this would infloop. -LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out +LC_ALL=en_US.UTF-8 timeout 3 grep -aF $(encode A) input > out status=$? if test $status -eq 0; then compare input out @@ -24,4 +24,16 @@ else test $status -eq 2 fi || fail=1 +echo 'Binary file input matches' >binary-file-matches + +LC_ALL=en_US.UTF-8 timeout 3 grep -F $(encode A) input > out +status=$? 
+if test $status -eq 0; then + compare binary-file-matches out +elif test $status -eq 1; then + compare_dev_null_ /dev/null out +else + test $status -eq 2 +fi || fail=1 + Exit $fail -- 1.9.3