[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 7/9] grep: split search.c
From: |
Paolo Bonzini |
Subject: |
[PATCH 7/9] grep: split search.c |
Date: |
Fri, 19 Mar 2010 12:36:50 +0100 |
* po/POTFILES.in: Update.
* src/Makefile.am (grep_SOURCES, egrep_SOURCES, fgrep_SOURCES): Move
kwset.c and dfa.c to libsearch.a. Add searchutils.c there too.
* src/search.h, src/dfasearch.c, src/pcresearch.c, src/kwsearch.c,
src/searchutils.c: New files, split out of src/search.c.
* src/esearch.c, src/fsearch.c: Include the new files instead of search.c.
* src/gsearch.c: Likewise, plus move Gcompile/Acompile here.
---
po/POTFILES.in | 4 +-
src/Makefile.am | 18 +-
src/dfasearch.c | 394 +++++++++++++++++++++++++
src/esearch.c | 3 +-
src/fsearch.c | 3 +-
src/gsearch.c | 18 +-
src/kwsearch.c | 161 ++++++++++
src/pcresearch.c | 177 +++++++++++
src/search.c | 844 -----------------------------------------------------
src/search.h | 47 +++
src/searchutils.c | 141 +++++++++
11 files changed, 952 insertions(+), 858 deletions(-)
create mode 100644 src/dfasearch.c
create mode 100644 src/kwsearch.c
create mode 100644 src/pcresearch.c
delete mode 100644 src/search.c
create mode 100644 src/search.h
create mode 100644 src/searchutils.c
diff --git a/po/POTFILES.in b/po/POTFILES.in
index 920413e..286c731 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -26,4 +26,6 @@ lib/xstrtol-error.c
src/dfa.c
src/grep.c
src/kwset.c
-src/search.c
+src/dfasearch.c
+src/kwsearch.c
+src/pcresearch.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 0b0140e..7ebc126 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,16 +19,20 @@ LN = ln
AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
bin_PROGRAMS = grep egrep fgrep
-grep_SOURCES = grep.c gsearch.c kwset.c dfa.c
-egrep_SOURCES = egrep.c esearch.c kwset.c dfa.c
-fgrep_SOURCES = fgrep.c fsearch.c kwset.c
-noinst_HEADERS = grep.h dfa.h kwset.h system.h mbsupport.h
+grep_SOURCES = grep.c gsearch.c
+egrep_SOURCES = egrep.c esearch.c
+fgrep_SOURCES = fgrep.c fsearch.c
+noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h
-LDADD = $(LIBINTL) ../lib/libgreputils.a
-grep_LDADD = $(PCRE_LIBS) $(LDADD)
+noinst_LIBRARIES = libsearch.a
+libsearch_a_SOURCES = kwset.c dfa.c searchutils.c
+
+LDADD = $(LIBINTL) libsearch.a ../lib/libgreputils.a
+grep_LDADD = $(LDADD) $(PCRE_LIBS)
localedir = $(datadir)/locale
INCLUDES = -I$(top_srcdir)/lib -DLOCALEDIR=\"$(localedir)\"
EXTRA_DIST = \
- dosbuf.c search.c \
+ dosbuf.c \
+ pcresearch.c dfasearch.c kwsearch.c \
vms_fab.c vms_fab.h
diff --git a/src/dfasearch.c b/src/dfasearch.c
new file mode 100644
index 0000000..ef9404b
--- /dev/null
+++ b/src/dfasearch.c
@@ -0,0 +1,394 @@
+/* dfasearch.c - searching subroutines using dfa and regex for grep.
+ Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+#include "dfa.h"
+
+/* For -w, we also consider _ to be word constituent. */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* KWset compiled pattern. For Ecompile and Gcompile, we compile
+ a list of strings, at least one of which is known to occur in
+ any string matching the regexp. */
+static kwset_t kwset;
+
+/* DFA compiled regexp. */
+static struct dfa dfa;
+
+/* The Regex compiled patterns. */
+static struct patterns
+{
+ /* Regex compiled regexp. */
+ struct re_pattern_buffer regexbuf;
+ struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+ address@hidden library interface in regex.c. */
+} patterns0;
+
+struct patterns *patterns;
+size_t pcount;
+
+void
+dfaerror (char const *mesg)
+{
+ error (EXIT_TROUBLE, 0, "%s", mesg);
+}
+
+/* Number of compiled fixed strings known to exactly match the regexp.
+ If kwsexec returns < kwset_exact_matches, then we don't need to
+ call the regexp matcher at all. */
+static int kwset_exact_matches;
+
+static char const *
+kwsincr_case (const char *must)
+{
+ const char *buf;
+ size_t n;
+
+ n = strlen (must);
+#ifdef MBS_SUPPORT
+ if (match_icase && MB_CUR_MAX > 1)
+ buf = mbtolower (must, &n);
+ else
+#endif
+ buf = must;
+ return kwsincr (kwset, buf, n);
+}
+
+/* If the DFA turns out to have some set of fixed strings one of
+ which must occur in the match, then we build a kwset matcher
+ to find those strings, and thus quickly filter out impossible
+ matches. */
+static void
+kwsmusts (void)
+{
+ struct dfamust const *dm;
+ char const *err;
+
+ if (dfa.musts)
+ {
+ kwsinit (&kwset);
+ /* First, we compile in the substrings known to be exact
+ matches. The kwset matcher will return the index
+ of the matching string that it chooses. */
+ for (dm = dfa.musts; dm; dm = dm->next)
+ {
+ if (!dm->exact)
+ continue;
+ ++kwset_exact_matches;
+ if ((err = kwsincr_case (dm->must)) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+ }
+ /* Now, we compile the substrings that will require
+ the use of the regexp matcher. */
+ for (dm = dfa.musts; dm; dm = dm->next)
+ {
+ if (dm->exact)
+ continue;
+ if ((err = kwsincr_case (dm->must)) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+ }
+ if ((err = kwsprep (kwset)) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+ }
+}
+
+/* No __VA_ARGS__ in C89. So we have to do it this way. */
+static void
+GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
+{
+ const char *err;
+ const char *p, *sep;
+ size_t total = size;
+ char *motif;
+
+ if (match_icase)
+ syntax_bits |= RE_ICASE;
+ re_set_syntax (syntax_bits);
+ dfasyntax (syntax_bits, match_icase, eolbyte);
+
+ /* For GNU regex compiler we have to pass the patterns separately to detect
+ errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
+ GNU regex should have raise a syntax error. The same for backref, where
+ the backref should have been local to each pattern. */
+ p = pattern;
+ do
+ {
+ size_t len;
+ sep = memchr (p, '\n', total);
+ if (sep)
+ {
+ len = sep - p;
+ sep++;
+ total -= (len + 1);
+ }
+ else
+ {
+ len = total;
+ total = 0;
+ }
+
+ patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+ if (patterns == NULL)
+ error (EXIT_TROUBLE, errno, _("memory exhausted"));
+ patterns[pcount] = patterns0;
+
+ if ((err = re_compile_pattern (p, len,
+ &(patterns[pcount].regexbuf))) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+ pcount++;
+
+ p = sep;
+ } while (sep && total != 0);
+
+ /* In the match_words and match_lines cases, we use a different pattern
+ for the DFA matcher that will quickly throw out cases that won't work.
+ Then if DFA succeeds we do some hairy stuff using the regex matcher
+ to decide whether the match should really count. */
+ if (match_words || match_lines)
+ {
+ static char const line_beg_no_bk[] = "^(";
+ static char const line_end_no_bk[] = ")$";
+ static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
+ static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
+ static char const line_beg_bk[] = "^\\(";
+ static char const line_end_bk[] = "\\)$";
+ static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+ static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+ int bk = !(syntax_bits & RE_NO_BK_PARENS);
+ char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
+
+ strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
+ : (bk ? word_beg_bk : word_beg_no_bk));
+ total = strlen(n);
+ memcpy (n + total, pattern, size);
+ total += size;
+ strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
+ : (bk ? word_end_bk : word_end_no_bk));
+ total += strlen (n + total);
+ pattern = motif = n;
+ size = total;
+ }
+ else
+ motif = NULL;
+
+ dfacomp (pattern, size, &dfa, 1);
+ kwsmusts ();
+
+ free(motif);
+}
+
+static void
+Ecompile (char const *pattern, size_t size)
+{
+ return GEAcompile (pattern, size, RE_SYNTAX_POSIX_EGREP);
+}
+
+static size_t
+EGexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
+{
+ char const *buflim, *beg, *end, *match, *best_match, *mb_start;
+ char eol = eolbyte;
+ int backref, start, len, best_len;
+ struct kwsmatch kwsm;
+ size_t i, ret_val;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ if (match_icase)
+ {
+ /* mbtolower adds a NUL byte at the end. That will provide
+ space for the sentinel byte dfaexec may add. */
+ char *case_buf = mbtolower (buf, &size);
+ if (start_ptr)
+ start_ptr = case_buf + (start_ptr - buf);
+ buf = case_buf;
+ }
+ }
+#endif /* MBS_SUPPORT */
+
+ mb_start = buf;
+ buflim = buf + size;
+
+ for (beg = end = buf; end < buflim; beg = end)
+ {
+ if (!start_ptr)
+ {
+ /* We don't care about an exact match. */
+ if (kwset)
+ {
+ /* Find a possible match using the KWset matcher. */
+ size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+ if (offset == (size_t) -1)
+ goto failure;
+ beg += offset;
+ /* Narrow down to the line containing the candidate, and
+ run it through DFA. */
+ if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+ end++;
+ else
+ end = buflim;
+ match = beg;
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ if (kwsm.index < kwset_exact_matches)
+ {
+#ifdef MBS_SUPPORT
+ if (mb_start < beg)
+ mb_start = beg;
+ if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match,
buflim))
+#endif
+ goto success;
+ }
+ if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
+ continue;
+ }
+ else
+ {
+ /* No good fixed strings; start with DFA. */
+ char const *next_beg = dfaexec (&dfa, beg, (char *) buflim,
+ 0, NULL, &backref);
+ if (next_beg == NULL)
+ break;
+ /* Narrow down to the line we've found. */
+ beg = next_beg;
+ if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+ end++;
+ else
+ end = buflim;
+ while (beg > buf && beg[-1] != eol)
+ --beg;
+ }
+ /* Successful, no backreferences encountered! */
+ if (!backref)
+ goto success;
+ }
+ else
+ {
+ /* We are looking for the leftmost (then longest) exact match.
+ We will go through the outer loop only once. */
+ beg = start_ptr;
+ end = buflim;
+ }
+
+ /* If we've made it to this point, this means DFA has seen
+ a probable match, and we need to run it through Regex. */
+ best_match = end;
+ best_len = 0;
+ for (i = 0; i < pcount; i++)
+ {
+ patterns[i].regexbuf.not_eol = 0;
+ if (0 <= (start = re_search (&(patterns[i].regexbuf),
+ buf, end - buf - 1,
+ beg - buf, end - beg - 1,
+ &(patterns[i].regs))))
+ {
+ len = patterns[i].regs.end[0] - start;
+ match = buf + start;
+ if (match > best_match)
+ continue;
+ if (start_ptr && !match_words)
+ goto assess_pattern_match;
+ if ((!match_lines && !match_words)
+ || (match_lines && len == end - beg - 1))
+ {
+ match = beg;
+ len = end - beg;
+ goto assess_pattern_match;
+ }
+ /* If -w, check if the match aligns with word boundaries.
+ We do this iteratively because:
+ (a) the line may contain more than one occurence of the
+ pattern, and
+ (b) Several alternatives in the pattern might be valid at a
+ given point, and we may need to consider a shorter one to
+ find a word boundary. */
+ if (match_words)
+ while (match <= best_match)
+ {
+ if ((match == buf || !WCHAR ((unsigned char) match[-1]))
+ && (len == end - beg - 1
+ || !WCHAR ((unsigned char) match[len])))
+ goto assess_pattern_match;
+ if (len > 0)
+ {
+ /* Try a shorter length anchored at the same place. */
+ --len;
+ patterns[i].regexbuf.not_eol = 1;
+ len = re_match (&(patterns[i].regexbuf),
+ buf, match + len - beg, match - buf,
+ &(patterns[i].regs));
+ }
+ if (len <= 0)
+ {
+ /* Try looking further on. */
+ if (match == end - 1)
+ break;
+ match++;
+ patterns[i].regexbuf.not_eol = 0;
+ start = re_search (&(patterns[i].regexbuf),
+ buf, end - buf - 1,
+ match - buf, end - match - 1,
+ &(patterns[i].regs));
+ if (start < 0)
+ break;
+ len = patterns[i].regs.end[0] - start;
+ match = buf + start;
+ }
+ } /* while (match <= best_match) */
+ continue;
+ assess_pattern_match:
+ if (!start_ptr)
+ {
+ /* Good enough for a non-exact match.
+ No need to look at further patterns, if any. */
+ beg = match;
+ goto success_in_len;
+ }
+ if (match < best_match || (match == best_match && len > best_len))
+ {
+ /* Best exact match: leftmost, then longest. */
+ best_match = match;
+ best_len = len;
+ }
+ } /* if re_search >= 0 */
+ } /* for Regex patterns. */
+ if (best_match < end)
+ {
+ /* We have found an exact match. We were just
+ waiting for the best one (leftmost then longest). */
+ beg = best_match;
+ len = best_len;
+ goto success_in_len;
+ }
+ } /* for (beg = end ..) */
+
+ failure:
+ ret_val = -1;
+ goto out;
+
+ success:
+ len = end - beg;
+ success_in_len:
+ *match_size = len;
+ ret_val = beg - buf;
+ out:
+ return ret_val;
+}
diff --git a/src/esearch.c b/src/esearch.c
index d76c310..8c749c8 100644
--- a/src/esearch.c
+++ b/src/esearch.c
@@ -1,5 +1,4 @@
-#define EGREP_PROGRAM
-#include "search.c"
+#include "dfasearch.c"
struct matcher const matchers[] = {
{ "egrep", Ecompile, EGexecute },
diff --git a/src/fsearch.c b/src/fsearch.c
index e1ca0b1..b16e769 100644
--- a/src/fsearch.c
+++ b/src/fsearch.c
@@ -1,5 +1,4 @@
-#define FGREP_PROGRAM
-#include "search.c"
+#include "kwsearch.c"
struct matcher const matchers[] = {
{ "fgrep", Fcompile, Fexecute },
diff --git a/src/gsearch.c b/src/gsearch.c
index e3e0423..07ceba6 100644
--- a/src/gsearch.c
+++ b/src/gsearch.c
@@ -1,4 +1,19 @@
-#include "search.c"
+#include "dfasearch.c"
+#include "pcresearch.c"
+#include "kwsearch.c"
+
+static void
+Gcompile (char const *pattern, size_t size)
+{
+ return GEAcompile (pattern, size,
+ RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+}
+
+static void
+Acompile (char const *pattern, size_t size)
+{
+ return GEAcompile (pattern, size, RE_SYNTAX_AWK);
+}
struct matcher const matchers[] = {
{ "grep", Gcompile, EGexecute },
@@ -8,4 +23,3 @@ struct matcher const matchers[] = {
{ "perl", Pcompile, Pexecute },
{ NULL, NULL, NULL },
};
-
diff --git a/src/kwsearch.c b/src/kwsearch.c
new file mode 100644
index 0000000..0e0f3bf
--- /dev/null
+++ b/src/kwsearch.c
@@ -0,0 +1,161 @@
+/* kwsearch.c - searching subroutines using kwset for grep.
+ Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+
+/* For -w, we also consider _ to be word constituent. */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* KWset compiled pattern. For Ecompile and Gcompile, we compile
+ a list of strings, at least one of which is known to occur in
+ any string matching the regexp. */
+static kwset_t kwset;
+
+static void
+Fcompile (char const *pattern, size_t size)
+{
+ char const *beg, *end, *lim, *err, *pat;
+ size_t psize;
+
+ kwsinit (&kwset);
+ psize = size;
+ if (match_icase && MB_CUR_MAX > 1)
+ pat = mbtolower (pattern, &psize);
+ else
+ pat = pattern;
+
+ beg = pat;
+ do
+ {
+ for (lim = beg;; ++lim)
+ {
+ end = lim;
+ if (lim >= pat + psize)
+ break;
+ if (*lim == '\n')
+ {
+ lim++;
+ break;
+ }
+#if HAVE_DOS_FILE_CONTENTS
+ if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
+ {
+ lim += 2;
+ break;
+ }
+#endif
+ }
+
+ if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+ beg = lim;
+ }
+ while (beg < pat + psize);
+
+ if ((err = kwsprep (kwset)) != NULL)
+ error (EXIT_TROUBLE, 0, "%s", err);
+}
+
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
+{
+ char const *beg, *try, *end, *mb_start;
+ size_t len;
+ char eol = eolbyte;
+ struct kwsmatch kwsmatch;
+ size_t ret_val;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1)
+ {
+ if (match_icase)
+ {
+ char *case_buf = mbtolower (buf, &size);
+ if (start_ptr)
+ start_ptr = case_buf + (start_ptr - buf);
+ buf = case_buf;
+ }
+ }
+#endif /* MBS_SUPPORT */
+
+ for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+ {
+ size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+ if (offset == (size_t) -1)
+ goto failure;
+#ifdef MBS_SUPPORT
+ if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
+ {
+ beg = mb_start - 1;
+ continue; /* It is a part of multibyte character. */
+ }
+#endif /* MBS_SUPPORT */
+ beg += offset;
+ len = kwsmatch.size[0];
+ if (start_ptr && !match_words)
+ goto success_in_beg_and_len;
+ if (match_lines)
+ {
+ if (beg > buf && beg[-1] != eol)
+ continue;
+ if (beg + len < buf + size && beg[len] != eol)
+ continue;
+ goto success;
+ }
+ else if (match_words)
+ for (try = beg; len; )
+ {
+ if (try > buf && WCHAR((unsigned char) try[-1]))
+ break;
+ if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+ {
+ offset = kwsexec (kwset, beg, --len, &kwsmatch);
+ if (offset == (size_t) -1)
+ break;
+ try = beg + offset;
+ len = kwsmatch.size[0];
+ }
+ else if (!start_ptr)
+ goto success;
+ else
+ goto success_in_beg_and_len;
+ } /* for (try) */
+ else
+ goto success;
+ } /* for (beg in buf) */
+
+ failure:
+ ret_val = -1;
+ goto out;
+
+ success:
+ if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
+ end++;
+ else
+ end = buf + size;
+ while (buf < beg && beg[-1] != eol)
+ --beg;
+ len = end - beg;
+ success_in_beg_and_len:
+ *match_size = len;
+ ret_val = beg - buf;
+ out:
+ return ret_val;
+}
diff --git a/src/pcresearch.c b/src/pcresearch.c
new file mode 100644
index 0000000..c3411b7
--- /dev/null
+++ b/src/pcresearch.c
@@ -0,0 +1,177 @@
+/* pcresearch.c - searching subroutines using PCRE for grep.
+ Copyright 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression. */
+static pcre *cre;
+
+/* Additional information about the pattern. */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+ error (EXIT_TROUBLE, 0, "%s",
+ _("support for the -P option is not compiled into "
+ "this --disable-perl-regexp binary"));
+#else
+ int e;
+ char const *ep;
+ char *re = xmalloc (4 * size + 7);
+ int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+ char const *patlim = pattern + size;
+ char *n = re;
+ char const *p;
+ char const *pnul;
+
+ /* FIXME: Remove these restrictions. */
+ if (memchr(pattern, '\n', size))
+ error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+
+ *n = '\0';
+ if (match_lines)
+ strcpy (n, "^(");
+ if (match_words)
+ strcpy (n, "\\b(");
+ n += strlen (n);
+
+ /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+ replace each NUL byte in the pattern with the four characters
+ "\000", removing a preceding backslash if there are an odd
+ number of backslashes before the NUL.
+
+ FIXME: This method does not work with some multibyte character
+ encodings, notably Shift-JIS, where a multibyte character can end
+ in a backslash byte. */
+ for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+ {
+ memcpy (n, p, pnul - p);
+ n += pnul - p;
+ for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+ continue;
+ n -= (pnul - p) & 1;
+ strcpy (n, "\\000");
+ n += 4;
+ }
+
+ memcpy (n, p, patlim - p);
+ n += patlim - p;
+ *n = '\0';
+ if (match_words)
+ strcpy (n, ")\\b");
+ if (match_lines)
+ strcpy (n, ")$");
+
+ cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+ if (!cre)
+ error (EXIT_TROUBLE, 0, "%s", ep);
+
+ extra = pcre_study (cre, 0, &ep);
+ if (ep)
+ error (EXIT_TROUBLE, 0, "%s", ep);
+
+ free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
+{
+#if !HAVE_LIBPCRE
+ abort ();
+ return -1;
+#else
+ /* This array must have at least two elements; everything after that
+ is just for performance improvement in pcre_exec. */
+ int sub[300];
+
+ const char *line_buf, *line_end, *line_next;
+ int e = PCRE_ERROR_NOMATCH;
+ ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+
+ /* PCRE can't limit the matching to single lines, therefore we have to
+ match each line in the buffer separately. */
+ for (line_next = buf;
+ e == PCRE_ERROR_NOMATCH && line_next < buf + size;
+ start_ofs -= line_next - line_buf)
+ {
+ line_buf = line_next;
+ line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
+ if (line_end == NULL)
+ line_next = line_end = buf + size;
+ else
+ line_next = line_end + 1;
+
+ if (start_ptr && start_ptr >= line_end)
+ continue;
+
+ e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
+ start_ofs < 0 ? 0 : start_ofs, 0,
+ sub, sizeof sub / sizeof *sub);
+ }
+
+ if (e <= 0)
+ {
+ switch (e)
+ {
+ case PCRE_ERROR_NOMATCH:
+ return -1;
+
+ case PCRE_ERROR_NOMEMORY:
+ error (EXIT_TROUBLE, 0, _("memory exhausted"));
+
+ default:
+ abort ();
+ }
+ }
+ else
+ {
+ /* Narrow down to the line we've found. */
+ char const *beg = line_buf + sub[0];
+ char const *end = line_buf + sub[1];
+ char const *buflim = buf + size;
+ char eol = eolbyte;
+ if (!start_ptr)
+ {
+ /* FIXME: The case when '\n' is not found indicates a bug:
+ Since grep is line oriented, the match should never contain
+ a newline, so there _must_ be a newline following.
+ */
+ if (!(end = memchr (end, eol, buflim - end)))
+ end = buflim;
+ else
+ end++;
+ while (buf < beg && beg[-1] != eol)
+ --beg;
+ }
+
+ *match_size = end - beg;
+ return beg - buf;
+ }
+#endif
+}
diff --git a/src/search.c b/src/search.c
deleted file mode 100644
index 7b601e0..0000000
--- a/src/search.c
+++ /dev/null
@@ -1,844 +0,0 @@
-/* search.c - searching subroutines using dfa, kwset and regex for grep.
- Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 3, or (at your option)
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
- 02110-1301, USA. */
-
-/* Written August 1992 by Mike Haertel. */
-
-#include <config.h>
-
-#include <sys/types.h>
-
-#include "mbsupport.h"
-#ifdef MBS_SUPPORT
-/* We can handle multibyte strings. */
-# include <wchar.h>
-# include <wctype.h>
-#endif
-
-#include "system.h"
-#include "grep.h"
-#ifndef FGREP_PROGRAM
-# include <regex.h>
-# include "dfa.h"
-#endif
-#include "kwset.h"
-#include "error.h"
-#include "xalloc.h"
-#ifdef HAVE_LIBPCRE
-# include <pcre.h>
-#endif
-
-#define NCHAR (UCHAR_MAX + 1)
-
-/* For -w, we also consider _ to be word constituent. */
-#define WCHAR(C) (ISALNUM(C) || (C) == '_')
-
-/* KWset compiled pattern. For Ecompile and Gcompile, we compile
- a list of strings, at least one of which is known to occur in
- any string matching the regexp. */
-static kwset_t kwset;
-
-static void
-kwsinit (void)
-{
- static char trans[NCHAR];
- int i;
-
- if (match_icase && MB_CUR_MAX == 1)
- {
- for (i = 0; i < NCHAR; ++i)
- trans[i] = TOLOWER (i);
-
- kwset = kwsalloc (trans);
- }
- else
- kwset = kwsalloc (NULL);
-
- if (!kwset)
- xalloc_die ();
-}
-
-#ifdef MBS_SUPPORT
-/* Convert the *N-byte string, BEG, to lowercase, and write the
- NUL-terminated result into malloc'd storage. Upon success, set *N
- to the length (in bytes) of the resulting string (not including the
- trailing NUL byte), and return a pointer to the lowercase string.
- Upon memory allocation failure, this function exits.
-
- Note that while this function returns a pointer to malloc'd storage,
- the caller must not free it, since this function retains a pointer
- to the buffer and reuses it on any subsequent call. As a consequence,
- this function is not thread-safe. */
-static char *
-mbtolower (const char *beg, size_t *n)
-{
- static char *out;
- static size_t outalloc;
- size_t outlen, mb_cur_max;
- mbstate_t is, os;
- const char *end;
- char *p;
-
- if (*n > outalloc)
- {
- out = xrealloc (out, *n);
- outalloc = *n;
- }
-
- memset (&is, 0, sizeof (is));
- memset (&os, 0, sizeof (os));
- end = beg + *n;
-
- mb_cur_max = MB_CUR_MAX;
- p = out;
- outlen = 0;
- while (beg < end)
- {
- wchar_t wc;
- size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
- if (outlen + mb_cur_max >= outalloc)
- {
- out = x2nrealloc (out, &outalloc, 1);
- p = out + outlen;
- }
-
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multi-octet character.
- We treat it as a single-octet character. */
- *p++ = *beg++;
- outlen++;
- memset (&is, 0, sizeof (is));
- memset (&os, 0, sizeof (os));
- }
- else
- {
- beg += mbclen;
- mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
- p += mbclen;
- outlen += mbclen;
- }
- }
-
- *n = p - out;
- *p++ = 0;
- return out;
-}
-#endif
-
-
-#ifndef FGREP_PROGRAM
-/* DFA compiled regexp. */
-static struct dfa dfa;
-
-/* The Regex compiled patterns. */
-static struct patterns
-{
- /* Regex compiled regexp. */
- struct re_pattern_buffer regexbuf;
- struct re_registers regs; /* This is here on account of a BRAIN-DEAD
- address@hidden library interface in regex.c. */
-} patterns0;
-
-struct patterns *patterns;
-size_t pcount;
-
-void
-dfaerror (char const *mesg)
-{
- error (EXIT_TROUBLE, 0, "%s", mesg);
-}
-
-/* Number of compiled fixed strings known to exactly match the regexp.
- If kwsexec returns < kwset_exact_matches, then we don't need to
- call the regexp matcher at all. */
-static int kwset_exact_matches;
-
-static char const *
-kwsincr_case (const char *must)
-{
- const char *buf;
- size_t n;
-
- n = strlen (must);
-#ifdef MBS_SUPPORT
- if (match_icase && MB_CUR_MAX > 1)
- buf = mbtolower (must, &n);
- else
-#endif
- buf = must;
- return kwsincr (kwset, buf, n);
-}
-
-/* If the DFA turns out to have some set of fixed strings one of
- which must occur in the match, then we build a kwset matcher
- to find those strings, and thus quickly filter out impossible
- matches. */
-static void
-kwsmusts (void)
-{
- struct dfamust const *dm;
- char const *err;
-
- if (dfa.musts)
- {
- kwsinit ();
- /* First, we compile in the substrings known to be exact
- matches. The kwset matcher will return the index
- of the matching string that it chooses. */
- for (dm = dfa.musts; dm; dm = dm->next)
- {
- if (!dm->exact)
- continue;
- ++kwset_exact_matches;
- if ((err = kwsincr_case (dm->must)) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- }
- /* Now, we compile the substrings that will require
- the use of the regexp matcher. */
- for (dm = dfa.musts; dm; dm = dm->next)
- {
- if (dm->exact)
- continue;
- if ((err = kwsincr_case (dm->must)) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- }
- if ((err = kwsprep (kwset)) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- }
-}
-#endif /* !FGREP_PROGRAM */
-
-#ifdef MBS_SUPPORT
-
-static bool
-is_mb_middle(const char **good, const char *buf, const char *end)
-{
- const char *p = *good;
- const char *prev = p;
- mbstate_t cur_state;
-
- /* TODO: can be optimized for UTF-8. */
- memset(&cur_state, 0, sizeof(mbstate_t));
- while (p < buf)
- {
- size_t mbclen = mbrlen(p, end - p, &cur_state);
-
- /* Store the beginning of the previous complete multibyte character. */
- if (mbclen != (size_t) -2)
- prev = p;
-
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
- {
- /* An invalid sequence, or a truncated multibyte character.
- We treat it as a single byte character. */
- mbclen = 1;
- }
- p += mbclen;
- }
-
- *good = prev;
- return p > buf;
-}
-#endif /* MBS_SUPPORT */
-
-#if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
-/* No __VA_ARGS__ in C89. So we have to do it this way. */
-static void
-GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
-{
- const char *err;
- const char *p, *sep;
- size_t total = size;
- char *motif;
-
- if (match_icase)
- syntax_bits |= RE_ICASE;
- re_set_syntax (syntax_bits);
- dfasyntax (syntax_bits, match_icase, eolbyte);
-
- /* For GNU regex compiler we have to pass the patterns separately to detect
- errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
- GNU regex should have raise a syntax error. The same for backref, where
- the backref should have been local to each pattern. */
- p = pattern;
- do
- {
- size_t len;
- sep = memchr (p, '\n', total);
- if (sep)
- {
- len = sep - p;
- sep++;
- total -= (len + 1);
- }
- else
- {
- len = total;
- total = 0;
- }
-
- patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
- if (patterns == NULL)
- error (EXIT_TROUBLE, errno, _("memory exhausted"));
- patterns[pcount] = patterns0;
-
- if ((err = re_compile_pattern (p, len,
- &(patterns[pcount].regexbuf))) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- pcount++;
-
- p = sep;
- } while (sep && total != 0);
-
- /* In the match_words and match_lines cases, we use a different pattern
- for the DFA matcher that will quickly throw out cases that won't work.
- Then if DFA succeeds we do some hairy stuff using the regex matcher
- to decide whether the match should really count. */
- if (match_words || match_lines)
- {
- static char const line_beg_no_bk[] = "^(";
- static char const line_end_no_bk[] = ")$";
- static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
- static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
- static char const line_beg_bk[] = "^\\(";
- static char const line_end_bk[] = "\\)$";
- static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
- static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
- int bk = !(syntax_bits & RE_NO_BK_PARENS);
- char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
-
- strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
- : (bk ? word_beg_bk : word_beg_no_bk));
- total = strlen(n);
- memcpy (n + total, pattern, size);
- total += size;
- strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
- : (bk ? word_end_bk : word_end_no_bk));
- total += strlen (n + total);
- pattern = motif = n;
- size = total;
- }
- else
- motif = NULL;
-
- dfacomp (pattern, size, &dfa, 1);
- kwsmusts ();
-
- free(motif);
-}
-
-#ifndef EGREP_PROGRAM
-static void
-Gcompile (char const *pattern, size_t size)
-{
- return GEAcompile (pattern, size,
- RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
-}
-
-static void
-Acompile (char const *pattern, size_t size)
-{
- return GEAcompile (pattern, size, RE_SYNTAX_AWK);
-}
-#endif /* !EGREP_PROGRAM */
-
-static void
-Ecompile (char const *pattern, size_t size)
-{
- return GEAcompile (pattern, size, RE_SYNTAX_POSIX_EGREP);
-}
-
-static size_t
-EGexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
-{
- char const *buflim, *beg, *end, *match, *best_match, *mb_start;
- char eol = eolbyte;
- int backref, start, len, best_len;
- struct kwsmatch kwsm;
- size_t i, ret_val;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- {
- /* mbtolower adds a NUL byte at the end. That will provide
- space for the sentinel byte dfaexec may add. */
- char *case_buf = mbtolower (buf, &size);
- if (start_ptr)
- start_ptr = case_buf + (start_ptr - buf);
- buf = case_buf;
- }
- }
-#endif /* MBS_SUPPORT */
-
- mb_start = buf;
- buflim = buf + size;
-
- for (beg = end = buf; end < buflim; beg = end)
- {
- if (!start_ptr)
- {
- /* We don't care about an exact match. */
- if (kwset)
- {
- /* Find a possible match using the KWset matcher. */
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
- if (offset == (size_t) -1)
- goto failure;
- beg += offset;
- /* Narrow down to the line containing the candidate, and
- run it through DFA. */
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
- end++;
- else
- end = buflim;
- match = beg;
- while (beg > buf && beg[-1] != eol)
- --beg;
- if (kwsm.index < kwset_exact_matches)
- {
-#ifdef MBS_SUPPORT
- if (mb_start < beg)
- mb_start = beg;
- if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match,
buflim))
-#endif
- goto success;
- }
- if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
- continue;
- }
- else
- {
- /* No good fixed strings; start with DFA. */
- char const *next_beg = dfaexec (&dfa, beg, (char *) buflim,
- 0, NULL, &backref);
- if (next_beg == NULL)
- break;
- /* Narrow down to the line we've found. */
- beg = next_beg;
- if ((end = memchr(beg, eol, buflim - beg)) != NULL)
- end++;
- else
- end = buflim;
- while (beg > buf && beg[-1] != eol)
- --beg;
- }
- /* Successful, no backreferences encountered! */
- if (!backref)
- goto success;
- }
- else
- {
- /* We are looking for the leftmost (then longest) exact match.
- We will go through the outer loop only once. */
- beg = start_ptr;
- end = buflim;
- }
-
- /* If we've made it to this point, this means DFA has seen
- a probable match, and we need to run it through Regex. */
- best_match = end;
- best_len = 0;
- for (i = 0; i < pcount; i++)
- {
- patterns[i].regexbuf.not_eol = 0;
- if (0 <= (start = re_search (&(patterns[i].regexbuf),
- buf, end - buf - 1,
- beg - buf, end - beg - 1,
- &(patterns[i].regs))))
- {
- len = patterns[i].regs.end[0] - start;
- match = buf + start;
- if (match > best_match)
- continue;
- if (start_ptr && !match_words)
- goto assess_pattern_match;
- if ((!match_lines && !match_words)
- || (match_lines && len == end - beg - 1))
- {
- match = beg;
- len = end - beg;
- goto assess_pattern_match;
- }
- /* If -w, check if the match aligns with word boundaries.
- We do this iteratively because:
- (a) the line may contain more than one occurence of the
- pattern, and
- (b) Several alternatives in the pattern might be valid at a
- given point, and we may need to consider a shorter one to
- find a word boundary. */
- if (match_words)
- while (match <= best_match)
- {
- if ((match == buf || !WCHAR ((unsigned char) match[-1]))
- && (len == end - beg - 1
- || !WCHAR ((unsigned char) match[len])))
- goto assess_pattern_match;
- if (len > 0)
- {
- /* Try a shorter length anchored at the same place. */
- --len;
- patterns[i].regexbuf.not_eol = 1;
- len = re_match (&(patterns[i].regexbuf),
- buf, match + len - beg, match - buf,
- &(patterns[i].regs));
- }
- if (len <= 0)
- {
- /* Try looking further on. */
- if (match == end - 1)
- break;
- match++;
- patterns[i].regexbuf.not_eol = 0;
- start = re_search (&(patterns[i].regexbuf),
- buf, end - buf - 1,
- match - buf, end - match - 1,
- &(patterns[i].regs));
- if (start < 0)
- break;
- len = patterns[i].regs.end[0] - start;
- match = buf + start;
- }
- } /* while (match <= best_match) */
- continue;
- assess_pattern_match:
- if (!start_ptr)
- {
- /* Good enough for a non-exact match.
- No need to look at further patterns, if any. */
- beg = match;
- goto success_in_len;
- }
- if (match < best_match || (match == best_match && len > best_len))
- {
- /* Best exact match: leftmost, then longest. */
- best_match = match;
- best_len = len;
- }
- } /* if re_search >= 0 */
- } /* for Regex patterns. */
- if (best_match < end)
- {
- /* We have found an exact match. We were just
- waiting for the best one (leftmost then longest). */
- beg = best_match;
- len = best_len;
- goto success_in_len;
- }
- } /* for (beg = end ..) */
-
- failure:
- ret_val = -1;
- goto out;
-
- success:
- len = end - beg;
- success_in_len:
- *match_size = len;
- ret_val = beg - buf;
- out:
- return ret_val;
-}
-#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
-
-#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
-static void
-Fcompile (char const *pattern, size_t size)
-{
- char const *beg, *end, *lim, *err, *pat;
- size_t psize;
-
- kwsinit ();
- psize = size;
- if (match_icase && MB_CUR_MAX > 1)
- pat = mbtolower (pattern, &psize);
- else
- pat = pattern;
-
- beg = pat;
- do
- {
- for (lim = beg;; ++lim)
- {
- end = lim;
- if (lim >= pat + psize)
- break;
- if (*lim == '\n')
- {
- lim++;
- break;
- }
-#if HAVE_DOS_FILE_CONTENTS
- if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
- {
- lim += 2;
- break;
- }
-#endif
- }
-
- if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
- beg = lim;
- }
- while (beg < pat + psize);
-
- if ((err = kwsprep (kwset)) != NULL)
- error (EXIT_TROUBLE, 0, "%s", err);
-}
-
-static size_t
-Fexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
-{
- char const *beg, *try, *end, *mb_start;
- size_t len;
- char eol = eolbyte;
- struct kwsmatch kwsmatch;
- size_t ret_val;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1)
- {
- if (match_icase)
- {
- char *case_buf = mbtolower (buf, &size);
- if (start_ptr)
- start_ptr = case_buf + (start_ptr - buf);
- buf = case_buf;
- }
- }
-#endif /* MBS_SUPPORT */
-
- for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
- {
- size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
- if (offset == (size_t) -1)
- goto failure;
-#ifdef MBS_SUPPORT
- if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
- {
- beg = mb_start - 1;
- continue; /* It is a part of multibyte character. */
- }
-#endif /* MBS_SUPPORT */
- beg += offset;
- len = kwsmatch.size[0];
- if (start_ptr && !match_words)
- goto success_in_beg_and_len;
- if (match_lines)
- {
- if (beg > buf && beg[-1] != eol)
- continue;
- if (beg + len < buf + size && beg[len] != eol)
- continue;
- goto success;
- }
- else if (match_words)
- for (try = beg; len; )
- {
- if (try > buf && WCHAR((unsigned char) try[-1]))
- break;
- if (try + len < buf + size && WCHAR((unsigned char) try[len]))
- {
- offset = kwsexec (kwset, beg, --len, &kwsmatch);
- if (offset == (size_t) -1)
- break;
- try = beg + offset;
- len = kwsmatch.size[0];
- }
- else if (!start_ptr)
- goto success;
- else
- goto success_in_beg_and_len;
- } /* for (try) */
- else
- goto success;
- } /* for (beg in buf) */
-
- failure:
- ret_val = -1;
- goto out;
-
- success:
- if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
- end++;
- else
- end = buf + size;
- while (buf < beg && beg[-1] != eol)
- --beg;
- len = end - beg;
- success_in_beg_and_len:
- *match_size = len;
- ret_val = beg - buf;
- out:
- return ret_val;
-}
-#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
-
-#ifdef GREP_PROGRAM
-#if HAVE_LIBPCRE
-/* Compiled internal form of a Perl regular expression. */
-static pcre *cre;
-
-/* Additional information about the pattern. */
-static pcre_extra *extra;
-#endif
-
-static void
-Pcompile (char const *pattern, size_t size)
-{
-#if !HAVE_LIBPCRE
- error (EXIT_TROUBLE, 0, "%s",
- _("support for the -P option is not compiled into "
- "this --disable-perl-regexp binary"));
-#else
- int e;
- char const *ep;
- char *re = xmalloc (4 * size + 7);
- int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
- char const *patlim = pattern + size;
- char *n = re;
- char const *p;
- char const *pnul;
-
- /* FIXME: Remove these restrictions. */
- if (memchr(pattern, '\n', size))
- error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
-
- *n = '\0';
- if (match_lines)
- strcpy (n, "^(");
- if (match_words)
- strcpy (n, "\\b(");
- n += strlen (n);
-
- /* The PCRE interface doesn't allow NUL bytes in the pattern, so
- replace each NUL byte in the pattern with the four characters
- "\000", removing a preceding backslash if there are an odd
- number of backslashes before the NUL.
-
- FIXME: This method does not work with some multibyte character
- encodings, notably Shift-JIS, where a multibyte character can end
- in a backslash byte. */
- for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
- {
- memcpy (n, p, pnul - p);
- n += pnul - p;
- for (p = pnul; pattern < p && p[-1] == '\\'; p--)
- continue;
- n -= (pnul - p) & 1;
- strcpy (n, "\\000");
- n += 4;
- }
-
- memcpy (n, p, patlim - p);
- n += patlim - p;
- *n = '\0';
- if (match_words)
- strcpy (n, ")\\b");
- if (match_lines)
- strcpy (n, ")$");
-
- cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
- if (!cre)
- error (EXIT_TROUBLE, 0, "%s", ep);
-
- extra = pcre_study (cre, 0, &ep);
- if (ep)
- error (EXIT_TROUBLE, 0, "%s", ep);
-
- free (re);
-#endif
-}
-
-static size_t
-Pexecute (char const *buf, size_t size, size_t *match_size, char const
*start_ptr)
-{
-#if !HAVE_LIBPCRE
- abort ();
- return -1;
-#else
- /* This array must have at least two elements; everything after that
- is just for performance improvement in pcre_exec. */
- int sub[300];
-
- const char *line_buf, *line_end, *line_next;
- int e = PCRE_ERROR_NOMATCH;
- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
-
- /* PCRE can't limit the matching to single lines, therefore we have to
- match each line in the buffer separately. */
- for (line_next = buf;
- e == PCRE_ERROR_NOMATCH && line_next < buf + size;
- start_ofs -= line_next - line_buf)
- {
- line_buf = line_next;
- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
- if (line_end == NULL)
- line_next = line_end = buf + size;
- else
- line_next = line_end + 1;
-
- if (start_ptr && start_ptr >= line_end)
- continue;
-
- e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
- start_ofs < 0 ? 0 : start_ofs, 0,
- sub, sizeof sub / sizeof *sub);
- }
-
- if (e <= 0)
- {
- switch (e)
- {
- case PCRE_ERROR_NOMATCH:
- return -1;
-
- case PCRE_ERROR_NOMEMORY:
- error (EXIT_TROUBLE, 0, _("memory exhausted"));
-
- default:
- abort ();
- }
- }
- else
- {
- /* Narrow down to the line we've found. */
- char const *beg = line_buf + sub[0];
- char const *end = line_buf + sub[1];
- char const *buflim = buf + size;
- char eol = eolbyte;
- if (!start_ptr)
- {
- /* FIXME: The case when '\n' is not found indicates a bug:
- Since grep is line oriented, the match should never contain
- a newline, so there _must_ be a newline following.
- */
- if (!(end = memchr (end, eol, buflim - end)))
- end = buflim;
- else
- end++;
- while (buf < beg && beg[-1] != eol)
- --beg;
- }
-
- *match_size = end - beg;
- return beg - buf;
- }
-#endif
-}
-#endif /* GREP_PROGRAM */
diff --git a/src/search.h b/src/search.h
new file mode 100644
index 0000000..cb3b535
--- /dev/null
+++ b/src/search.h
@@ -0,0 +1,47 @@
+/* search.c - searching subroutines using dfa, kwset and regex for grep.
+ Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+#ifndef GREP_SEARCH_H
+#define GREP_SEARCH_H 1
+
+#include <config.h>
+
+#include <sys/types.h>
+
+#include "mbsupport.h"
+#ifdef MBS_SUPPORT
+/* We can handle multibyte strings. */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#include <regex.h>
+#include "system.h"
+#include "grep.h"
+#include "error.h"
+#include "kwset.h"
+#include "xalloc.h"
+
+void kwsinit (kwset_t *);
+
+#ifdef MBS_SUPPORT
+char * mbtolower (const char *, size_t *);
+bool is_mb_middle(const char **, const char *, const char *);
+#endif
+
+#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
new file mode 100644
index 0000000..ef4fef3
--- /dev/null
+++ b/src/searchutils.c
@@ -0,0 +1,141 @@
+/* searchutils.c - helper subroutines for grep's matchers.
+ Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+#include <config.h>
+#include "search.h"
+
+#define NCHAR (UCHAR_MAX + 1)
+
+void
+kwsinit (kwset_t *kwset)
+{
+ static char trans[NCHAR];
+ int i;
+
+ if (match_icase && MB_CUR_MAX == 1)
+ {
+ for (i = 0; i < NCHAR; ++i)
+ trans[i] = TOLOWER (i);
+
+ *kwset = kwsalloc (trans);
+ }
+ else
+ *kwset = kwsalloc (NULL);
+
+ if (!*kwset)
+ xalloc_die ();
+}
+
+#ifdef MBS_SUPPORT
+/* Convert the *N-byte string, BEG, to lowercase, and write the
+ NUL-terminated result into malloc'd storage. Upon success, set *N
+ to the length (in bytes) of the resulting string (not including the
+ trailing NUL byte), and return a pointer to the lowercase string.
+ Upon memory allocation failure, this function exits.
+
+ Note that while this function returns a pointer to malloc'd storage,
+ the caller must not free it, since this function retains a pointer
+ to the buffer and reuses it on any subsequent call. As a consequence,
+ this function is not thread-safe. */
+char *
+mbtolower (const char *beg, size_t *n)
+{
+ static char *out;
+ static size_t outalloc;
+ size_t outlen, mb_cur_max;
+ mbstate_t is, os;
+ const char *end;
+ char *p;
+
+ if (*n > outalloc)
+ {
+ out = xrealloc (out, *n);
+ outalloc = *n;
+ }
+
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ end = beg + *n;
+
+ mb_cur_max = MB_CUR_MAX;
+ p = out;
+ outlen = 0;
+ while (beg < end)
+ {
+ wchar_t wc;
+ size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
+ if (outlen + mb_cur_max >= outalloc)
+ {
+ out = x2nrealloc (out, &outalloc, 1);
+ p = out + outlen;
+ }
+
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multi-octet character.
+ We treat it as a single-octet character. */
+ *p++ = *beg++;
+ outlen++;
+ memset (&is, 0, sizeof (is));
+ memset (&os, 0, sizeof (os));
+ }
+ else
+ {
+ beg += mbclen;
+ mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+ p += mbclen;
+ outlen += mbclen;
+ }
+ }
+
+ *n = p - out;
+ *p++ = 0;
+ return out;
+}
+
+
+bool
+is_mb_middle(const char **good, const char *buf, const char *end)
+{
+ const char *p = *good;
+ const char *prev = p;
+ mbstate_t cur_state;
+
+ /* TODO: can be optimized for UTF-8. */
+ memset(&cur_state, 0, sizeof(mbstate_t));
+ while (p < buf)
+ {
+ size_t mbclen = mbrlen(p, end - p, &cur_state);
+
+ /* Store the beginning of the previous complete multibyte character. */
+ if (mbclen != (size_t) -2)
+ prev = p;
+
+ if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+ {
+ /* An invalid sequence, or a truncated multibyte character.
+ We treat it as a single byte character. */
+ mbclen = 1;
+ }
+ p += mbclen;
+ }
+
+ *good = prev;
+ return p > buf;
+}
+#endif /* MBS_SUPPORT */
--
1.6.6.1
- [PATCH 5/9] grep: eliminate {COMPILE,EXECUTE}_{RET,ARGS,FCT}, (continued)
- [PATCH 5/9] grep: eliminate {COMPILE,EXECUTE}_{RET,ARGS,FCT}, Paolo Bonzini, 2010/03/19
- [PATCH 6/9] grep: remove one #ifdef, Paolo Bonzini, 2010/03/19
- [PATCH 8/9] grep: prepare for libification of *search.c, Paolo Bonzini, 2010/03/19
- [PATCH 9/9] grep: libify *search.c, Paolo Bonzini, 2010/03/19
- [PATCH 7/9] grep: split search.c,
Paolo Bonzini <=
- Re: [PATCH 0/9] remove most {,E,F}GREP_PROGRAM occurrences, Jim Meyering, 2010/03/19
- Re: [PATCH 0/9] remove most {,E,F}GREP_PROGRAM occurrences, Jim Meyering, 2010/03/19
- Re: [PATCH 0/9] remove most {,E,F}GREP_PROGRAM occurrences, Jim Meyering, 2010/03/19
- Re: [PATCH 0/9] remove most {,E,F}GREP_PROGRAM occurrences, Jim Meyering, 2010/03/22