From 08da80735ce86291350092c312681583184a526a Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Fri, 2 Sep 2016 15:53:28 -0700
Subject: [PATCH] Sync dfa.c with grep.
---
ChangeLog | 5 +++
dfa.c | 114 +++++++++++++++++++-----------------------------------
dfa.h | 28 +++++++++++---
helpers/ChangeLog | 4 ++
helpers/testdfa.c | 10 ++++-
re.c | 3 +-
6 files changed, 81 insertions(+), 83 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 3288f48..f5f4e7f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2016-09-02 Paul Eggert
+
+ * dfa.c, dfa.h: Sync with grep.
+ * re.c (make_regexp): Adjust to DFA API changes.
+
2016-09-02 Arnold D. Robbins
* dfa.c: Sync with grep.
diff --git a/dfa.c b/dfa.c
index 5d68af2..39031e8 100644
--- a/dfa.c
+++ b/dfa.c
@@ -59,7 +59,6 @@
#define _(str) gettext (str)
#include <wchar.h>
-#include <wctype.h>
#include "xalloc.h"
@@ -363,6 +362,10 @@ struct regex_syntax
/* Flag for case-folding letters into sets. */
bool case_fold;
+ /* True if ^ and $ match only the start and end of data, and do not match
+ end-of-line within data. */
+ bool anchor;
+
/* End-of-line byte in data. */
unsigned char eolbyte;
@@ -782,7 +785,7 @@ unibyte_word_constituent (struct dfa const *dfa, unsigned char c)
static int
char_context (struct dfa const *dfa, unsigned char c)
{
- if (c == dfa->syntax.eolbyte)
+ if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor)
return CTX_NEWLINE;
if (unibyte_word_constituent (dfa, c))
return CTX_LETTER;
@@ -2699,18 +2702,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
is to fail miserably. */
if (d->searchflag)
{
- /* Find the state(s) corresponding to the positions of state 0. */
- copy (&d->states[0].elems, &follows);
- separate_contexts = state_separate_contexts (&follows);
- state = state_index (d, &follows, separate_contexts ^ CTX_ANY);
- if (separate_contexts & CTX_NEWLINE)
- state_newline = state_index (d, &follows, CTX_NEWLINE);
- else
- state_newline = state;
- if (separate_contexts & CTX_LETTER)
- state_letter = state_index (d, &follows, CTX_LETTER);
- else
- state_letter = state;
+ state_newline = 0;
+ state_letter = d->min_trcount - 1;
+ state = d->initstate_notbol;
for (i = 0; i < NOTCHAR; ++i)
trans[i] = unibyte_word_constituent (d, i) ? state_letter : state;
@@ -3075,16 +3069,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp,
Both P and MBP must be no larger than END. */
static unsigned char const *
skip_remains_mb (struct dfa *d, unsigned char const *p,
- unsigned char const *mbp, char const *end, wint_t *wcp)
+ unsigned char const *mbp, char const *end)
{
- wint_t wc = WEOF;
+ wint_t wc;
if (d->syntax.never_trail[*p])
return p;
while (mbp < p)
mbp += mbs_to_wchar (&wc, (char const *) mbp,
end - (char const *) mbp, d);
- if (wcp != NULL)
- *wcp = wc;
return mbp;
}
@@ -3141,46 +3133,22 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
for (;;)
{
- if (multibyte)
+ while ((t = trans[s]) != NULL)
{
- while ((t = trans[s]) != NULL)
+ if (s < d->min_trcount)
{
- s1 = s;
-
- if (s < d->min_trcount)
+ if (!multibyte || d->states[s].mbps.nelem == 0)
{
- if (d->min_trcount == 1)
- {
- if (d->states[s].mbps.nelem == 0)
- {
- do
- {
- while (t[*p] == 0)
- p++;
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- while (t[*p] == 0);
- }
- else
- p = mbp = skip_remains_mb (d, p, mbp, end, NULL);
- }
- else
- {
- wint_t wc;
- mbp = skip_remains_mb (d, p, mbp, end, &wc);
-
- /* If d->min_trcount is greater than 1, maybe
- transit to another initial state after skip. */
- if (p < mbp)
- {
- /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE
- cannot happen, as we assume that a newline
- is always a single byte character. */
- s1 = s = d->initstate_notbol;
- p = mbp;
- }
- }
+ while (t[*p] == s)
+ p++;
}
+ if (multibyte)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+ }
+
+ if (multibyte)
+ {
+ s1 = s;
if (d->states[s].mbps.nelem == 0
|| d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end)
@@ -3196,22 +3164,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
trans = d->trans;
}
}
- }
- else
- {
- if (s == 0)
- {
- t = trans[s];
- if (t)
- {
- while (t[*p] == 0)
- p++;
- s1 = 0;
- s = t[*p++];
- }
- }
-
- while ((t = trans[s]) != NULL)
+ else
{
s1 = t[*p++];
t = trans[s1];
@@ -3222,6 +3175,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
s1 = tmp; /* swap */
break;
}
+ if (s < d->min_trcount)
+ {
+ while (t[*p] == s1)
+ p++;
+ }
s = t[*p++];
}
}
@@ -3246,6 +3204,9 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl,
if (d->success[s] & d->syntax.sbit[*p])
goto done;
+ if (multibyte && s < d->min_trcount)
+ p = mbp = skip_remains_mb (d, p, mbp, end);
+
s1 = s;
if (!multibyte || d->states[s].mbps.nelem == 0
|| (*p == eol && !allow_nl)
@@ -3813,9 +3774,11 @@ dfamust (struct dfa const *d)
bool exact = false;
bool begline = false;
bool endline = false;
+ size_t rj;
bool need_begline = false;
bool need_endline = false;
bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1;
+ struct dfamust *dm;
for (ri = 0; ri < d->tindex; ++ri)
{
@@ -3992,7 +3955,7 @@ dfamust (struct dfa const *d)
}
}
- size_t rj = ri + 2;
+ rj = ri + 2;
if (d->tokens[ri + 1] == CAT)
{
for (; rj < d->tindex - 1; rj += 2)
@@ -4021,7 +3984,7 @@ dfamust (struct dfa const *d)
}
done:;
- struct dfamust *dm = NULL;
+ dm = NULL;
if (*result)
{
dm = xmalloc (sizeof *dm);
@@ -4057,7 +4020,7 @@ dfaalloc (void)
/* Initialize DFA. */
void
dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
- reg_syntax_t bits, bool fold, unsigned char eol)
+ reg_syntax_t bits, int dfaopts)
{
int i;
memset (dfa, 0, offsetof (struct dfa, dfaexec));
@@ -4070,9 +4033,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo,
dfa->canychar = -1;
dfa->lex.cur_mb_len = 1;
dfa->syntax.syntax_bits_set = true;
+ dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0;
+ dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0;
+ dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n';
dfa->syntax.syntax_bits = bits;
- dfa->syntax.case_fold = fold;
- dfa->syntax.eolbyte = eol;
for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
{
diff --git a/dfa.h b/dfa.h
index 1fd37ec..8608b10 100644
--- a/dfa.h
+++ b/dfa.h
@@ -26,7 +26,11 @@
#endif /* HAVE_STDBOOL_H */
#include <stddef.h>
-#define _GL_ATTRIBUTE_MALLOC
+#if 3 <= __GNUC__
+# define _GL_ATTRIBUTE_MALLOC __attribute__ ((__malloc__))
+#else
+# define _GL_ATTRIBUTE_MALLOC
+#endif
struct localeinfo; /* See localeinfo.h. */
@@ -50,15 +54,29 @@ struct dfa;
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
+/* DFA options that can be ORed together, for dfasyntax's 4th arg. */
+enum
+ {
+ /* ^ and $ match only the start and end of data, and do not match
+ end-of-line within data. This is always false for grep, but
+ possibly true for other apps. */
+ DFA_ANCHOR = 1 << 0,
+
+ /* Ignore case while matching. */
+ DFA_CASE_FOLD = 1 << 1,
+
+ /* '\0' in data is end-of-line, instead of the traditional '\n'. */
+ DFA_EOL_NUL = 1 << 2
+ };
+
/* Initialize or reinitialize a DFA. This must be called before
any of the routines below. The arguments are:
1. The DFA to operate on.
2. Information about the current locale.
- 3. The syntax bits described earlier in this file.
- 4. The case-folding flag.
- 5. The line terminator. */
+ 3. Syntax bits described in regex.h.
+ 4. Additional DFA options described above. */
extern void dfasyntax (struct dfa *, struct localeinfo const *,
- reg_syntax_t, bool, unsigned char);
+ reg_syntax_t, int);
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
diff --git a/helpers/ChangeLog b/helpers/ChangeLog
index 0958a02..6fc3af7 100644
--- a/helpers/ChangeLog
+++ b/helpers/ChangeLog
@@ -1,3 +1,7 @@
+2016-09-02 Paul Eggert
+
+ * testdfa.c: Adjust to DFA API changes.
+
2016-08-25 Arnold D. Robbins
* 4.1.4: Release tar ball made.
diff --git a/helpers/testdfa.c b/helpers/testdfa.c
index 4495e11..fa7715f 100644
--- a/helpers/testdfa.c
+++ b/helpers/testdfa.c
@@ -44,6 +44,7 @@
#define _Noreturn
#define _GL_ATTRIBUTE_PURE
#include "dfa.h"
+#include "localeinfo.h"
const char *regexflags2str(int flags);
char *databuf(int fd);
@@ -71,7 +72,8 @@ void usage(const char *myname)
int main(int argc, char **argv)
{
- int c, ret, try_backref;
+ int c, ret;
+ bool try_backref;
struct re_pattern_buffer pat;
struct re_registers regs;
struct dfa *dfareg;
@@ -84,6 +86,7 @@ int main(int argc, char **argv)
char save;
size_t count = 0;
char *place;
+ struct localeinfo localeinfo;
if (argc < 2)
usage(argv[0]);
@@ -158,7 +161,6 @@ int main(int argc, char **argv)
dfa_syn = syn;
if (ignorecase)
dfa_syn |= RE_ICASE;
- dfasyntax(dfa_syn, ignorecase, '\n');
re_set_syntax(syn);
if ((rerr = re_compile_pattern(pattern, len, & pat)) != NULL) {
@@ -171,6 +173,10 @@ int main(int argc, char **argv)
pat.newline_anchor = false; /* don't get \n in middle of string */
dfareg = dfaalloc();
+ init_localeinfo(&localeinfo);
+ dfasyntax(dfareg, &localeinfo, dfa_syn,
+ ignorecase ? DFA_CASE_FOLD : 0);
+
printf("Calling dfacomp(%s, %d, %p, true)\n",
pattern, (int) len, dfareg);
diff --git a/re.c b/re.c
index 6a100db..69cc50e 100644
--- a/re.c
+++ b/re.c
@@ -227,7 +227,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
rp->pat.newline_anchor = false; /* don't get \n in middle of string */
if (dfa && ! no_dfa) {
rp->dfareg = dfaalloc();
- dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n');
+ dfasyntax(rp->dfareg, & localeinfo, dfa_syn,
+ ignorecase ? DFA_CASE_FOLD : 0);
dfacomp(buf, len, rp->dfareg, true);
} else
rp->dfareg = NULL;
--
2.7.4