>From 08da80735ce86291350092c312681583184a526a Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Fri, 2 Sep 2016 15:53:28 -0700 Subject: [PATCH] Sync dfa.c with grep. --- ChangeLog | 5 +++ dfa.c | 114 +++++++++++++++++++----------------------------------- dfa.h | 28 +++++++++++--- helpers/ChangeLog | 4 ++ helpers/testdfa.c | 10 ++++- re.c | 3 +- 6 files changed, 81 insertions(+), 83 deletions(-) diff --git a/ChangeLog b/ChangeLog index 3288f48..f5f4e7f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2016-09-02 Paul Eggert + + * dfa.c, dfa.h: Sync with grep. + * re.c (make_regexp): Adjust to DFA API changes. + 2016-09-02 Arnold D. Robbins * dfa.c: Sync with grep. diff --git a/dfa.c b/dfa.c index 5d68af2..39031e8 100644 --- a/dfa.c +++ b/dfa.c @@ -59,7 +59,6 @@ #define _(str) gettext (str) #include -#include #include "xalloc.h" @@ -363,6 +362,10 @@ struct regex_syntax /* Flag for case-folding letters into sets. */ bool case_fold; + /* True if ^ and $ match only the start and end of data, and do not match + end-of-line within data. */ + bool anchor; + /* End-of-line byte in data. */ unsigned char eolbyte; @@ -782,7 +785,7 @@ unibyte_word_constituent (struct dfa const *dfa, unsigned char c) static int char_context (struct dfa const *dfa, unsigned char c) { - if (c == dfa->syntax.eolbyte) + if (c == dfa->syntax.eolbyte && !dfa->syntax.anchor) return CTX_NEWLINE; if (unibyte_word_constituent (dfa, c)) return CTX_LETTER; @@ -2699,18 +2702,9 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) is to fail miserably. */ if (d->searchflag) { - /* Find the state(s) corresponding to the positions of state 0. */ - copy (&d->states[0].elems, &follows); - separate_contexts = state_separate_contexts (&follows); - state = state_index (d, &follows, separate_contexts ^ CTX_ANY); - if (separate_contexts & CTX_NEWLINE) - state_newline = state_index (d, &follows, CTX_NEWLINE); - else - state_newline = state; - if (separate_contexts & CTX_LETTER) - state_letter = state_index (d, &follows, CTX_LETTER); - else - state_letter = state; + state_newline = 0; + state_letter = d->min_trcount - 1; + state = d->initstate_notbol; for (i = 0; i < NOTCHAR; ++i) trans[i] = unibyte_word_constituent (d, i) ? state_letter : state; @@ -3075,16 +3069,14 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp, Both P and MBP must be no larger than END. */ static unsigned char const * skip_remains_mb (struct dfa *d, unsigned char const *p, - unsigned char const *mbp, char const *end, wint_t *wcp) + unsigned char const *mbp, char const *end) { - wint_t wc = WEOF; + wint_t wc; if (d->syntax.never_trail[*p]) return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); - if (wcp != NULL) - *wcp = wc; return mbp; } @@ -3141,46 +3133,22 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, for (;;) { - if (multibyte) + while ((t = trans[s]) != NULL) { - while ((t = trans[s]) != NULL) + if (s < d->min_trcount) { - s1 = s; - - if (s < d->min_trcount) + if (!multibyte || d->states[s].mbps.nelem == 0) { - if (d->min_trcount == 1) - { - if (d->states[s].mbps.nelem == 0) - { - do - { - while (t[*p] == 0) - p++; - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - while (t[*p] == 0); - } - else - p = mbp = skip_remains_mb (d, p, mbp, end, NULL); - } - else - { - wint_t wc; - mbp = skip_remains_mb (d, p, mbp, end, &wc); - - /* If d->min_trcount is greater than 1, maybe - transit to another initial state after skip. */ - if (p < mbp) - { - /* It's CTX_LETTER or CTX_NONE. CTX_NEWLINE - cannot happen, as we assume that a newline - is always a single byte character. */ - s1 = s = d->initstate_notbol; - p = mbp; - } - } + while (t[*p] == s) + p++; } + if (multibyte) + p = mbp = skip_remains_mb (d, p, mbp, end); + } + + if (multibyte) + { + s1 = s; if (d->states[s].mbps.nelem == 0 || d->localeinfo.sbctowc[*p] != WEOF || (char *) p >= end) @@ -3196,22 +3164,7 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, trans = d->trans; } } - } - else - { - if (s == 0) - { - t = trans[s]; - if (t) - { - while (t[*p] == 0) - p++; - s1 = 0; - s = t[*p++]; - } - } - - while ((t = trans[s]) != NULL) + else { s1 = t[*p++]; t = trans[s1]; @@ -3222,6 +3175,11 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, s1 = tmp; /* swap */ break; } + if (s < d->min_trcount) + { + while (t[*p] == s1) + p++; + } s = t[*p++]; } } @@ -3246,6 +3204,9 @@ dfaexec_main (struct dfa *d, char const *begin, char *end, bool allow_nl, if (d->success[s] & d->syntax.sbit[*p]) goto done; + if (multibyte && s < d->min_trcount) + p = mbp = skip_remains_mb (d, p, mbp, end); + s1 = s; if (!multibyte || d->states[s].mbps.nelem == 0 || (*p == eol && !allow_nl) @@ -3813,9 +3774,11 @@ dfamust (struct dfa const *d) bool exact = false; bool begline = false; bool endline = false; + size_t rj; bool need_begline = false; bool need_endline = false; bool case_fold_unibyte = d->syntax.case_fold && MB_CUR_MAX == 1; + struct dfamust *dm; for (ri = 0; ri < d->tindex; ++ri) { @@ -3992,7 +3955,7 @@ dfamust (struct dfa const *d) } } - size_t rj = ri + 2; + rj = ri + 2; if (d->tokens[ri + 1] == CAT) { for (; rj < d->tindex - 1; rj += 2) @@ -4021,7 +3984,7 @@ dfamust (struct dfa const *d) } done:; - struct dfamust *dm = NULL; + dm = NULL; if (*result) { dm = xmalloc (sizeof *dm); @@ -4057,7 +4020,7 @@ dfaalloc (void) /* Initialize DFA. */ void dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, - reg_syntax_t bits, bool fold, unsigned char eol) + reg_syntax_t bits, int dfaopts) { int i; memset (dfa, 0, offsetof (struct dfa, dfaexec)); @@ -4070,9 +4033,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const *linfo, dfa->canychar = -1; dfa->lex.cur_mb_len = 1; dfa->syntax.syntax_bits_set = true; + dfa->syntax.case_fold = (dfaopts & DFA_CASE_FOLD) != 0; + dfa->syntax.anchor = (dfaopts & DFA_ANCHOR) != 0; + dfa->syntax.eolbyte = dfaopts & DFA_EOL_NUL ? '\0' : '\n'; dfa->syntax.syntax_bits = bits; - dfa->syntax.case_fold = fold; - dfa->syntax.eolbyte = eol; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { diff --git a/dfa.h b/dfa.h index 1fd37ec..8608b10 100644 --- a/dfa.h +++ b/dfa.h @@ -26,7 +26,11 @@ #endif /* HAVE_STDBOOL_H */ #include -#define _GL_ATTRIBUTE_MALLOC +#if 3 <= __GNUC__ +# define _GL_ATTRIBUTE_MALLOC __attribute__ ((__malloc__)) +#else +# define _GL_ATTRIBUTE_MALLOC +#endif struct localeinfo; /* See localeinfo.h. */ @@ -50,15 +54,29 @@ struct dfa; calling dfafree() on it. */ extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC; +/* DFA options that can be ORed together, for dfasyntax's 4th arg. */ +enum + { + /* ^ and $ match only the start and end of data, and do not match + end-of-line within data. This is always false for grep, but + possibly true for other apps. */ + DFA_ANCHOR = 1 << 0, + + /* Ignore case while matching. */ + DFA_CASE_FOLD = 1 << 1, + + /* '\0' in data is end-of-line, instead of the traditional '\n'. */ + DFA_EOL_NUL = 1 << 2 + }; + /* Initialize or reinitialize a DFA. This must be called before any of the routines below. The arguments are: 1. The DFA to operate on. 2. Information about the current locale. - 3. The syntax bits described earlier in this file. - 4. The case-folding flag. - 5. The line terminator. */ + 3. Syntax bits described in regex.h. + 4. Additional DFA options described above. */ extern void dfasyntax (struct dfa *, struct localeinfo const *, - reg_syntax_t, bool, unsigned char); + reg_syntax_t, int); /* Build and return the struct dfamust from the given struct dfa. */ extern struct dfamust *dfamust (struct dfa const *); diff --git a/helpers/ChangeLog b/helpers/ChangeLog index 0958a02..6fc3af7 100644 --- a/helpers/ChangeLog +++ b/helpers/ChangeLog @@ -1,3 +1,7 @@ +2016-09-02 Paul Eggert + + * testdfa.c: Adjust to DFA API changes. + 2016-08-25 Arnold D. Robbins * 4.1.4: Release tar ball made. diff --git a/helpers/testdfa.c b/helpers/testdfa.c index 4495e11..fa7715f 100644 --- a/helpers/testdfa.c +++ b/helpers/testdfa.c @@ -44,6 +44,7 @@ #define _Noreturn #define _GL_ATTRIBUTE_PURE #include "dfa.h" +#include "localeinfo.h" const char *regexflags2str(int flags); char *databuf(int fd); @@ -71,7 +72,8 @@ void usage(const char *myname) int main(int argc, char **argv) { - int c, ret, try_backref; + int c, ret; + bool try_backref; struct re_pattern_buffer pat; struct re_registers regs; struct dfa *dfareg; @@ -84,6 +86,7 @@ int main(int argc, char **argv) char save; size_t count = 0; char *place; + struct localeinfo localeinfo; if (argc < 2) usage(argv[0]); @@ -158,7 +161,6 @@ int main(int argc, char **argv) dfa_syn = syn; if (ignorecase) dfa_syn |= RE_ICASE; - dfasyntax(dfa_syn, ignorecase, '\n'); re_set_syntax(syn); if ((rerr = re_compile_pattern(pattern, len, & pat)) != NULL) { @@ -171,6 +173,10 @@ int main(int argc, char **argv) pat.newline_anchor = false; /* don't get \n in middle of string */ dfareg = dfaalloc(); + init_localeinfo(&localeinfo); + dfasyntax(dfareg, &localeinfo, dfa_syn, + ignorecase ? DFA_CASE_FOLD : 0); + printf("Calling dfacomp(%s, %d, %p, true)\n", pattern, (int) len, dfareg); diff --git a/re.c b/re.c index 6a100db..69cc50e 100644 --- a/re.c +++ b/re.c @@ -227,7 +227,8 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal) rp->pat.newline_anchor = false; /* don't get \n in middle of string */ if (dfa && ! no_dfa) { rp->dfareg = dfaalloc(); - dfasyntax(rp->dfareg, & localeinfo, dfa_syn, ignorecase, '\n'); + dfasyntax(rp->dfareg, & localeinfo, dfa_syn, + ignorecase ? DFA_CASE_FOLD : 0); dfacomp(buf, len, rp->dfareg, true); } else rp->dfareg = NULL; -- 2.7.4