bug-gawk
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [bug-gawk] [PATCH] Arrange caller of dfa and regex.


From: Aharon Robbins
Subject: Re: [bug-gawk] [PATCH] Arrange caller of dfa and regex.
Date: Sun, 14 Aug 2016 21:39:21 +0300
User-agent: Heirloom mailx 12.5 6/20/10

Hi.

While waiting on your paperwork, I have worked on this patch a little
bit to make it (in my humble opinion) more readable.  Please review
and make sure that I didn't break anything. It still passes make check.
This is against current master.

Thanks!

Arnold
----------------
diff --git a/ChangeLog b/ChangeLog
index dbc826b..b70983c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2016-08-14         Norihiro Tanaka       <address@hidden>
+
+       * awk.h (struct Regexp): Remove dfa.  Use dfareg instead of it.  All
+       referrers changed.
+       (avoid_dfa): Removed.
+       * re.c (research): Arrange caller of dfaexec and re_search.
+       * (avoid_dfa): Removed.  All callers changed.
+
 2016-08-14         Arnold D. Robbins     <address@hidden>
 
        * re.c (make_regexp): Only call dfasyntax if actually using
diff --git a/awk.h b/awk.h
index 5587cbc..0039123 100644
--- a/awk.h
+++ b/awk.h
@@ -206,8 +206,7 @@ typedef struct Regexp {
        struct re_pattern_buffer pat;
        struct re_registers regs;
        struct dfa *dfareg;
-       short dfa;
-       short has_anchor;       /* speed up of avoid_dfa kludge, temporary */
+       short has_anchor;       /* re has anchors which dfa avoids */
        short non_empty;        /* for use in fpat_parse_field */
        short has_meta;         /* re has meta chars so (probably) isn't simple string */
        short maybe_long;       /* re has meta chars that can match long text */
@@ -1650,7 +1649,6 @@ extern void reg_error(const char *s);
 extern Regexp *re_update(NODE *t);
 extern void resyntax(int syntax);
 extern void resetup(void);
-extern int avoid_dfa(NODE *re, char *str, size_t len);
 extern int reisstring(const char *text, size_t len, Regexp *re, const char *buf);
 extern int get_numbase(const char *str, bool use_locale);
 
diff --git a/interpret.h b/interpret.h
index 6b832c1..ae7c8b3 100644
--- a/interpret.h
+++ b/interpret.h
@@ -832,8 +832,7 @@ mod:
                                t2 = TOP_SCALAR();      /* switch expression */
                                t2 = force_string(t2);
                                rp = re_update(m);
-                               di = (research(rp, t2->stptr, 0, t2->stlen,
-                                                       avoid_dfa(m, t2->stptr, t2->stlen)) >= 0);
+                               di = (research(rp, t2->stptr, 0, t2->stlen, 0) >= 0);
                        } else {
                                t1 = POP_SCALAR();      /* case value */
                                t2 = TOP_SCALAR();      /* switch expression */
@@ -996,20 +995,8 @@ arrayfor:
                        t1 = *get_field(0, (Func_ptr *) 0);
 match_re:
                        rp = re_update(m);
-                       /*
-                        * Any place where research() is called with a last parameter of
-                        * zero, we need to use the avoid_dfa test. This appears here and
-                        * in the code for Op_K_case.
-                        *
-                        * A new or improved dfa that distinguishes beginning/end of
-                        * string from beginning/end of line will allow us to get rid of
-                        * this hack.
-                        *
-                        * The avoid_dfa() function is in re.c; it is not very smart.
-                        */
 
-                       di = research(rp, t1->stptr, 0, t1->stlen,
-                                                               avoid_dfa(m, t1->stptr, t1->stlen));
+                       di = research(rp, t1->stptr, 0, t1->stlen, 0);
                        di = (di == -1) ^ (op != Op_nomatch);
                        if (op != Op_match_rec) {
                                decr_sp();
diff --git a/re.c b/re.c
index 593ed16..eece38e 100644
--- a/re.c
+++ b/re.c
@@ -170,7 +170,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
 
        emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
        memset((char *) rp, 0, sizeof(*rp));
-       rp->dfareg = NULL;
        rp->pat.allocated = 0;  /* regex will allocate the buffer */
        emalloc(rp->pat.fastmap, char *, 256, "make_regexp");
 
@@ -226,11 +225,10 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        /* gack. this must be done *after* re_compile_pattern */
        rp->pat.newline_anchor = false; /* don't get \n in middle of string */
        if (dfa && ! no_dfa) {
-               rp->dfa = true;
                rp->dfareg = dfaalloc();
                dfacomp(buf, len, rp->dfareg, true);
        } else
-               rp->dfa = false;
+               rp->dfareg = NULL;
        rp->has_anchor = has_anchor;
 
        /* Additional flags that help with RS as regexp. */
@@ -273,33 +271,30 @@ research(Regexp *rp, char *str, int start,
         * Always do dfa search if can; if it fails, we won't bother
         * with the regex search.
         *
-        * The dfa matcher doesn't have a no_bol flag, so don't bother
-        * trying it in that case.
-        *
         * 7/2016: The dfa matcher can't handle a case where searching
         * starts in the middle of a string, so don't bother trying it
         * in that case.
         */
-       if (rp->dfa && ! no_bol && start == 0) {
-               char save;
-               size_t count = 0;
+       if (rp->dfareg != NULL && start == 0) {
                struct dfa *superset = dfasuperset(rp->dfareg);
-               /*
-                * dfa likes to stick a '\n' right after the matched
-                * text.  So we just save and restore the character.
-                */
-               save = str[start+len];
+
                if (superset)
                        ret = dfaexec(superset, str+start, str+start+len,
                                                        true, NULL, NULL);
-               if (ret)
+
+               if (ret && ( (! need_start && ! no_bol && ! rp->has_anchor)
+                               || (! superset && dfaisfast(rp->dfareg))))
                        ret = dfaexec(rp->dfareg, str+start, str+start+len,
-                                               true, &count, &try_backref);
-               str[start+len] = save;
+                                               true, NULL, & try_backref);
        }
 
        if (ret) {
-               if (need_start || rp->dfa == false || try_backref) {
+               if (   rp->dfareg == NULL
+                   || start != 0
+                   || need_start
+                   || no_bol
+                   || rp->has_anchor
+                   || try_backref) {
                        /*
                         * Passing NULL as last arg speeds up search for cases
                         * where we don't need the start/end info.
@@ -328,7 +323,7 @@ refree(Regexp *rp)
                free(rp->regs.start);
        if (rp->regs.end)
                free(rp->regs.end);
-       if (rp->dfa) {
+       if (rp->dfareg) {
                dfafree(rp->dfareg);
                free(rp->dfareg);
        }
@@ -426,32 +421,6 @@ resetup()
        dfasyntax(syn, false, '\n');
 }
 
-/* avoid_dfa --- return true if we should not use the DFA matcher */
-
-int
-avoid_dfa(NODE *re, char *str, size_t len)
-{
-       char *end;
-
-       /*
-        * f = @/.../
-        * if ("foo" ~ f) ...
-        *
-        * This creates a Node_dynregex with NULL re_reg.
-        */
-       if (re->re_reg == NULL)
-               return false;
-
-       if (! re->re_reg->has_anchor)
-               return false;
-
-       for (end = str + len; str < end; str++)
-               if (*str == '\n')
-                       return true;
-
-       return false;
-}
-
 /* reisstring --- return true if the RE match is a simple string match */
 
 int



reply via email to

[Prev in Thread] Current Thread [Next in Thread]