bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Speedup matching of ^$ and cleanup


From: Paolo Bonzini
Subject: [PATCH] Speedup matching of ^$ and cleanup
Date: Fri, 11 Apr 2008 13:40:24 +0200
User-agent: Thunderbird 2.0.0.12 (Macintosh/20080213)

Regex matching has an optimization where only one match is tried for a regex anchored to the beginning of the buffer. While other anchors are resolved with the fastmap, this one allows further optimization and is special cased. However, because of a bug in create_cd_newstate, ^$ would be mistakenly treated as a non-anchoring match, and re_search_internal would try matching it at every position.

In fact, the bug is (almost) fixed by this hunk:

@@ -1682,8 +1680,6 @@ create_cd_newstate (const re_dfa_t *dfa,
        newstate->halt = 1;
       else if (type == OP_BACK_REF)
        newstate->has_backref = 1;
-      else if (type == ANCHOR)
-       constraint = node->opr.ctx_type;

       if (constraint)
        {

However, some complications in building the NFA prevent this from fixing the problem. Therefore, this patch cleans up the handling of anchors so that tests on type == ANCHOR are not necessary anymore. When creating the NFA (calc_first), I move the opr.ctx_type to the constraint field of re_token_t, and then I always look at it unconditionally, without special-casing ANCHORs. This also allows some simplification of duplicate_node_closure.

I attach a patch for glibc (against CVS), and one for gnulib, asking for gnulib approval, and glibc approval+commit. I tested both using the GNU sed testsuite.

Thanks,

Paolo
2008-04-11  Paolo Bonzini  <address@hidden>

        * posix/regcomp.c (optimize_utf8): Add a note on why we test 
opr.ctx_type.
        (calc_first): Initialize constraint field.
        (duplicate_node_closure): Use it instead of special casing ANCHORS.
        Use search_duplicated_node to avoid loops.  Fix grammar.
        (duplicate_node): Merge constraint field for all node types.
        (calc_eclosure_iter): Look at constraint field for all node types.
        * posix/regex_internal.c (create_cd_newstate)): Don't look at
        create_cd_newstate.

--- regcomp.c.pb        2008-04-11 11:47:32.000000000 +0200
+++ regcomp.c   2008-04-11 12:03:42.000000000 +0200
@@ -1038,7 +1038,9 @@ optimize_utf8 (re_dfa_t *dfa)
          case BUF_LAST:
            break;
          default:
-           /* Word anchors etc. cannot be handled.  */
+           /* Word anchors etc. cannot be handled.  It's okay to test
+              opr.ctx_type since constraints (for all DFA nodes) are
+              created by ORing one or more opr.ctx_type values.  */
            return;
          }
        break;
@@ -1318,6 +1320,8 @@ calc_first (void *extra, bin_tree_t *nod
       node->node_idx = re_dfa_add_node (dfa, node->token);
       if (BE (node->node_idx == -1, 0))
         return REG_ESPACE;
+      if (node->token.type == ANCHOR)
+        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
     }
   return REG_NOERROR;
 }
@@ -1446,22 +1450,17 @@ duplicate_node_closure (re_dfa_t *dfa, i
             destination.  */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
-         if (dfa->nodes[org_node].type == ANCHOR)
+         /* If the node is root_node itself, it means the epsilon clsoure
+            has a loop.   Then tie it to the destination of the root_node.  */
+         if (org_node == root_node && clone_node != org_node)
            {
-             /* In case of the node has another constraint, append it.  */
-             if (org_node == root_node && clone_node != org_node)
-               {
-                 /* ...but if the node is root_node itself, it means the
-                    epsilon closure have a loop, then tie it to the
-                    destination of the root_node.  */
-                 ret = re_node_set_insert (dfa->edests + clone_node,
-                                           org_dest);
-                 if (BE (ret < 0, 0))
-                   return REG_ESPACE;
-                 break;
-               }
-             constraint |= dfa->nodes[org_node].opr.ctx_type;
+             ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
+             if (BE (ret < 0, 0))
+               return REG_ESPACE;
+             break;
            }
+         /* In case of the node has another constraint, add it.  */
+         constraint |= dfa->nodes[org_node].constraint;
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (BE (clone_dest == -1, 0))
            return REG_ESPACE;
@@ -1479,7 +1478,7 @@ duplicate_node_closure (re_dfa_t *dfa, i
          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
          if (clone_dest == -1)
            {
-             /* There are no such a duplicated node, create a new one.  */
+             /* There is no such duplicated node, create a new one.  */
              reg_errcode_t err;
              clone_dest = duplicate_node (dfa, org_dest, constraint);
              if (BE (clone_dest == -1, 0))
@@ -1494,7 +1493,7 @@ duplicate_node_closure (re_dfa_t *dfa, i
            }
          else
            {
-             /* There are a duplicated node which satisfy the constraint,
+             /* There is a duplicated node which satisfies the constraint,
                 use it to avoid infinite loop.  */
              ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (BE (ret < 0, 0))
@@ -1543,8 +1542,7 @@ duplicate_node (re_dfa_t *dfa, int org_i
   if (BE (dup_idx != -1, 1))
     {
       dfa->nodes[dup_idx].constraint = constraint;
-      if (dfa->nodes[org_idx].type == ANCHOR)
-       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
+      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
       dfa->nodes[dup_idx].duplicated = 1;
 
       /* Store the index of the original node.  */
@@ -1624,7 +1622,6 @@ static reg_errcode_t
 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
 {
   reg_errcode_t err;
-  unsigned int constraint;
   int i, incomplete;
   re_node_set eclosure;
   incomplete = 0;
@@ -1636,15 +1633,14 @@ calc_eclosure_iter (re_node_set *new_set
      We reference this value to avoid infinite loop.  */
   dfa->eclosures[node].nelem = -1;
 
-  constraint = ((dfa->nodes[node].type == ANCHOR)
-               ? dfa->nodes[node].opr.ctx_type : 0);
-  /* If the current node has constraints, duplicate all nodes.
-     Since they must inherit the constraints.  */
-  if (constraint
+  /* If the current node has constraints, duplicate all nodes
+     since they must inherit the constraints.  */
+  if (dfa->nodes[node].constraint
       && dfa->edests[node].nelem
       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
     {
-      err = duplicate_node_closure (dfa, node, node, node, constraint);
+      err = duplicate_node_closure (dfa, node, node, node,
+                                   dfa->nodes[node].constraint);
       if (BE (err != REG_NOERROR, 0))
        return err;
     }
--- regex_internal.c.pb 2008-04-11 11:47:41.000000000 +0200
+++ regex_internal.c    2008-04-11 11:52:20.000000000 +0200
@@ -1665,11 +1665,9 @@ create_cd_newstate (const re_dfa_t *dfa,
 
   for (i = 0 ; i < nodes->nelem ; i++)
     {
-      unsigned int constraint = 0;
       re_token_t *node = dfa->nodes + nodes->elems[i];
       re_token_type_t type = node->type;
-      if (node->constraint)
-       constraint = node->constraint;
+      unsigned int constraint = node->constraint;
 
       if (type == CHARACTER && !constraint)
        continue;
@@ -1682,8 +1680,6 @@ create_cd_newstate (const re_dfa_t *dfa,
        newstate->halt = 1;
       else if (type == OP_BACK_REF)
        newstate->has_backref = 1;
-      else if (type == ANCHOR)
-       constraint = node->opr.ctx_type;
 
       if (constraint)
        {
2008-04-11  Paolo Bonzini  <address@hidden>

        * lib/regcomp.c (optimize_utf8): Add a note on why we test opr.ctx_type.
        (calc_first): Initialize constraint field.
        (duplicate_node_closure): Use it instead of special casing ANCHORS.
        Use search_duplicated_node to avoid loops.  Fix grammar.
        (duplicate_node): Merge constraint field for all node types.
        (calc_eclosure_iter): Look at constraint field for all node types.
        * lib/regex_internal.c (create_cd_newstate)): Don't look at
        create_cd_newstate.

diff --git a/lib/regcomp.c b/lib/regcomp.c
index a02418d..dc15666 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -1057,7 +1057,9 @@ optimize_utf8 (re_dfa_t *dfa)
          case BUF_LAST:
            break;
          default:
-           /* Word anchors etc. cannot be handled.  */
+           /* Word anchors etc. cannot be handled.  It's okay to test
+              opr.ctx_type since constraints (for all DFA nodes) are
+              created by ORing one or more opr.ctx_type values.  */
            return;
          }
        break;
@@ -1344,6 +1346,8 @@ calc_first (void *extra, bin_tree_t *node)
       node->node_idx = re_dfa_add_node (dfa, node->token);
       if (BE (node->node_idx == REG_MISSING, 0))
         return REG_ESPACE;
+      if (node->token.type == ANCHOR)
+        dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
     }
   return REG_NOERROR;
 }
@@ -1473,21 +1477,18 @@ duplicate_node_closure (re_dfa_t *dfa, Idx 
top_org_node, Idx top_clone_node,
             destination.  */
          org_dest = dfa->edests[org_node].elems[0];
          re_node_set_empty (dfa->edests + clone_node);
-         if (dfa->nodes[org_node].type == ANCHOR)
+         clone_dest = search_duplicated_node (dfa, org_dest, constraint);
+         /* If the node is root_node itself, it means the epsilon closure
+            has a loop.  Then tie it to the destination of the root_node.  */
+         if (org_node == root_node && clone_node != org_node)
            {
-             /* In case of the node has another constraint, append it.  */
-             if (org_node == root_node && clone_node != org_node)
-               {
-                 /* ...but if the node is root_node itself, it means the
-                    epsilon closure have a loop, then tie it to the
-                    destination of the root_node.  */
-                 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
-                 if (BE (! ok, 0))
-                   return REG_ESPACE;
-                 break;
-               }
-             constraint |= dfa->nodes[org_node].opr.ctx_type;
+             ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
+             if (BE (! ok, 0))
+               return REG_ESPACE;
+             break;
            }
+         /* In case the node has another constraint, append it.  */
+         constraint |= dfa->nodes[org_node].constraint;
          clone_dest = duplicate_node (dfa, org_dest, constraint);
          if (BE (clone_dest == REG_MISSING, 0))
            return REG_ESPACE;
@@ -1505,7 +1506,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, 
Idx top_clone_node,
          clone_dest = search_duplicated_node (dfa, org_dest, constraint);
          if (clone_dest == REG_MISSING)
            {
-             /* There are no such a duplicated node, create a new one.  */
+             /* There is no such duplicated node, create a new one.  */
              reg_errcode_t err;
              clone_dest = duplicate_node (dfa, org_dest, constraint);
              if (BE (clone_dest == REG_MISSING, 0))
@@ -1520,7 +1521,7 @@ duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, 
Idx top_clone_node,
            }
          else
            {
-             /* There are a duplicated node which satisfy the constraint,
+             /* There is a duplicated node which satisfy the constraint,
                 use it to avoid infinite loop.  */
              ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
              if (BE (! ok, 0))
@@ -1569,8 +1570,7 @@ duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int 
constraint)
   if (BE (dup_idx != REG_MISSING, 1))
     {
       dfa->nodes[dup_idx].constraint = constraint;
-      if (dfa->nodes[org_idx].type == ANCHOR)
-       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
+      dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
       dfa->nodes[dup_idx].duplicated = 1;
 
       /* Store the index of the original node.  */
@@ -1652,7 +1652,6 @@ static reg_errcode_t
 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
 {
   reg_errcode_t err;
-  unsigned int constraint;
   Idx i;
   bool incomplete;
   bool ok;
@@ -1666,15 +1665,14 @@ calc_eclosure_iter (re_node_set *new_set, re_dfa_t 
*dfa, Idx node, bool root)
      We reference this value to avoid infinite loop.  */
   dfa->eclosures[node].nelem = REG_MISSING;
 
-  constraint = ((dfa->nodes[node].type == ANCHOR)
-               ? dfa->nodes[node].opr.ctx_type : 0);
-  /* If the current node has constraints, duplicate all nodes.
-     Since they must inherit the constraints.  */
-  if (constraint
+  /* If the current node has constraints, duplicate all nodes
+     since they must inherit the constraints.  */
+  if (dfa->nodes[node].constraint
       && dfa->edests[node].nelem
       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
     {
-      err = duplicate_node_closure (dfa, node, node, node, constraint);
+      err = duplicate_node_closure (dfa, node, node, node,
+                                   dfa->nodes[node].constraint);
       if (BE (err != REG_NOERROR, 0))
        return err;
     }
diff --git a/lib/regex_internal.c b/lib/regex_internal.c
index 2129888..7f9a3ae 100644
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -1689,11 +1689,9 @@ create_cd_newstate (const re_dfa_t *dfa, const 
re_node_set *nodes,
 
   for (i = 0 ; i < nodes->nelem ; i++)
     {
-      unsigned int constraint = 0;
       re_token_t *node = dfa->nodes + nodes->elems[i];
       re_token_type_t type = node->type;
-      if (node->constraint)
-       constraint = node->constraint;
+      unsigned int constraint = node->constraint;
 
       if (type == CHARACTER && !constraint)
        continue;
@@ -1706,8 +1704,6 @@ create_cd_newstate (const re_dfa_t *dfa, const 
re_node_set *nodes,
        newstate->halt = 1;
       else if (type == OP_BACK_REF)
        newstate->has_backref = 1;
-      else if (type == ANCHOR)
-       constraint = node->opr.ctx_type;
 
       if (constraint)
        {

reply via email to

[Prev in Thread] Current Thread [Next in Thread]