bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [RFC PATCH] dfa: make position_sets grow automatically


From: Jim Meyering
Subject: Re: [RFC PATCH] dfa: make position_sets grow automatically
Date: Mon, 04 Jul 2011 15:46:02 +0200

Paolo Bonzini wrote:
> Inspired by the recent heap overrun patch, I propose to make all
> position_sets grow dynamically.  What do you think?
>
> Paolo Bonzini (5):
>   dfa: use a separate data type for grps
>   dfa: introduce alloc_posset
>   dfa: remove dead assignment
>   dfa: move nalloc to position_set structure
>   dfa: automatically resize position_sets
>
>  src/dfa.c |   85 
> +++++++++++++++++++++++++++++++++----------------------------
>  1 files changed, 46 insertions(+), 39 deletions(-)

Thanks.  This looks like a fine improvement.
It passed all of my tests.

However, I've included a few suggestions below
to make it a little more readable.

> Subject: [PATCH 1/5] dfa: use a separate data type for grps
>
> * src/dfa.c (leaf_set): New.
> (dfastate): Use leaf_set for grps, as the constraint field is unused.
> ---
>  src/dfa.c |   29 +++++++++++++++++++----------
>  1 files changed, 19 insertions(+), 10 deletions(-)
>
> diff --git a/src/dfa.c b/src/dfa.c
> index a36b80a..d875517 100644
> --- a/src/dfa.c
> +++ b/src/dfa.c
> @@ -244,6 +244,13 @@ typedef struct
>    int nelem;                 /* Number of elements in this set. */
>  } position_set;
>
> +/* Sets of leaves are also stored as arrays. */
> +typedef struct
> +{
> +  unsigned int *elems;               /* Elements of this position set. */
> +  int nelem;                 /* Number of elements in this set. */

Please make this size_t or at least "unsigned int", not "int".
(I see that the existing position_set also uses "int",
but it too should be changed, eventually)

> +} leaf_set;
...

> Subject: [PATCH 2/5] dfa: introduce alloc_posset
>
> * src/dfa.c (alloc_posset): New function, use it throughout.
...

> Subject: [PATCH 3/5] dfa: remove dead assignment
>
> * src/dfa.c (transit_state): transit_state_consume_1char will clear follows,
> do not do this ourselves.
...

> Subject: [PATCH 4/5] dfa: move nalloc to position_set structure
>
> * src/dfa.c (position_set): Add alloc.
> (alloc_posset): Initialize it.
> (dfaanalyze): Use it instead of the nalloc array or nelem.
> ---
>  src/dfa.c |   16 +++++++---------
>  1 files changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/src/dfa.c b/src/dfa.c
> index f174cc9..2d3bb6f 100644
> --- a/src/dfa.c
> +++ b/src/dfa.c
> @@ -242,6 +242,7 @@ typedef struct
>  {
>    position *elems;           /* Elements of this position set. */
>    int nelem;                 /* Number of elements in this set. */
> +  int alloc;                    /* Number of elements allocated in ELEMS.  */

Please make this size_t, not int.
All uses are consistent with using an unsigned type like size_t.

Also, I find that n_alloc is more readable than "alloc".
Similarly, I would be inclined to change "nelem" to "n_used",
to avoid confusion.  The former could easily be confused with the
number of allocated elements.  However, changing "nelem" is perhaps
better done separately from this series.

Finally, be consistent in using TABs to indent across to the
aligned comment, to line up with the two preceding declarations.
...

> Subject: [PATCH 5/5] dfa: automatically resize position_sets
>
> * src/dfa.c (insert, copy, merge): Resize arrays here.
> (dfaanalyze): Do not track number of allocated elements here.
> (dfastate): Allocate mbps with only one element.

I actually made the latter int->size_t change and was surprised
to see that it caused nearly every use of grep to fail.
That prompted me to write this patch
to fix the problems thus exposed:

diff --git a/src/dfa.c b/src/dfa.c
index 467cc2b..b1806af 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -242,7 +242,7 @@ typedef struct
 {
   position *elems;             /* Elements of this position set. */
   int nelem;                   /* Number of elements in this set. */
-  int alloc;                    /* Number of elements allocated in ELEMS.  */
+  size_t alloc;                    /* Number of elements allocated in ELEMS.  
*/
 } position_set;

 /* Sets of leaves are also stored as arrays. */
@@ -409,14 +409,17 @@ static void regexp (void);
 #define REALLOC(p, t, n) ((p) = xnrealloc (p, n, sizeof (t)))

 /* Reallocate an array of type t if nalloc is too small for index. */
-#define REALLOC_IF_NECESSARY(p, t, nalloc, index)       \
-  do                                                    \
-    if ((nalloc) <= (index))                            \
-      {                                                 \
-        size_t new_nalloc = (index) + ! (p);            \
-        (p) = x2nrealloc (p, &new_nalloc, sizeof (t));  \
-        (nalloc) = new_nalloc;                          \
-      }                                                 \
+#define REALLOC_IF_NECESSARY(p, t, nalloc, index)         \
+  do                                                      \
+    {                                                     \
+      assert (sizeof (t) == sizeof *(p));                 \
+      if (0 < (index) && (nalloc) <= (index))             \
+        {                                                 \
+          size_t new_nalloc = (index) + ! (p);            \
+          (p) = x2nrealloc (p, &new_nalloc, sizeof (t));  \
+          (nalloc) = new_nalloc;                          \
+        }                                                 \
+    }                                                     \
   while (false)


First, just for readability, I prefer to enclose multi-line-but-single-stmt
blocks with braces, so that when we insert a line at the "top" of the
loop, we don't mistakenly displace the sole statement.

The important change is the added "0 < (index)" conjunct.
Otherwise we end up comparing unsigned and signed, because with
the existing semantics, we sometimes get an "index" value of -1.

I added the assertion to show that we can omit the "t" parameter.

Finally, to avoid the risk of error with negative "index", I suggest to
change the semantics by incrementing that argument by 1 at each macro use
and changing the name to "n_items".

When I went to do that, it appears that this use is already incremented:

  if (MB_CUR_MAX > 1)
    {
      REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
                           dfa->mbcsets_alloc, dfa->nmbcsets + 1);

      /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
         We will update dfa->multibyte_prop[] in addtok(), because we can't
         decide the index in dfa->tokens[].  */

      /* Initialize work area.  */
      work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);
      memset (work_mbc, 0, sizeof *work_mbc);
    }

as well as this one and a few more:

                      REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
                                           ch_classes_al,
                                           work_mbc->nch_classes + 1);
                      work_mbc->ch_classes[work_mbc->nch_classes++] = wt;

Here's a complete patch:

diff --git a/src/dfa.c b/src/dfa.c
index 467cc2b..dd323a6 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -242,7 +242,7 @@ typedef struct
 {
   position *elems;             /* Elements of this position set. */
   int nelem;                   /* Number of elements in this set. */
-  int alloc;                    /* Number of elements allocated in ELEMS.  */
+  size_t alloc;                    /* Number of elements allocated in ELEMS.  
*/
 } position_set;

 /* Sets of leaves are also stored as arrays. */
@@ -409,14 +409,17 @@ static void regexp (void);
 #define REALLOC(p, t, n) ((p) = xnrealloc (p, n, sizeof (t)))

 /* Reallocate an array of type t if nalloc is too small for index. */
-#define REALLOC_IF_NECESSARY(p, t, nalloc, index)       \
-  do                                                    \
-    if ((nalloc) <= (index))                            \
-      {                                                 \
-        size_t new_nalloc = (index) + ! (p);            \
-        (p) = x2nrealloc (p, &new_nalloc, sizeof (t));  \
-        (nalloc) = new_nalloc;                          \
-      }                                                 \
+#define REALLOC_IF_NECESSARY(p, nalloc, n_required)       \
+  do                                                      \
+    {                                                     \
+      assert (0 <= (n_required));                         \
+      if ((nalloc) <= (n_required))                       \
+        {                                                 \
+          size_t new_nalloc = (n_required) + !(p);        \
+          (p) = x2nrealloc (p, &new_nalloc, sizeof (*p)); \
+          (nalloc) = new_nalloc;                          \
+        }                                                 \
+    }                                                     \
   while (false)


@@ -520,7 +523,7 @@ charclass_index (charclass const s)
   for (i = 0; i < dfa->cindex; ++i)
     if (equal(s, dfa->charclasses[i]))
       return i;
-  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);
+  REALLOC_IF_NECESSARY(dfa->charclasses, dfa->calloc, dfa->cindex + 1);
   ++dfa->cindex;
   copyset(s, dfa->charclasses[i]);
   return i;
@@ -788,8 +791,7 @@ parse_bracket_exp (void)
   ch_classes_al = equivs_al = coll_elems_al = 0;
   if (MB_CUR_MAX > 1)
     {
-      REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,
-                           dfa->mbcsets_alloc, dfa->nmbcsets + 1);
+      REALLOC_IF_NECESSARY(dfa->mbcsets, dfa->mbcsets_alloc, dfa->nmbcsets + 
1);

       /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.
          We will update dfa->multibyte_prop[] in addtok(), because we can't
@@ -873,7 +875,7 @@ parse_bracket_exp (void)

                       if (ch_classes_al == 0)
                         MALLOC(work_mbc->ch_classes, wctype_t, 
++ch_classes_al);
-                      REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,
+                      REALLOC_IF_NECESSARY(work_mbc->ch_classes,
                                            ch_classes_al,
                                            work_mbc->nch_classes + 1);
                       work_mbc->ch_classes[work_mbc->nch_classes++] = wt;
@@ -897,7 +899,7 @@ parse_bracket_exp (void)
                     {
                       if (equivs_al == 0)
                         MALLOC(work_mbc->equivs, char*, ++equivs_al);
-                      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,
+                      REALLOC_IF_NECESSARY(work_mbc->equivs,
                                            equivs_al,
                                            work_mbc->nequivs + 1);
                       work_mbc->equivs[work_mbc->nequivs++] = elem;
@@ -908,7 +910,7 @@ parse_bracket_exp (void)
                     {
                       if (coll_elems_al == 0)
                         MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);
-                      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,
+                      REALLOC_IF_NECESSARY(work_mbc->coll_elems,
                                            coll_elems_al,
                                            work_mbc->ncoll_elems + 1);
                       work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
@@ -961,9 +963,9 @@ parse_bracket_exp (void)
                   MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);
                   MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);
                 }
-              REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+              REALLOC_IF_NECESSARY(work_mbc->range_sts,
                                    range_sts_al, work_mbc->nranges + 1);
-              REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+              REALLOC_IF_NECESSARY(work_mbc->range_ends,
                                    range_ends_al, work_mbc->nranges + 1);
               work_mbc->range_sts[work_mbc->nranges] =
                 case_fold ? towlower(wc) : (wchar_t)wc;
@@ -973,10 +975,10 @@ parse_bracket_exp (void)
 #ifndef GREP
               if (case_fold && (iswalpha(wc) || iswalpha(wc2)))
                 {
-                  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                  REALLOC_IF_NECESSARY(work_mbc->range_sts,
                                        range_sts_al, work_mbc->nranges + 1);
                   work_mbc->range_sts[work_mbc->nranges] = towupper(wc);
-                  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                  REALLOC_IF_NECESSARY(work_mbc->range_ends,
                                        range_ends_al, work_mbc->nranges + 1);
                   work_mbc->range_ends[work_mbc->nranges++] = towupper(wc2);
                 }
@@ -1028,7 +1030,7 @@ parse_bracket_exp (void)
               wc = towlower(wc);
               if (!setbit_wc (wc, ccl))
                 {
-                  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+                  REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
                                        work_mbc->nchars + 1);
                   work_mbc->chars[work_mbc->nchars++] = wc;
                 }
@@ -1040,7 +1042,7 @@ parse_bracket_exp (void)
             }
           if (!setbit_wc (wc, ccl))
             {
-              REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,
+              REALLOC_IF_NECESSARY(work_mbc->chars, chars_al,
                                    work_mbc->nchars + 1);
               work_mbc->chars[work_mbc->nchars++] = wc;
             }
@@ -1423,15 +1425,15 @@ addtok_mb (token t, int mbprop)
 #if MBS_SUPPORT
   if (MB_CUR_MAX > 1)
     {
-      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,
-                           dfa->tindex);
+      REALLOC_IF_NECESSARY(dfa->multibyte_prop, dfa->nmultibyte_prop,
+                           dfa->tindex + 1);
       dfa->multibyte_prop[dfa->tindex] = mbprop;
     }
 #else
   (void) mbprop;
 #endif

-  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
+  REALLOC_IF_NECESSARY(dfa->tokens, dfa->talloc, dfa->tindex + 1);
   dfa->tokens[dfa->tindex++] = t;

   switch (t)
@@ -1839,7 +1841,7 @@ dfaparse (char const *s, size_t len, struct dfa *d)
 static void
 copy (position_set const *src, position_set *dst)
 {
-  REALLOC_IF_NECESSARY(dst->elems, position, dst->alloc, src->nelem - 1);
+  REALLOC_IF_NECESSARY(dst->elems, dst->alloc, src->nelem);
   memcpy(dst->elems, src->elems, sizeof(dst->elems[0]) * src->nelem);
   dst->nelem = src->nelem;
 }
@@ -1877,7 +1879,7 @@ insert (position p, position_set *s)
       return;
     }

-  REALLOC_IF_NECESSARY(s->elems, position, s->alloc, count);
+  REALLOC_IF_NECESSARY(s->elems, s->alloc, count + 1);
   for (i = count; i > lo; i--)
     s->elems[i] = s->elems[i - 1];
   s->elems[lo] = p;
@@ -1891,7 +1893,7 @@ merge (position_set const *s1, position_set const *s2, 
position_set *m)
 {
   int i = 0, j = 0;

-  REALLOC_IF_NECESSARY(m->elems, position, m->alloc, s1->nelem + s2->nelem - 
1);
+  REALLOC_IF_NECESSARY(m->elems, m->alloc, s1->nelem + s2->nelem);
   m->nelem = 0;
   while (i < s1->nelem && j < s2->nelem)
     if (s1->elems[i].index > s2->elems[j].index)
@@ -1956,7 +1958,7 @@ state_index (struct dfa *d, position_set const *s, int 
newline, int letter)
     }

   /* We'll have to create a new state. */
-  REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex);
+  REALLOC_IF_NECESSARY(d->states, d->salloc, d->sindex + 1);
   d->states[i].hash = hash;
   alloc_posset(&d->states[i].elems, s->nelem);
   copy(s, &d->states[i].elems);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]