[PATCH 1/4] dfa: wrap charclass inside a struct

bug-gnulib
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 1/4] dfa: wrap charclass inside a struct

From:	Paul Eggert
Subject:	[PATCH 1/4] dfa: wrap charclass inside a struct
Date:	Fri, 30 Dec 2016 01:03:19 -0800
This lets GCC check for aliases more accurately.
On my platform (gcc Ubuntu 5.4.0-6ubuntu1~16.04.4 x86-64,
en_US.utf8 locale) this makes 'grep -Fi -f list.txt list.txt >out'
about 1% faster, where list.txt is generated by 'aspell dump
master | head -n 100000 >list.txt'.  Also, it shrinks the grep
text by 64 bytes, woohoo!  See Bug#22239.
* lib/dfa.c (charclass): Wrap inside a struct.  All uses changed.
(CHARCLASS_INIT, tstbit, setbit, clrbit, zeroset, fillset, notset)
(equal, emptyset, charclass_index, setbit_wc, setbit_case_fold_c):
Adjust to this, e.g., by using charclass * rather than charclass.
All callers changed as needed.
(copyset): Remove.  All uses changed to simple assignment.
(parse_bracket_exp): Use zeroset instead of memset.
---
 ChangeLog |  15 ++++++
 lib/dfa.c | 180 ++++++++++++++++++++++++++++++--------------------------------
 2 files changed, 102 insertions(+), 93 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e41d7d8..27ee040 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2016-12-30  Paul Eggert  <address@hidden>
+
+       dfa: wrap charclass inside a struct
+       On my platform (gcc Ubuntu 5.4.0-6ubuntu1~16.04.4 x86-64,
+       en_US.utf8 locale) this makes 'grep -Fi -f list.txt list.txt >out'
+       about 5% faster, where list.txt is generated by 'aspell dump
+       master | head -n 100000 >list.txt'.  See Bug#22239.
+       * lib/dfa.c (charclass): Wrap inside a struct.  All uses changed.
+       (CHARCLASS_INIT, tstbit, setbit, clrbit, zeroset, fillset, notset)
+       (equal, emptyset, charclass_index, setbit_wc, setbit_case_fold_c):
+       Adjust to this, e.g., by using charclass * rather than charclass.
+       All callers changed as needed.
+       (copyset): Remove.  All uses changed to simple assignment.
+       (parse_bracket_exp): Use zeroset instead of memset.
+
 2016-12-30  Jim Meyering  <address@hidden>
 
        maint.mk: update list of intprops.h symbol names
diff --git a/lib/dfa.c b/lib/dfa.c
index 3e1f35d..a4ccf8c 100644
--- a/lib/dfa.c
+++ b/lib/dfa.c
@@ -87,10 +87,10 @@ enum { CHARCLASS_WORD_BITS = 64 };
 
 /* An initializer for a charclass whose 32-bit words are A through H.  */
 #define CHARCLASS_INIT(a, b, c, d, e, f, g, h)         \
-    {                                                  \
+   {{                                                  \
       CHARCLASS_PAIR (a, b), CHARCLASS_PAIR (c, d),    \
       CHARCLASS_PAIR (e, f), CHARCLASS_PAIR (g, h)     \
-    }
+   }}
 
 /* The maximum useful value of a charclass_word; all used bits are 1.  */
 static charclass_word const CHARCLASS_WORD_MASK
@@ -103,7 +103,7 @@ enum
 };
 
 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
-typedef charclass_word charclass[CHARCLASS_WORDS];
+typedef struct { charclass_word w[CHARCLASS_WORDS]; } charclass;
 
 /* Convert a possibly-signed character to an unsigned character.  This is
    a bit safer than casting to unsigned char, since it catches some type
@@ -671,69 +671,64 @@ prtok (token t)
 /* Stuff pertaining to charclasses.  */
 
 static bool
-tstbit (unsigned int b, charclass const c)
-{
-  return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
-}
-
-static void
-setbit (unsigned int b, charclass c)
+tstbit (unsigned int b, charclass const *c)
 {
-  c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
+  return c->w[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 }
 
 static void
-clrbit (unsigned int b, charclass c)
+setbit (unsigned int b, charclass *c)
 {
-  c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
-                                  << b % CHARCLASS_WORD_BITS);
+  charclass_word one = 1;
+  c->w[b / CHARCLASS_WORD_BITS] |= one << b % CHARCLASS_WORD_BITS;
 }
 
 static void
-copyset (charclass const src, charclass dst)
+clrbit (unsigned int b, charclass *c)
 {
-  memcpy (dst, src, sizeof (charclass));
+  charclass_word one = 1;
+  c->w[b / CHARCLASS_WORD_BITS] &= ~(one << b % CHARCLASS_WORD_BITS);
 }
 
 static void
-zeroset (charclass s)
+zeroset (charclass *s)
 {
-  memset (s, 0, sizeof (charclass));
+  memset (s, 0, sizeof *s);
 }
 
 static void
-fillset (charclass s)
+fillset (charclass *s)
 {
   int i;
   for (i = 0; i < CHARCLASS_WORDS; i++)
-    s[i] = CHARCLASS_WORD_MASK;
+    s->w[i] = CHARCLASS_WORD_MASK;
 }
 
 static void
-notset (charclass s)
+notset (charclass *s)
 {
   int i;
   for (i = 0; i < CHARCLASS_WORDS; ++i)
-    s[i] = CHARCLASS_WORD_MASK & ~s[i];
+    s->w[i] = CHARCLASS_WORD_MASK & ~s->w[i];
 }
 
 static bool
-equal (charclass const s1, charclass const s2)
+equal (charclass const *s1, charclass const *s2)
 {
   charclass_word w = 0;
   int i;
   for (i = 0; i < CHARCLASS_WORDS; i++)
-    w |= s1[i] ^ s2[i];
+    w |= s1->w[i] ^ s2->w[i];
   return w == 0;
 }
 
 static bool
-emptyset (charclass const s)
+emptyset (charclass const *s)
 {
   charclass_word w = 0;
   int i;
   for (i = 0; i < CHARCLASS_WORDS; i++)
-    w |= s[i];
+    w |= s->w[i];
   return w == 0;
 }
 
@@ -817,17 +812,17 @@ maybe_realloc (void *pa, ptrdiff_t i, ptrdiff_t *nitems,
 
 /* In DFA D, find the index of charclass S, or allocate a new one.  */
 static ptrdiff_t
-charclass_index (struct dfa *d, charclass const s)
+charclass_index (struct dfa *d, charclass *s)
 {
   ptrdiff_t i;
 
   for (i = 0; i < d->cindex; ++i)
-    if (equal (s, d->charclasses[i]))
+    if (equal (s, &d->charclasses[i]))
       return i;
   d->charclasses = maybe_realloc (d->charclasses, d->cindex, &d->calloc,
                                   TOKEN_MAX - CSET, sizeof *d->charclasses);
   ++d->cindex;
-  copyset (s, d->charclasses[i]);
+  d->charclasses[i] = *s;
   return i;
 }
 
@@ -853,7 +848,7 @@ char_context (struct dfa const *dfa, unsigned char c)
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
 static bool
-setbit_wc (wint_t wc, charclass c)
+setbit_wc (wint_t wc, charclass *c)
 {
   int b = wctob (wc);
   if (b == EOF)
@@ -866,7 +861,7 @@ setbit_wc (wint_t wc, charclass c)
 /* Set a bit for B and its case variants in the charclass C.
    MB_CUR_MAX must be 1.  */
 static void
-setbit_case_fold_c (int b, charclass c)
+setbit_case_fold_c (int b, charclass *c)
 {
   int ub = toupper (b);
   int i;
@@ -1021,7 +1016,7 @@ parse_bracket_exp (struct dfa *dfa)
   else
     work_mbc = NULL;
 
-  memset (ccl, 0, sizeof ccl);
+  zeroset (&ccl);
   FETCH_WC (dfa, c, wc, _("unbalanced ["));
   if (c == '^')
     {
@@ -1087,7 +1082,7 @@ parse_bracket_exp (struct dfa *dfa)
                   else
                     for (c2 = 0; c2 < NOTCHAR; ++c2)
                       if (pred->func (c2))
-                        setbit (c2, ccl);
+                        setbit (c2, &ccl);
                 }
               else
                 known_bracket_exp = false;
@@ -1148,7 +1143,7 @@ parse_bracket_exp (struct dfa *dfa)
                     {
                       int ci;
                       for (ci = c; ci <= c2; ci++)
-                        setbit (ci, ccl);
+                        setbit (ci, &ccl);
                       if (dfa->syntax.case_fold)
                         {
                           int uc = toupper (c);
@@ -1157,7 +1152,7 @@ parse_bracket_exp (struct dfa *dfa)
                             {
                               int uci = toupper (ci);
                               if (uc <= uci && uci <= uc2)
-                                setbit (ci, ccl);
+                                setbit (ci, &ccl);
                             }
                         }
                     }
@@ -1174,9 +1169,9 @@ parse_bracket_exp (struct dfa *dfa)
       if (!dfa->localeinfo.multibyte)
         {
           if (dfa->syntax.case_fold)
-            setbit_case_fold_c (c, ccl);
+            setbit_case_fold_c (c, &ccl);
           else
-            setbit (c, ccl);
+            setbit (c, &ccl);
           continue;
         }
 
@@ -1191,7 +1186,7 @@ parse_bracket_exp (struct dfa *dfa)
                             : 1);
           folded[0] = wc;
           for (i = 0; i < n; i++)
-            if (!setbit_wc (folded[i], ccl))
+            if (!setbit_wc (folded[i], &ccl))
               {
                 work_mbc->chars
                   = maybe_realloc (work_mbc->chars, work_mbc->nchars,
@@ -1211,19 +1206,19 @@ parse_bracket_exp (struct dfa *dfa)
   if (dfa->localeinfo.multibyte)
     {
       work_mbc->invert = invert;
-      work_mbc->cset = emptyset (ccl) ? -1 : charclass_index (dfa, ccl);
+      work_mbc->cset = emptyset (&ccl) ? -1 : charclass_index (dfa, &ccl);
       return MBCSET;
     }
 
   if (invert)
     {
       assert (!dfa->localeinfo.multibyte);
-      notset (ccl);
+      notset (&ccl);
       if (dfa->syntax.syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
-        clrbit ('\n', ccl);
+        clrbit ('\n', &ccl);
     }
 
-  return CSET + charclass_index (dfa, ccl);
+  return CSET + charclass_index (dfa, &ccl);
 }
 
 struct lexptr
@@ -1480,16 +1475,16 @@ lex (struct dfa *dfa)
             goto normal_char;
           if (dfa->canychar == (size_t) -1)
             {
-              fillset (ccl);
+              fillset (&ccl);
               if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
-                clrbit ('\n', ccl);
+                clrbit ('\n', &ccl);
               if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
-                clrbit ('\0', ccl);
+                clrbit ('\0', &ccl);
               if (dfa->localeinfo.multibyte)
                 for (c2 = 0; c2 < NOTCHAR; c2++)
                   if (dfa->localeinfo.sbctowc[c2] == WEOF)
-                    clrbit (c2, ccl);
-              dfa->canychar = charclass_index (dfa, ccl);
+                    clrbit (c2, &ccl);
+              dfa->canychar = charclass_index (dfa, &ccl);
             }
           dfa->lex.laststart = false;
           return dfa->lex.lasttok = (dfa->localeinfo.multibyte
@@ -1502,14 +1497,14 @@ lex (struct dfa *dfa)
             goto normal_char;
           if (!dfa->localeinfo.multibyte)
             {
-              zeroset (ccl);
+              zeroset (&ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
                 if (isspace (c2))
-                  setbit (c2, ccl);
+                  setbit (c2, &ccl);
               if (c == 'S')
-                notset (ccl);
+                notset (&ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1535,14 +1530,14 @@ lex (struct dfa *dfa)
 
           if (!dfa->localeinfo.multibyte)
             {
-              zeroset (ccl);
+              zeroset (&ccl);
               for (c2 = 0; c2 < NOTCHAR; ++c2)
                 if (dfa->syntax.sbit[c2] == CTX_LETTER)
-                  setbit (c2, ccl);
+                  setbit (c2, &ccl);
               if (c == 'W')
-                notset (ccl);
+                notset (&ccl);
               dfa->lex.laststart = false;
-              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
             }
 
           /* FIXME: see if optimizing this, as is done with ANYCHAR and
@@ -1577,9 +1572,9 @@ lex (struct dfa *dfa)
 
           if (dfa->syntax.case_fold && isalpha (c))
             {
-              zeroset (ccl);
-              setbit_case_fold_c (c, ccl);
-              return dfa->lex.lasttok = CSET + charclass_index (dfa, ccl);
+              zeroset (&ccl);
+              setbit_case_fold_c (c, &ccl);
+              return dfa->lex.lasttok = CSET + charclass_index (dfa, &ccl);
             }
 
           return dfa->lex.lasttok = c;
@@ -1730,16 +1725,15 @@ add_utf8_anychar (struct dfa *dfa)
   if (dfa->utf8_anychar_classes[0] == 0)
     for (i = 0; i < n; i++)
       {
-        charclass c;
-        copyset (utf8_classes[i], c);
+        charclass c = utf8_classes[i];
         if (i == 1)
           {
             if (!(dfa->syntax.syntax_bits & RE_DOT_NEWLINE))
-              clrbit ('\n', c);
+              clrbit ('\n', &c);
             if (dfa->syntax.syntax_bits & RE_DOT_NOT_NULL)
-              clrbit ('\0', c);
+              clrbit ('\0', &c);
           }
-        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, c);
+        dfa->utf8_anychar_classes[i] = CSET + charclass_index (dfa, &c);
       }
 
   /* A valid UTF-8 character is
@@ -2278,18 +2272,18 @@ epsclosure (position_set *initial, struct dfa const *d)
    character included in C.  */
 
 static int
-charclass_context (struct dfa const *dfa, charclass c)
+charclass_context (struct dfa const *dfa, charclass const *c)
 {
   int context = 0;
   unsigned int j;
 
   for (j = 0; j < CHARCLASS_WORDS; ++j)
     {
-      if (c[j] & dfa->syntax.newline[j])
+      if (c->w[j] & dfa->syntax.newline.w[j])
         context |= CTX_NEWLINE;
-      if (c[j] & dfa->syntax.letters[j])
+      if (c->w[j] & dfa->syntax.letters.w[j])
         context |= CTX_LETTER;
-      if (c[j] & ~(dfa->syntax.letters[j] | dfa->syntax.newline[j]))
+      if (c->w[j] & ~(dfa->syntax.letters.w[j] | dfa->syntax.newline.w[j]))
         context |= CTX_NONE;
     }
 
@@ -2631,7 +2625,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
   group.elems = xnmalloc (d->nleaves, sizeof *group.elems);
   group.nelem = 0;
 
-  fillset (label);
+  fillset (&label);
 
   for (i = 0; i < d->states[s].elems.nelem; ++i)
     {
@@ -2640,21 +2634,21 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
       bool matched = false;
       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
         {
-          zeroset (matches);
-          setbit (d->tokens[pos.index], matches);
+          zeroset (&matches);
+          setbit (d->tokens[pos.index], &matches);
           if (d->tokens[pos.index] == uc)
             matched = true;
         }
       else if (d->tokens[pos.index] >= CSET)
         {
-          copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-          if (tstbit (uc, d->charclasses[d->tokens[pos.index] - CSET]))
+          matches = d->charclasses[d->tokens[pos.index] - CSET];
+          if (tstbit (uc, &d->charclasses[d->tokens[pos.index] - CSET]))
             matched = true;
         }
-       else if (d->tokens[pos.index] == ANYCHAR)
-         {
-          copyset (d->charclasses[d->canychar], matches);
-          if (tstbit (uc, d->charclasses[d->canychar]))
+      else if (d->tokens[pos.index] == ANYCHAR)
+        {
+          matches = d->charclasses[d->canychar];
+          if (tstbit (uc, &d->charclasses[d->canychar]))
             matched = true;
 
           /* ANYCHAR must match with a single character, so we must put
@@ -2683,18 +2677,18 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NEWLINE))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= ~d->syntax.newline[j];
+              matches.w[j] &= ~d->syntax.newline.w[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_LETTER))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= ~d->syntax.letters[j];
+              matches.w[j] &= ~d->syntax.letters.w[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NONE))
             for (j = 0; j < CHARCLASS_WORDS; ++j)
-              matches[j] &= d->syntax.letters[j] | d->syntax.newline[j];
+              matches.w[j] &= d->syntax.letters.w[j] | d->syntax.newline.w[j];
 
           /* If there are no characters left, there's no point in going on.  */
-          for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
+          for (j = 0; j < CHARCLASS_WORDS && !matches.w[j]; j++)
             continue;
           if (j == CHARCLASS_WORDS)
             continue;
@@ -2702,7 +2696,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
           /* If we have reset the bit that made us declare "matched", reset
              that indicator, too.  This is required to avoid an infinite loop
              with this command: echo cx | LC_ALL=C grep -E 'c\b[x ]'  */
-          if (!tstbit (uc, matches))
+          if (!tstbit (uc, &matches))
             matched = false;
         }
 
@@ -2711,7 +2705,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
       prtok (d->tokens[pos.index]);
       fprintf (stderr, " of");
       for (j = 0; j < NOTCHAR; j++)
-        if (tstbit (j, matches))
+        if (tstbit (j, &matches))
           fprintf (stderr, " 0x%02zx", j);
       fprintf (stderr, "\n");
 #endif
@@ -2719,13 +2713,13 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
       if (matched)
         {
           for (k = 0; k < CHARCLASS_WORDS; ++k)
-            label[k] &= matches[k];
+            label.w[k] &= matches.w[k];
           group.elems[group.nelem++] = pos.index;
         }
       else
         {
           for (k = 0; k < CHARCLASS_WORDS; ++k)
-            label[k] &= ~matches[k];
+            label.w[k] &= ~matches.w[k];
         }
     }
 
@@ -2778,7 +2772,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
         }
 
       /* Find out if the new state will want any context information.  */
-      possible_contexts = charclass_context (d, label);
+      possible_contexts = charclass_context (d, &label);
       separate_contexts = state_separate_contexts (&follows);
 
       /* Find the state(s) corresponding to the union of the follows.  */
@@ -2814,7 +2808,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
 
   /* Set the transitions for each character in the label.  */
   for (i = 0; i < NOTCHAR; i++)
-    if (tstbit (i, label))
+    if (tstbit (i, &label))
       switch (d->syntax.sbit[i])
         {
         case CTX_NEWLINE:
@@ -2845,7 +2839,7 @@ dfastate (state_num s, struct dfa *d, unsigned char uc, 
state_num trans[])
 
   /* Keep the newline transition in a special place so we can use it as
      a sentinel.  */
-  if (tstbit (d->syntax.eolbyte, label))
+  if (tstbit (d->syntax.eolbyte, &label))
     {
       d->newlines[s] = trans[d->syntax.eolbyte];
       trans[d->syntax.eolbyte] = -1;
@@ -3466,8 +3460,8 @@ dfassbuild (struct dfa *d)
         case BACKREF:
           {
             charclass ccl;
-            fillset (ccl);
-            sup->tokens[j++] = CSET + charclass_index (sup, ccl);
+            fillset (&ccl);
+            sup->tokens[j++] = CSET + charclass_index (sup, &ccl);
             sup->tokens[j++] = STAR;
             if (d->tokens[i + 1] == QMARK || d->tokens[i + 1] == STAR
                 || d->tokens[i + 1] == PLUS)
@@ -3984,7 +3978,7 @@ dfamust (struct dfa const *d)
               charclass *ccl = &d->charclasses[t - CSET];
               int j;
               for (j = 0; j < NOTCHAR; j++)
-                if (tstbit (j, *ccl))
+                if (tstbit (j, ccl))
                   break;
               if (! (j < NOTCHAR))
                 {
@@ -3993,7 +3987,7 @@ dfamust (struct dfa const *d)
                 }
               t = j;
               while (++j < NOTCHAR)
-                if (tstbit (j, *ccl)
+                if (tstbit (j, ccl)
                     && ! (case_fold_unibyte
                           && toupper (j) == toupper (t)))
                   break;
@@ -4095,10 +4089,10 @@ dfasyntax (struct dfa *dfa, struct localeinfo const 
*linfo,
       switch (dfa->syntax.sbit[uc])
         {
         case CTX_LETTER:
-          setbit (uc, dfa->syntax.letters);
+          setbit (uc, &dfa->syntax.letters);
           break;
         case CTX_NEWLINE:
-          setbit (uc, dfa->syntax.newline);
+          setbit (uc, &dfa->syntax.newline);
           break;
         }
 
-- 
2.7.4
[Prev in Thread]
Current Thread
[Next in Thread]
[PATCH 1/4] dfa: wrap charclass inside a struct, Paul Eggert <=
- [PATCH 4/4] dfa: shorten sbit, success, Paul Eggert, 2016/12/30
- [PATCH 2/4] dfa: minor performance tweak, Paul Eggert, 2016/12/30
- [PATCH 3/4] dfa: simplify multibyte_prop etc., Paul Eggert, 2016/12/30
Prev by Date: [PATCH 3/4] dfa: simplify multibyte_prop etc.
Next by Date: Re: stdint broken on mingw
Previous by thread: avoid new "syntax-check" failure
Next by thread: [PATCH 4/4] dfa: shorten sbit, success
Index(es):
- Date
- Thread