From 740048e66e7c55a8e42f4f7e4c24256a61506f70 Mon Sep 17 00:00:00 2001
From: Paul Eggert
Date: Fri, 23 Dec 2016 12:25:24 -0800
Subject: [PATCH 4/8] grep: specialize word-finding functions
This improves performance a bit.
* src/dfasearch.c, src/kwsearch.c (wordchar):
Remove; now in searchutils.c.
* src/grep.c (main): Call wordinit if -w.
* src/search.h: Adjust.
* src/searchutils.c: Include verify.h.
(word_start): New static var.
(wordchar): Move here from dfasearch.c and kwsearch.c.
(wordinit, wordchars_count, wordchar_next, wordchar_prev):
New functions.
(mb_prev_wc, mb_next_wc): Remove.
All callers changed to use the new functions instead.
---
src/dfasearch.c | 11 ++-----
src/grep.c | 1 +
src/kwsearch.c | 11 ++-----
src/search.h | 5 +--
src/searchutils.c | 91 +++++++++++++++++++++++++++++++++++++++++++------------
5 files changed, 80 insertions(+), 39 deletions(-)
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 24a36cd..87e1f7e 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -26,13 +26,6 @@
struct localeinfo localeinfo;
-/* Whether -w considers WC to be a word constituent. */
-static bool
-wordchar (wint_t wc)
-{
- return wc == L'_' || iswalnum (wc);
-}
-
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
@@ -394,8 +387,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
while (match <= best_match)
{
regoff_t shorter_len = 0;
- if (!wordchar (mb_prev_wc (beg, match, end - 1))
- && !wordchar (mb_next_wc (match + len, end - 1)))
+ if (! wordchar_next (match + len, end - 1)
+ && ! wordchar_prev (beg, match, end - 1))
goto assess_pattern_match;
if (len > 0)
{
diff --git a/src/grep.c b/src/grep.c
index 3729ae0..f9d1d86 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -2651,6 +2651,7 @@ main (int argc, char **argv)
break;
case 'w':
+ wordinit ();
match_words = true;
break;
diff --git a/src/kwsearch.c b/src/kwsearch.c
index 5596ebd..b30dfd0 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,13 +21,6 @@
#include
#include "search.h"
-/* Whether -w considers WC to be a word constituent. */
-static bool
-wordchar (wint_t wc)
-{
- return wc == L'_' || iswalnum (wc);
-}
-
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
@@ -140,10 +133,10 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
char const *bol = memrchr (mb_start, eol, beg - mb_start);
if (bol)
mb_start = bol + 1;
- if (! wordchar (mb_prev_wc (mb_start, beg, buf + size)))
+ if (! wordchar_prev (mb_start, beg, buf + size))
for (;;)
{
- if (! wordchar (mb_next_wc (beg + len, buf + size)))
+ if (! wordchar_next (beg + len, buf + size))
{
if (start_ptr)
goto success_in_beg_and_len;
diff --git a/src/search.h b/src/search.h
index 1ff5be2..6fe1797 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,10 +46,11 @@ _GL_INLINE_HEADER_BEGIN
typedef signed char mb_len_map_t;
/* searchutils.c */
+extern void wordinit (void);
extern kwset_t kwsinit (bool);
+extern size_t wordchar_next (char const *, char const *);
+extern bool wordchar_prev (char const *, char const *, char const *);
extern ptrdiff_t mb_goback (char const **, char const *, char const *);
-extern wint_t mb_prev_wc (char const *, char const *, char const *);
-extern wint_t mb_next_wc (char const *, char const *);
/* dfasearch.c */
extern struct localeinfo localeinfo;
diff --git a/src/searchutils.c b/src/searchutils.c
index deaab60..e0a1db3 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -22,6 +22,30 @@
#define SYSTEM_INLINE _GL_EXTERN_INLINE
#include "search.h"
+#include <verify.h>
+
+/* For each byte B, word_start[B] is 1 if B is a single-byte character
+ that is a word constituent, 0 if B cannot start a word constituent,
+ and WEOF if B might or might not be the start of a multibyte word
+ constituent (so the character must be decoded to find out). */
+static wint_t word_start[NCHAR];
+verify (WEOF != 0 && WEOF != 1);
+
+/* Whether -w considers WC to be a word constituent. */
+static bool
+wordchar (wint_t wc)
+{
+ return wc == L'_' || iswalnum (wc);
+}
+
+void
+wordinit (void)
+{
+ for (int i = 0; i < NCHAR; i++)
+ word_start[i] = (localeinfo.sbclen[i] == -2 ? WEOF
+ : wordchar (localeinfo.sbctowc[i]));
+}
+
kwset_t
kwsinit (bool mb_trans)
{
@@ -93,27 +117,56 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
return p == cur ? 0 : cur - p0;
}
-/* In the buffer BUF, return the wide character that is encoded just
- before CUR. The buffer ends at END. Return WEOF if there is no
- wide character just before CUR. */
-wint_t
-mb_prev_wc (char const *buf, char const *cur, char const *end)
+/* Examine the start of BUF (which ends at END) for word constituents.
+ If COUNTALL, examine as many as possible; otherwise, examine at most one.
+ Return the total number of bytes in the word constituents found. */
+static size_t
+wordchars_count (char const *buf, char const *end, bool countall)
{
- if (cur == buf)
- return WEOF;
- char const *p = buf;
- cur--;
- cur -= mb_goback (&p, cur, end);
- return mb_next_wc (cur, end);
+ size_t n = 0;
+ mbstate_t mbs = { 0 };
+ while (n < end - buf)
+ {
+ wint_t ws = word_start[to_uchar (buf[n])];
+ if (ws == 0)
+ break;
+ else if (ws == 1)
+ n++;
+ else
+ {
+ wchar_t wc = 0;
+ size_t wcbytes = mbrtowc (&wc, buf + n, end - buf - n, &mbs);
+ if (!wordchar (wc))
+ break;
+ n += wcbytes + !wcbytes;
+ }
+ if (!countall)
+ break;
+ }
+ return n;
}
-/* Return the wide character that is encoded at CUR. The buffer ends
- at END. Return WEOF if there is no wide character encoded at CUR. */
-wint_t
-mb_next_wc (char const *cur, char const *end)
+/* If BUF starts with a word constituent, return the number of bytes
+ used to represent it; otherwise, return zero. The buffer ends at END. */
+size_t
+wordchar_next (char const *buf, char const *end)
{
- wchar_t wc;
- mbstate_t mbs = { 0 };
- return (end - cur != 0 && mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2
- ? wc : WEOF);
+ return wordchars_count (buf, end, false);
+}
+
+/* In the buffer BUF, return true if the character whose encoding
+ contains the byte before CUR is a word constituent. The buffer
+ ends at END. */
+bool
+wordchar_prev (char const *buf, char const *cur, char const *end)
+{
+ if (buf == cur)
+ return false;
+ cur--;
+ wint_t ws = word_start[to_uchar (*cur)];
+ if (! localeinfo.multibyte)
+ return ws == 1;
+ char const *p = buf;
+ cur -= mb_goback (&p, cur, end);
+ return wordchar_next (cur, end) != 0;
}
--
2.7.4