From 58bf6894293ee52145ebe5223acd685ef25f744f Mon Sep 17 00:00:00 2001 From: Norihiro Tanaka Date: Mon, 15 Dec 2014 23:40:17 +0900 Subject: [PATCH] dfa: improvement for checking of multibyte character boundary When we find a single byte that cannot occur inside a multibyte character, we can skip the check for a multibyte character boundary before that byte. The improvement speeds up matching by about 40% for an input string which doesn't match even the first part of a pattern. * src/dfa.c (always_single_byte): Add a new variable. It caches whether each byte can occur inside a multibyte character or not. (dfaalwayssb): Add a new function. (dfacomp): Use it. (skip_remains_mb): If the input byte is a single byte that cannot occur inside a multibyte character, skip the check for a multibyte character boundary up to that byte. --- src/dfa.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/dfa.c b/src/dfa.c index 806cb04..059c5b2 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -451,6 +451,18 @@ struct dfa static void dfamust (struct dfa *dfa); static void regexp (void); +/* True if each byte can not occur inside a multibyte character */ +static bool always_single_byte[NOTCHAR]; + +static void +dfaalwayssb (void) +{ + size_t i; + unsigned char const uc[] = { '\0', '\n', '\r', '.', '/' }; + for (i = 0; i < sizeof uc / sizeof uc[0]; ++i) + always_single_byte[uc[i]] = true; +} + static void dfambcache (struct dfa *d) { @@ -3279,6 +3291,8 @@ skip_remains_mb (struct dfa *d, unsigned char const *p, unsigned char const *mbp, char const *end, wint_t *wcp) { wint_t wc = WEOF; + if (always_single_byte[*p]) + return p; while (mbp < p) mbp += mbs_to_wchar (&wc, (char const *) mbp, end - (char const *) mbp, d); @@ -3713,6 +3727,7 @@ void dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { dfainit (d); + dfaalwayssb (); dfambcache (d); dfaparse (s, len, d); dfamust (d); -- 2.2.0