[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: speed up u8_strstr
From: |
Bruno Haible |
Subject: |
Re: speed up u8_strstr |
Date: |
Fri, 21 Jan 2011 14:08:32 +0100 |
User-agent: |
KMail/1.9.9 |
> u16_strstr and
> u32_strstr should have all the properties that strstr and mbsstr have. So the
> right thing to do is to adapt all of tests/test-strstr.c into
> tests/unistr/test-u-strstr.h. I've done this, and now I get test failures:
>
> /bin/sh: line 5: 24308 Alarm clock EXEEXT='' srcdir='.'
> MAKE='make' ${dir}$tst
> FAIL: test-u16-strstr
> /bin/sh: line 5: 24325 Alarm clock EXEEXT='' srcdir='.'
> MAKE='make' ${dir}$tst
> FAIL: test-u32-strstr
This required a change of the algorithm in lib/unistr/u-strstr.h. Once the tests
pass, I'm committing this combined patch:
2011-01-21 Pádraig Brady <address@hidden>
Bruno Haible <address@hidden>
Make uN_strstr functions O(n) worst-case.
* lib/unistr/u-strstr.h (FUNC): In the 8-bit case, use strstr. In the
16-bit and 32-bit unit cases, use the unibyte algorithm from
lib/mbsstr.c.
* lib/unistr/u8-strstr.c: Include <string.h>.
(UNIT_IS_UINT8_T): New macro.
* lib/unistr/u16-strstr.c: Include malloca.h and str-kmp.h.
(U_STRLEN, U_STRNLEN): New macros.
* lib/unistr/u32-strstr.c: Include malloca.h and str-kmp.h.
(U_STRLEN, U_STRNLEN): New macros.
* modules/unistr/u8-strstr (Depends-on): Add strstr.
(configure.ac): Update required libunistring version.
* modules/unistr/u16-strstr (Files): Add lib/str-kmp.h.
(Depends-on): Add unistr/u16-strlen, unistr/u16-strnlen, stdbool,
malloca.
(configure.ac): Update required libunistring version.
* modules/unistr/u32-strstr (Files): Add lib/str-kmp.h.
(Depends-on): Add unistr/u32-strlen, unistr/u32-strnlen, stdbool,
malloca.
(configure.ac): Update required libunistring version.
--- lib/unistr/u-strstr.h.orig Fri Jan 21 13:56:17 2011
+++ lib/unistr/u-strstr.h Fri Jan 21 13:54:33 2011
@@ -1,6 +1,6 @@
/* Substring test for UTF-8/UTF-16/UTF-32 strings.
Copyright (C) 1999, 2002, 2006, 2010-2011 Free Software Foundation, Inc.
- Written by Bruno Haible <address@hidden>, 2002.
+ Written by Bruno Haible <address@hidden>, 2002, 2005.
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
@@ -38,22 +38,94 @@
}
#endif
- /* Search for needle's first unit. */
- for (; *haystack != 0; haystack++)
- if (*haystack == first)
+#if UNIT_IS_UINT8_T
+ return (uint8_t *) strstr ((const char *) haystack, (const char *) needle);
+#else
+ {
+ /* Minimizing the worst-case complexity:
+ Let n = U_STRLEN(haystack), m = U_STRLEN(needle).
+ The naïve algorithm is O(n*m) worst-case.
+ The Knuth-Morris-Pratt algorithm is O(n) worst-case but it needs a
+ memory allocation.
+ To achieve linear complexity and yet amortize the cost of the
+ memory allocation, we activate the Knuth-Morris-Pratt algorithm
+ only once the naïve algorithm has already run for some time; more
+ precisely, when
+ - the outer loop count is >= 10,
+ - the average number of comparisons per outer loop is >= 5,
+ - the total number of comparisons is >= m.
+ But we try it only once. If the memory allocation attempt failed,
+ we don't retry it. */
+ bool try_kmp = true;
+ size_t outer_loop_count = 0;
+ size_t comparison_count = 0;
+ size_t last_ccount = 0; /* last comparison count */
+ const UNIT *needle_last_ccount = needle; /* = needle + last_ccount */
+
+ /* Speed up the following searches of needle by caching its first
+ character. */
+ UNIT b = *needle++;
+
+ for (;; haystack++)
{
- /* Compare with needle's remaining units. */
- const UNIT *hptr = haystack + 1;
- const UNIT *nptr = needle + 1;
- for (;;)
+ if (*haystack == 0)
+ /* No match. */
+ return NULL;
+
+ /* See whether it's advisable to use an asymptotically faster
+ algorithm. */
+ if (try_kmp
+ && outer_loop_count >= 10
+ && comparison_count >= 5 * outer_loop_count)
{
- if (*hptr != *nptr)
- break;
- hptr++; nptr++;
- if (*nptr == 0)
- return (UNIT *) haystack;
+ /* See if needle + comparison_count now reaches the end of
+ needle. */
+ if (needle_last_ccount != NULL)
+ {
+ needle_last_ccount +=
+ U_STRNLEN (needle_last_ccount,
+ comparison_count - last_ccount);
+ if (*needle_last_ccount == 0)
+ needle_last_ccount = NULL;
+ last_ccount = comparison_count;
+ }
+ if (needle_last_ccount == NULL)
+ {
+ /* Try the Knuth-Morris-Pratt algorithm. */
+ const UNIT *result;
+ bool success =
+ knuth_morris_pratt (haystack,
+ needle - 1, U_STRLEN (needle - 1),
+ &result);
+ if (success)
+ return (UNIT *) result;
+ try_kmp = false;
+ }
}
- }
- return NULL;
+ outer_loop_count++;
+ comparison_count++;
+ if (*haystack == b)
+ /* The first character matches. */
+ {
+ const UNIT *rhaystack = haystack + 1;
+ const UNIT *rneedle = needle;
+
+ for (;; rhaystack++, rneedle++)
+ {
+ if (*rneedle == 0)
+ /* Found a match. */
+ return (UNIT *) haystack;
+ if (*rhaystack == 0)
+ /* No match. */
+ return NULL;
+ comparison_count++;
+ if (*rhaystack != *rneedle)
+ /* Nothing in this round. */
+ break;
+ }
+ }
+ }
+ }
+#endif
}
--- lib/unistr/u8-strstr.c.orig Fri Jan 21 13:56:17 2011
+++ lib/unistr/u8-strstr.c Fri Jan 21 13:05:06 2011
@@ -20,10 +20,13 @@
/* Specification. */
#include "unistr.h"
+#include <string.h>
+
/* FIXME: Maybe walking the string via u8_mblen is a win? */
#define FUNC u8_strstr
#define UNIT uint8_t
#define U_STRCHR u8_strchr
#define U_STRMBTOUC u8_strmbtouc
+#define UNIT_IS_UINT8_T 1
#include "u-strstr.h"
--- lib/unistr/u16-strstr.c.orig Fri Jan 21 13:56:17 2011
+++ lib/unistr/u16-strstr.c Fri Jan 21 13:51:24 2011
@@ -20,10 +20,18 @@
/* Specification. */
#include "unistr.h"
+#include "malloca.h"
+
/* FIXME: Maybe walking the string via u16_mblen is a win? */
-#define FUNC u16_strstr
#define UNIT uint16_t
+
+#define CANON_ELEMENT(c) c
+#include "str-kmp.h"
+
+#define FUNC u16_strstr
#define U_STRCHR u16_strchr
#define U_STRMBTOUC u16_strmbtouc
+#define U_STRLEN u16_strlen
+#define U_STRNLEN u16_strnlen
#include "u-strstr.h"
--- lib/unistr/u32-strstr.c.orig Fri Jan 21 13:56:17 2011
+++ lib/unistr/u32-strstr.c Fri Jan 21 13:51:16 2011
@@ -20,7 +20,15 @@
/* Specification. */
#include "unistr.h"
-#define FUNC u32_strstr
+#include "malloca.h"
+
#define UNIT uint32_t
+
+#define CANON_ELEMENT(c) c
+#include "str-kmp.h"
+
+#define FUNC u32_strstr
#define U_STRCHR u32_strchr
+#define U_STRLEN u32_strlen
+#define U_STRNLEN u32_strnlen
#include "u-strstr.h"
--- modules/unistr/u8-strstr.orig Fri Jan 21 13:56:17 2011
+++ modules/unistr/u8-strstr Fri Jan 21 13:05:06 2011
@@ -9,9 +9,10 @@
unistr/base
unistr/u8-strchr
unistr/u8-strmbtouc
+strstr
configure.ac:
-gl_LIBUNISTRING_MODULE([0.9], [unistr/u8-strstr])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u8-strstr])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U8_STRSTR
--- modules/unistr/u16-strstr.orig Fri Jan 21 13:56:17 2011
+++ modules/unistr/u16-strstr Fri Jan 21 13:50:49 2011
@@ -4,14 +4,19 @@
Files:
lib/unistr/u16-strstr.c
lib/unistr/u-strstr.h
+lib/str-kmp.h
Depends-on:
unistr/base
unistr/u16-strchr
unistr/u16-strmbtouc
+unistr/u16-strlen
+unistr/u16-strnlen
+stdbool
+malloca
configure.ac:
-gl_LIBUNISTRING_MODULE([0.9], [unistr/u16-strstr])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u16-strstr])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U16_STRSTR
--- modules/unistr/u32-strstr.orig Fri Jan 21 13:56:17 2011
+++ modules/unistr/u32-strstr Fri Jan 21 13:50:56 2011
@@ -4,13 +4,18 @@
Files:
lib/unistr/u32-strstr.c
lib/unistr/u-strstr.h
+lib/str-kmp.h
Depends-on:
unistr/base
unistr/u32-strchr
+unistr/u32-strlen
+unistr/u32-strnlen
+stdbool
+malloca
configure.ac:
-gl_LIBUNISTRING_MODULE([0.9], [unistr/u32-strstr])
+gl_LIBUNISTRING_MODULE([0.9.4], [unistr/u32-strstr])
Makefile.am:
if LIBUNISTRING_COMPILE_UNISTR_U32_STRSTR