bug-gnulib
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] New function fstrcmp_if_higher.


From: Bruno Haible
Subject: Re: [PATCH] New function fstrcmp_if_higher.
Date: Sun, 14 Sep 2008 21:16:32 +0200
User-agent: KMail/1.5.4

Hi Ralf,

An excellent patch! I measure a speedup of "msgmerge af.po coreutils.pot"
from 376 sec. to 152 sec.

I changed your patch a bit
  1. to avoid new function calls - more inlining.
  2. to include a formal proof of the upper bound.
  3. To leave the modified fstrcmp function the freedom what to return
     when the result is known to be < bound - either 0 or the original
     result. This saves a few instructions at the end.
I also changed the test code to be easier to read. Applied this:


2008-09-14  Ralf Wildenhues  <address@hidden>

        * lib/fstrcmp.h (fstrcmp_bounded): New declaration.
        (fstrcmp): Define in terms of fstrcmp_bounded.
        * lib/fstrcmp.c (fstrcmp_bounded): Renamed from fstrcmp. Add 
lower_bound argument.
        Return quickly if the result is certainly < lower_bound.
        * tests/test-fstrcmp.c (check_fstrcmp): Test also fstrcmp_bounded.

*** lib/fstrcmp.h.orig  2008-09-14 21:04:36.000000000 +0200
--- lib/fstrcmp.h       2008-09-14 19:03:36.000000000 +0200
***************
*** 1,5 ****
  /* Fuzzy string comparison.
!    Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc.
  
     This file was written by Peter Miller <address@hidden>
  
--- 1,6 ----
  /* Fuzzy string comparison.
!    Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software
!    Foundation, Inc.
  
     This file was written by Peter Miller <address@hidden>
  
***************
*** 24,32 ****
  #endif
  
  /* Fuzzy compare of S1 and S2.  Return a measure for the similarity of S1
!    and S1.  The higher the result, the more similar the strings are.  */
  extern double fstrcmp (const char *s1, const char *s2);
  
  #ifdef __cplusplus
  }
  #endif
--- 25,43 ----
  #endif
  
  /* Fuzzy compare of S1 and S2.  Return a measure for the similarity of S1
!    and S1.  The higher the result, the more similar the strings are.
!    The result is bounded between 0 (meaning very dissimilar strings) and
!    1 (meaning identical strings).  */
  extern double fstrcmp (const char *s1, const char *s2);
  
+ /* Like fstrcmp (S1, S2), except that if the result is < LOWER_BOUND, an
+    arbitrary other value < LOWER_BOUND can be returned.  */
+ extern double fstrcmp_bounded (const char *s1, const char *s2,
+                              double lower_bound);
+ 
+ /* A shortcut for fstrcmp.  Avoids a function call.  */
+ #define fstrcmp(s1,s2) fstrcmp_bounded (s1, s2, 0.0)
+ 
  #ifdef __cplusplus
  }
  #endif
*** lib/fstrcmp.c.orig  2008-09-14 21:04:36.000000000 +0200
--- lib/fstrcmp.c       2008-09-14 20:59:40.000000000 +0200
***************
*** 97,142 ****
  gl_once_define(static, keys_init_once)
  
  
- /* NAME
-       fstrcmp - fuzzy string compare
- 
-    SYNOPSIS
-       double fstrcmp(const char *, const char *);
- 
-    DESCRIPTION
-       The fstrcmp function may be used to compare two string for
-       similarity.  It is very useful in reducing "cascade" or
-       "secondary" errors in compilers or other situations where
-       symbol tables occur.
- 
-    RETURNS
-       double; 0 if the strings are entirly dissimilar, 1 if the
-       strings are identical, and a number in between if they are
-       similar.  */
- 
  double
! fstrcmp (const char *string1, const char *string2)
  {
    struct context ctxt;
!   int xvec_length;
!   int yvec_length;
    int i;
  
    size_t fdiag_len;
    int *buffer;
    size_t bufmax;
  
    /* set the info for each string.  */
    ctxt.xvec = string1;
-   xvec_length = strlen (string1);
    ctxt.yvec = string2;
-   yvec_length = strlen (string2);
- 
-   /* short-circuit obvious comparisons */
-   if (xvec_length == 0 && yvec_length == 0)
-     return 1.0;
-   if (xvec_length == 0 || yvec_length == 0)
-     return 0.0;
  
    /* Set TOO_EXPENSIVE to be approximate square root of input size,
       bounded below by 256.  */
--- 97,148 ----
  gl_once_define(static, keys_init_once)
  
  
  double
! fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
  {
    struct context ctxt;
!   int xvec_length = strlen (string1);
!   int yvec_length = strlen (string2);
    int i;
  
    size_t fdiag_len;
    int *buffer;
    size_t bufmax;
  
+   /* short-circuit obvious comparisons */
+   if (xvec_length == 0 || yvec_length == 0)
+     return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);
+ 
+   if (lower_bound > 0)
+     {
+       /* Compute a quick upper bound.
+        Each edit is an insertion or deletion of an element, hence modifies
+        the length of the sequence by at most 1.
+        Therefore, when starting from a sequence X and ending at a sequence Y,
+        with N edits,  | yvec_length - xvec_length | <= N.  (Proof by
+        induction over N.)
+        So, at the end, we will have
+          xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |.
+        and hence
+          result
+            = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count))
+              / (xvec_length + yvec_length)
+            <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
+               / (xvec_length + yvec_length)
+            = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
+        */
+       volatile double upper_bound =
+       (double) (2 * MIN (xvec_length, yvec_length))
+       / (xvec_length + yvec_length);
+ 
+       if (upper_bound < lower_bound)
+       /* Return an arbitrary value < LOWER_BOUND.  */
+       return 0.0;
+     }
+ 
    /* set the info for each string.  */
    ctxt.xvec = string1;
    ctxt.yvec = string2;
  
    /* Set TOO_EXPENSIVE to be approximate square root of input size,
       bounded below by 256.  */
*** tests/test-fstrcmp.c.orig   2008-09-14 21:04:36.000000000 +0200
--- tests/test-fstrcmp.c        2008-09-14 19:00:42.000000000 +0200
***************
*** 45,52 ****
       compliant by default, to avoid that msgmerge results become platform and
       compiler option dependent.  'volatile' is a portable alternative to gcc's
       -ffloat-store option.  */
!   volatile double result = fstrcmp (string1, string2);
!   return (result == expected);
  }
  
  int
--- 45,77 ----
       compliant by default, to avoid that msgmerge results become platform and
       compiler option dependent.  'volatile' is a portable alternative to gcc's
       -ffloat-store option.  */
!   {
!     volatile double result = fstrcmp (string1, string2);
!     if (!(result == expected))
!       return false;
!   }
!   {
!     volatile double result = fstrcmp_bounded (string1, string2, expected);
!     if (!(result == expected))
!       return false;
!   }
!   {
!     double bound = expected * 0.5; /* implies bound <= expected */
!     volatile double result = fstrcmp_bounded (string1, string2, bound);
!     if (!(result == expected))
!       return false;
!   }
!   {
!     double bound = (1 + expected) * 0.5;
!     if (expected < bound)
!       {
!       volatile double result = fstrcmp_bounded (string1, string2, bound);
!       if (!(result < bound))
!         return false;
!       }
!   }
! 
!   return true;
  }
  
  int





reply via email to

[Prev in Thread] Current Thread [Next in Thread]