mldonkey-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...


From: mldonkey-commits
Subject: [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...
Date: Fri, 01 Sep 2006 17:17:58 +0000

CVSROOT:        /sources/mldonkey
Module name:    mldonkey
Changes by:     spiralvoice <spiralvoice>       06/09/01 17:17:58

Modified files:
        distrib        : ChangeLog 
        src/daemon/driver: driverInteractive.ml 
        src/utils/lib  : levenshtein.ml 

Log message:
        patch #5361

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/mldonkey/distrib/ChangeLog?cvsroot=mldonkey&r1=1.985&r2=1.986
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverInteractive.ml?cvsroot=mldonkey&r1=1.96&r2=1.97
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/utils/lib/levenshtein.ml?cvsroot=mldonkey&r1=1.1&r2=1.2

Patches:
Index: distrib/ChangeLog
===================================================================
RCS file: /sources/mldonkey/mldonkey/distrib/ChangeLog,v
retrieving revision 1.985
retrieving revision 1.986
diff -u -b -r1.985 -r1.986
--- distrib/ChangeLog   1 Sep 2006 16:32:43 -0000       1.985
+++ distrib/ChangeLog   1 Sep 2006 17:17:58 -0000       1.986
@@ -15,6 +15,7 @@
 =========
 
 2006/09/01
+5361: Faster filenames_variability computation (pango)
 5360: HTML: Increase webinterface buffer to avoid problems with long output
 5315: EDK: Change some internal server data to option values
 5354: New option: max_filenames, maximum number of different filenames

Index: src/daemon/driver/driverInteractive.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverInteractive.ml,v
retrieving revision 1.96
retrieving revision 1.97
diff -u -b -r1.96 -r1.97
--- src/daemon/driver/driverInteractive.ml      1 Sep 2006 16:22:14 -0000       
1.96
+++ src/daemon/driver/driverInteractive.ml      1 Sep 2006 17:17:58 -0000       
1.97
@@ -2194,9 +2194,35 @@
   else
     Printf.bprintf buf "%s" result
 
+module UnionFind = struct
+  type t = int array
+  let create_sets n =
+    Array.init n (fun i -> i) (* each element is its own leader *)
+  let find_leader t i =
+    let rec fix_point i =
+      let parent = t.(i) in
+      if parent <> i then fix_point parent 
+      else i in
+    let leader = fix_point i in
+    t.(i) <- leader;
+    leader
+  let merge_sets t i j =
+    let leaderi = find_leader t i in
+    let leaderj = find_leader t j in
+    t.(leaderi) <- leaderj
+  let number_of_sets t =
+    let nsets = ref 0 in
+    Array.iteri (fun i ti ->
+      if i = ti then incr nsets) t;
+    !nsets
+end
+
 let filenames_variability o list =
   let debug = false in
 
+  (* over this number of filenames, exact variability is not computed
+     (too expensive) *)
+  let bypass_threshold = 100 in
   (* minimum distance that must exist between two groups of filenames
      so they're considered separate *)
   let gap_threshold = 4 in
@@ -2212,48 +2238,37 @@
   let canonized_words s =
     let len = String.length s in
     let current_word = Buffer.create len in
-    let rec aux i wl =
+    let rec outside_word i wl =
       if i < len then
-       if not (is_alphanum s.[i]) then aux (i + 1) wl
-       else begin
+       if not (is_alphanum s.[i]) then outside_word (i + 1) wl
+       else begin (* start of a new word *)
          Buffer.add_char current_word (Char.lowercase s.[i]);
-         aux2 (i + 1) wl
+         inside_word (i + 1) wl
        end
       else wl
-    and aux2 i wl =
+    and inside_word i wl =
       if i < len then 
-       if not (is_alphanum s.[i]) then begin
+       if not (is_alphanum s.[i]) then begin (* end of the word *)
          let wl = Buffer.contents current_word :: wl in
          Buffer.reset current_word;
-         aux i wl
+         outside_word i wl
        end else begin
          Buffer.add_char current_word (Char.lowercase s.[i]);
-         aux2 (i + 1) wl
+         inside_word (i + 1) wl
        end
       else Buffer.contents current_word :: wl
     in
-    aux 0 [] in
+    outside_word 0 [] in
 
   let costs = { 
     Levenshtein.insert_cost = 1; 
     Levenshtein.delete_cost = 1; 
     Levenshtein.replace_cost = 2 } in
+  (* we can only assume the distance is symetric if insert and
+     delete costs are the same *)
+  assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
   let dist = Levenshtein.ForWords.distance costs in
 
-  (* fold over all the pairs that can be made with the elements of l *)
-  let list_pair_fold f acc l =
-    let rec aux acc e1 l =
-      let rec aux2 acc e1 l =
-       match l with
-       | [] -> acc
-       | h :: q -> aux2 (f acc e1 h) e1 q in
-      match l with
-      | [] -> acc
-      | h :: q -> aux (aux2 acc e1 l) h q in
-    match l with
-    | [] -> acc
-    | h :: q -> aux acc h q in
-
   let score_list =
     List.map (fun fileinfo ->
       (* canonize filenames by keeping only lowercase words, and
@@ -2264,88 +2279,20 @@
          Array.of_list (List.sort String.compare (canonized_words fn)) in
        if List.mem new_fn acc then acc else new_fn :: acc
       ) [] fileinfo.file_names) in
-      (* precalculate all Levenshtein distances 
-        That's currently the most expensive phase when lots of
-        different filenames exist 
-      *)
-      let n = Array.length fns in
-      let matrix = Array.make_matrix n n 0 in
-      (* we can only assume the matrix is symetric if insert and
-        delete costs are the same *)
-      assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
-      for i = 0 to n - 2 do
+
+      let nfilenames = Array.length fns in
+      if nfilenames > bypass_threshold then
+       fileinfo, bypass_threshold
+      else
+       let unionfind_sets = UnionFind.create_sets nfilenames in
+       for i = 0 to nfilenames - 2 do
        let d1 = dist fns.(i) in
-       for j = i + 1 to n - 1 do
-         matrix.(i).(j) <- d1 fns.(j)
+         for j = i + 1 to nfilenames - 1 do
+           if d1 fns.(j) < gap_threshold then
+             UnionFind.merge_sets unionfind_sets i j
        done
       done;
-
-      (* for debugging only *)
-      let rec string_of_set (s, d) = 
-       Printf.sprintf "[%s] (%d)" 
-         (String.concat "/" 
-           (List.map (fun i -> 
-             String.concat " " (Array.to_list fns.(i))
-           ) s)) d in
-      
-      (* there's one more cluster than gaps between clusters *)
-      let number_of_clusters (s, d) = d + 1 in
-      
-      let pair_dist i1 i2 =
-       (* again we assume the matrix is symetric,
-          and that i1 <> i2 *)
-       if i1 < i2 then matrix.(i1).(i2) else matrix.(i2).(i1) in
-
-      (* usual definition of distance between two sets 
-        d(E1,E2) = min { e1 in E1, e2 in E2 / d(e1,e2) }
-      *)
-      let rec sets_dist s1 s2 =
-       match s1, s2 with
-       | ([i1], _), ([i2], _) -> pair_dist i1 i2
-       | ([i1], _), (h2 :: q2, d2) ->
-           min (pair_dist i1 h2) (sets_dist s1 (q2, d2)) 
-       | (h1 :: q1, d1), s2 -> 
-           min (sets_dist ([h1], d1) s2) (sets_dist (q1, d1) s2) 
-       | _ -> assert false in
-      
-      (* initially, each filename is in its own set *)
-      let initial_list_of_sets = 
-       let rec aux n l =
-         let n1 = n - 1 in
-         if n1 >= 0 then 
-           aux n1 (([n1], 0) :: l) 
-         else l in
-       aux n [] in
-
-      let gap d =
-       if d < gap_threshold then 0 else 1 in
-      
-      let rec coalesce_sets ls =
-       match ls with
-       | [s] -> s
-       | _ ->
-           (* find two sets with minimal distance and coalesce them *)
-           match (
-             list_pair_fold (fun acc e1 e2 ->
-               let d = sets_dist e1 e2 in
-               match acc with
-               | None -> Some (e1, e2, d)
-               | Some (bs1, bs2, min_dist) ->
-                   if d < min_dist then Some (e1, e2, d)
-                   else acc) None ls) with
-           | Some (((s1, d1) as e1), ((s2, d2) as e2), min_dist) ->
-               if debug then
-                 Printf.printf "Coalesce\n%s and\n%s (distance %d)\n"
-                   (string_of_set e1) (string_of_set e2) min_dist;
-               coalesce_sets 
-                 ((s1 @ s2, d1 + d2 + (gap min_dist)) ::
-                   (List.filter (fun e -> e != e1 && e != e2) ls))
-           | None -> assert false
-      in
-      let coalesced_set = coalesce_sets initial_list_of_sets in
-      let nclusters = number_of_clusters coalesced_set in
-      
-      fileinfo, nclusters
+       fileinfo, UnionFind.number_of_sets unionfind_sets
     ) list in
 
   (* files with most clusters at the end of results table *)

Index: src/utils/lib/levenshtein.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/utils/lib/levenshtein.ml,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- src/utils/lib/levenshtein.ml        21 Jul 2006 19:29:24 -0000      1.1
+++ src/utils/lib/levenshtein.ml        1 Sep 2006 17:17:58 -0000       1.2
@@ -25,37 +25,39 @@
   let distance lc =
     fun s1 ->
       let l1 = C.length s1 in
-      let m = Array.make (l1 + 1) 0 in
-      let n = Array.make (l1 + 1) 0 in
+      let current_row = Array.make (l1 + 1) 0 in
+      let next_row = Array.make (l1 + 1) 0 in
       fun s2 ->
        let l2 = C.length s2 in
        (* invariant:
           matrix.(a).(b) = 
           levenshtein.distance lc (String.sub s1 0 a) (String.sub s2 0 b) 
         
-          m.(i) = matrix.(i).(j)
-          n.(i) = matrix.(i).(j+1) *)
-       m.(0) <- 0;
+          current_row.(i) = matrix.(i).(j)
+          next_row.(i) = matrix.(i).(j+1) *)
+       current_row.(0) <- 0;
        for i = 1 to l1 do
-         m.(i) <- m.(i - 1) + lc.delete_cost
+         current_row.(i) <- current_row.(i - 1) + lc.delete_cost
        done;
-       let rec aux j m n =
-         if j = l2 then m.(l1)
+       let min_3int a b c : int =
+         let min = if a <= b then a else b in
+         if min <= c then min else c in
+       let rec aux j current_row next_row =
+         if j = l2 then current_row.(l1)
          else
            let c2 = C.get s2 j in
-           n.(0) <- m.(0) + lc.insert_cost;
+           next_row.(0) <- current_row.(0) + lc.insert_cost;
            for i = 1 to l1 do
              let i1 = i - 1 in
-             n.(i) <-
-               min
-               (min 
-                 (n.(i1) + lc.delete_cost)
-                 (m.(i) + lc.insert_cost))
-               (m.(i1) + 
-                 (if C.equal (C.get s1 i1) c2 then 0 else lc.replace_cost))
+             next_row.(i) <-
+               min_3int
+                 (next_row.(i1) + lc.delete_cost)
+                 (current_row.(i) + lc.insert_cost)
+                 (if C.equal (C.get s1 i1) c2 then current_row.(i1) 
+                  else current_row.(i1) + lc.replace_cost)
            done;
-           aux (j + 1) n m in
-       aux 0 m n
+           aux (j + 1) next_row current_row in (* swap rows *)
+       aux 0 current_row next_row
 end
 
 module ChainString = struct




reply via email to

[Prev in Thread] Current Thread [Next in Thread]