[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...
From: |
mldonkey-commits |
Subject: |
[Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr... |
Date: |
Fri, 01 Sep 2006 17:17:58 +0000 |
CVSROOT: /sources/mldonkey
Module name: mldonkey
Changes by: spiralvoice <spiralvoice> 06/09/01 17:17:58
Modified files:
distrib : ChangeLog
src/daemon/driver: driverInteractive.ml
src/utils/lib : levenshtein.ml
Log message:
patch #5361
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/mldonkey/distrib/ChangeLog?cvsroot=mldonkey&r1=1.985&r2=1.986
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/daemon/driver/driverInteractive.ml?cvsroot=mldonkey&r1=1.96&r2=1.97
http://cvs.savannah.gnu.org/viewcvs/mldonkey/src/utils/lib/levenshtein.ml?cvsroot=mldonkey&r1=1.1&r2=1.2
Patches:
Index: distrib/ChangeLog
===================================================================
RCS file: /sources/mldonkey/mldonkey/distrib/ChangeLog,v
retrieving revision 1.985
retrieving revision 1.986
diff -u -b -r1.985 -r1.986
--- distrib/ChangeLog 1 Sep 2006 16:32:43 -0000 1.985
+++ distrib/ChangeLog 1 Sep 2006 17:17:58 -0000 1.986
@@ -15,6 +15,7 @@
=========
2006/09/01
+5361: Faster filenames_variability computation (pango)
5360: HTML: Increase webinterface buffer to avoid problems with long output
5315: EDK: Change some internal server data to option values
5354: New option: max_filenames, maximum number of different filenames
Index: src/daemon/driver/driverInteractive.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/daemon/driver/driverInteractive.ml,v
retrieving revision 1.96
retrieving revision 1.97
diff -u -b -r1.96 -r1.97
--- src/daemon/driver/driverInteractive.ml 1 Sep 2006 16:22:14 -0000
1.96
+++ src/daemon/driver/driverInteractive.ml 1 Sep 2006 17:17:58 -0000
1.97
@@ -2194,9 +2194,35 @@
else
Printf.bprintf buf "%s" result
+module UnionFind = struct
+ type t = int array
+ let create_sets n =
+ Array.init n (fun i -> i) (* each element is its own leader *)
+ let find_leader t i =
+ let rec fix_point i =
+ let parent = t.(i) in
+ if parent <> i then fix_point parent
+ else i in
+ let leader = fix_point i in
+ t.(i) <- leader;
+ leader
+ let merge_sets t i j =
+ let leaderi = find_leader t i in
+ let leaderj = find_leader t j in
+ t.(leaderi) <- leaderj
+ let number_of_sets t =
+ let nsets = ref 0 in
+ Array.iteri (fun i ti ->
+ if i = ti then incr nsets) t;
+ !nsets
+end
+
let filenames_variability o list =
let debug = false in
+ (* over this number of filenames, exact variability is not computed
+ (too expensive) *)
+ let bypass_threshold = 100 in
(* minimum distance that must exist between two groups of filenames
so they're considered separate *)
let gap_threshold = 4 in
@@ -2212,48 +2238,37 @@
let canonized_words s =
let len = String.length s in
let current_word = Buffer.create len in
- let rec aux i wl =
+ let rec outside_word i wl =
if i < len then
- if not (is_alphanum s.[i]) then aux (i + 1) wl
- else begin
+ if not (is_alphanum s.[i]) then outside_word (i + 1) wl
+ else begin (* start of a new word *)
Buffer.add_char current_word (Char.lowercase s.[i]);
- aux2 (i + 1) wl
+ inside_word (i + 1) wl
end
else wl
- and aux2 i wl =
+ and inside_word i wl =
if i < len then
- if not (is_alphanum s.[i]) then begin
+ if not (is_alphanum s.[i]) then begin (* end of the word *)
let wl = Buffer.contents current_word :: wl in
Buffer.reset current_word;
- aux i wl
+ outside_word i wl
end else begin
Buffer.add_char current_word (Char.lowercase s.[i]);
- aux2 (i + 1) wl
+ inside_word (i + 1) wl
end
else Buffer.contents current_word :: wl
in
- aux 0 [] in
+ outside_word 0 [] in
let costs = {
Levenshtein.insert_cost = 1;
Levenshtein.delete_cost = 1;
Levenshtein.replace_cost = 2 } in
+ (* we can only assume the distance is symetric if insert and
+ delete costs are the same *)
+ assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
let dist = Levenshtein.ForWords.distance costs in
- (* fold over all the pairs that can be made with the elements of l *)
- let list_pair_fold f acc l =
- let rec aux acc e1 l =
- let rec aux2 acc e1 l =
- match l with
- | [] -> acc
- | h :: q -> aux2 (f acc e1 h) e1 q in
- match l with
- | [] -> acc
- | h :: q -> aux (aux2 acc e1 l) h q in
- match l with
- | [] -> acc
- | h :: q -> aux acc h q in
-
let score_list =
List.map (fun fileinfo ->
(* canonize filenames by keeping only lowercase words, and
@@ -2264,88 +2279,20 @@
Array.of_list (List.sort String.compare (canonized_words fn)) in
if List.mem new_fn acc then acc else new_fn :: acc
) [] fileinfo.file_names) in
- (* precalculate all Levenshtein distances
- That's currently the most expensive phase when lots of
- different filenames exist
- *)
- let n = Array.length fns in
- let matrix = Array.make_matrix n n 0 in
- (* we can only assume the matrix is symetric if insert and
- delete costs are the same *)
- assert (costs.Levenshtein.insert_cost = costs.Levenshtein.delete_cost);
- for i = 0 to n - 2 do
+
+ let nfilenames = Array.length fns in
+ if nfilenames > bypass_threshold then
+ fileinfo, bypass_threshold
+ else
+ let unionfind_sets = UnionFind.create_sets nfilenames in
+ for i = 0 to nfilenames - 2 do
let d1 = dist fns.(i) in
- for j = i + 1 to n - 1 do
- matrix.(i).(j) <- d1 fns.(j)
+ for j = i + 1 to nfilenames - 1 do
+ if d1 fns.(j) < gap_threshold then
+ UnionFind.merge_sets unionfind_sets i j
done
done;
-
- (* for debugging only *)
- let rec string_of_set (s, d) =
- Printf.sprintf "[%s] (%d)"
- (String.concat "/"
- (List.map (fun i ->
- String.concat " " (Array.to_list fns.(i))
- ) s)) d in
-
- (* there's one more cluster than gaps between clusters *)
- let number_of_clusters (s, d) = d + 1 in
-
- let pair_dist i1 i2 =
- (* again we assume the matrix is symetric,
- and that i1 <> i2 *)
- if i1 < i2 then matrix.(i1).(i2) else matrix.(i2).(i1) in
-
- (* usual definition of distance between two sets
- d(E1,E2) = min { e1 in E1, e2 in E2 / d(e1,e2) }
- *)
- let rec sets_dist s1 s2 =
- match s1, s2 with
- | ([i1], _), ([i2], _) -> pair_dist i1 i2
- | ([i1], _), (h2 :: q2, d2) ->
- min (pair_dist i1 h2) (sets_dist s1 (q2, d2))
- | (h1 :: q1, d1), s2 ->
- min (sets_dist ([h1], d1) s2) (sets_dist (q1, d1) s2)
- | _ -> assert false in
-
- (* initially, each filename is in its own set *)
- let initial_list_of_sets =
- let rec aux n l =
- let n1 = n - 1 in
- if n1 >= 0 then
- aux n1 (([n1], 0) :: l)
- else l in
- aux n [] in
-
- let gap d =
- if d < gap_threshold then 0 else 1 in
-
- let rec coalesce_sets ls =
- match ls with
- | [s] -> s
- | _ ->
- (* find two sets with minimal distance and coalesce them *)
- match (
- list_pair_fold (fun acc e1 e2 ->
- let d = sets_dist e1 e2 in
- match acc with
- | None -> Some (e1, e2, d)
- | Some (bs1, bs2, min_dist) ->
- if d < min_dist then Some (e1, e2, d)
- else acc) None ls) with
- | Some (((s1, d1) as e1), ((s2, d2) as e2), min_dist) ->
- if debug then
- Printf.printf "Coalesce\n%s and\n%s (distance %d)\n"
- (string_of_set e1) (string_of_set e2) min_dist;
- coalesce_sets
- ((s1 @ s2, d1 + d2 + (gap min_dist)) ::
- (List.filter (fun e -> e != e1 && e != e2) ls))
- | None -> assert false
- in
- let coalesced_set = coalesce_sets initial_list_of_sets in
- let nclusters = number_of_clusters coalesced_set in
-
- fileinfo, nclusters
+ fileinfo, UnionFind.number_of_sets unionfind_sets
) list in
(* files with most clusters at the end of results table *)
Index: src/utils/lib/levenshtein.ml
===================================================================
RCS file: /sources/mldonkey/mldonkey/src/utils/lib/levenshtein.ml,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- src/utils/lib/levenshtein.ml 21 Jul 2006 19:29:24 -0000 1.1
+++ src/utils/lib/levenshtein.ml 1 Sep 2006 17:17:58 -0000 1.2
@@ -25,37 +25,39 @@
let distance lc =
fun s1 ->
let l1 = C.length s1 in
- let m = Array.make (l1 + 1) 0 in
- let n = Array.make (l1 + 1) 0 in
+ let current_row = Array.make (l1 + 1) 0 in
+ let next_row = Array.make (l1 + 1) 0 in
fun s2 ->
let l2 = C.length s2 in
(* invariant:
matrix.(a).(b) =
levenshtein.distance lc (String.sub s1 0 a) (String.sub s2 0 b)
- m.(i) = matrix.(i).(j)
- n.(i) = matrix.(i).(j+1) *)
- m.(0) <- 0;
+ current_row.(i) = matrix.(i).(j)
+ next_row.(i) = matrix.(i).(j+1) *)
+ current_row.(0) <- 0;
for i = 1 to l1 do
- m.(i) <- m.(i - 1) + lc.delete_cost
+ current_row.(i) <- current_row.(i - 1) + lc.delete_cost
done;
- let rec aux j m n =
- if j = l2 then m.(l1)
+ let min_3int a b c : int =
+ let min = if a <= b then a else b in
+ if min <= c then min else c in
+ let rec aux j current_row next_row =
+ if j = l2 then current_row.(l1)
else
let c2 = C.get s2 j in
- n.(0) <- m.(0) + lc.insert_cost;
+ next_row.(0) <- current_row.(0) + lc.insert_cost;
for i = 1 to l1 do
let i1 = i - 1 in
- n.(i) <-
- min
- (min
- (n.(i1) + lc.delete_cost)
- (m.(i) + lc.insert_cost))
- (m.(i1) +
- (if C.equal (C.get s1 i1) c2 then 0 else lc.replace_cost))
+ next_row.(i) <-
+ min_3int
+ (next_row.(i1) + lc.delete_cost)
+ (current_row.(i) + lc.insert_cost)
+ (if C.equal (C.get s1 i1) c2 then current_row.(i1)
+ else current_row.(i1) + lc.replace_cost)
done;
- aux (j + 1) n m in
- aux 0 m n
+ aux (j + 1) next_row current_row in (* swap rows *)
+ aux 0 current_row next_row
end
module ChainString = struct
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/01
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/01
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr...,
mldonkey-commits <=
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/03
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/03
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/07
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/12
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/13
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/13
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/16
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/23
- [Mldonkey-commits] mldonkey distrib/ChangeLog src/daemon/driver/dr..., mldonkey-commits, 2006/09/23