(* Copyright (C) 2008 Mauricio Fernandez http://eigenclass.org *)

open Printf
open ExtList

module H = Hashtbl

(* URIs that are counted as hits (and whose clients and referrers are
   tallied) must match this pattern. *)
let re =
  Pcre.regexp "^/ongoing/When/\\d\\d\\dx/\\d\\d\\d\\d/\\d\\d/\\d\\d/[^ .]+$"

(* Referrer words matching this pattern are not counted as referrers.  The
   pattern is applied to the raw word, which still carries its leading
   double quote. *)
let self_ref_re = Pcre.regexp "^\"http://www.tbray.org/ongoing/"

(* Low-precedence function application. *)
let (@@) f x = f x

(* Int64 arithmetic shorthands. *)
let (+!), (/!), (-!) = Int64.add, Int64.div, Int64.sub

(* Global accumulators, keyed by string.  u_bytes holds int64 counters, the
   other tables hold int counters. *)
let u_hits, u_bytes, s404s, clients, refs =
  H.create 8192, H.create 8192, H.create 8192, H.create 1384, H.create 16384

(* [incr_count h k] adds one to the counter bound to [k] in [h], creating
   it with value 1 when [k] is not bound yet. *)
let incr_count h k =
  try incr (H.find h k)
  with Not_found -> H.add h k (ref 1)

(* [add_to_num_count h k n] adds the int64 [n] to the counter bound to [k]
   in [h], creating it with value [n] when [k] is not bound yet. *)
let add_to_num_count h k n =
  try
    let r = H.find h k in
    r := Int64.add !r n
  with Not_found -> H.add h k (ref n)

(* [record client u bytes ref] accounts for one served request.
   - A non-zero [bytes] is always added to the byte total of URI [u].
   - When [u] matches [re], the hit is counted for [u] and for [client];
     the referrer is counted too unless it is the dash placeholder or
     matches [self_ref_re].
   [ref] is the raw referrer word; its first and last characters (the
   surrounding quotes) are stripped before it is used as a key.  Words
   shorter than two characters cannot be stripped and are skipped rather
   than letting String.sub raise Invalid_argument. *)
let record client u bytes ref =
  if bytes <> 0L then add_to_num_count u_bytes u bytes;
  if Pcre.pmatch ~rex:re u then begin
    incr_count u_hits u;
    incr_count clients client;
    let len = String.length ref in
    if len >= 2 && ref <> "\"-\"" && not (Pcre.pmatch ~rex:self_ref_re ref)
    then incr_count refs (String.sub ref 1 (len - 2))
  end

(* [report ?n compf print_f label hash] prints a header line for [label],
   then up to [n] (default 10) of the greatest (count, key) pairs of [hash]
   according to [compf], each through [print_f], then an empty line.

   The candidates are kept in a PMap ordered by [compf], so the current
   minimum is the one evicted when a greater pair arrives.  The pairs are
   printed in the order produced by consing during PMap.foldi (greatest
   first if foldi visits keys in ascending order -- TODO confirm against
   the PMap in use).

   Assumes the pairs are pairwise distinct under [compf], which holds for
   the callers in this file because the key breaks ties. *)
let report ?(n = 10) compf print_f label hash =
  let keep n x ((elms, set) as t) =
    (* Fill the set up to capacity before comparing against the minimum.
       Testing the minimum first would drop every candidate smaller than
       the pairs seen so far even though there is still room for it.  As
       before, the very first pair is always kept. *)
    if elms = 0 || elms < n then (elms + 1, PMap.add x true set)
    else
      let min, _ = PMap.min_binding set in
      if compf x min <= 0 then t
      else (elms, PMap.add x true (PMap.remove_min_binding set)) in
  let elements (_, set) = PMap.foldi (fun k _ l -> k :: l) set [] in
  let top =
    H.fold (fun k v s -> keep n (!v, k) s) hash (0, PMap.create compf) in
  printf "Top %s:\n" label;
  List.iter print_f (elements top);
  printf "\n"

(* [chunks n filename] splits the file into consecutive (start, stop) byte
   ranges of roughly size/n bytes each.  Every boundary is moved forward to
   the position just after the next newline so that no line straddles two
   ranges; the last range ends at the file size.  Values of [n] below 1 are
   treated as 1 (0 used to raise Division_by_zero).  The channel is closed
   on every path, including when seeking or reading fails. *)
let chunks n filename =
  let n = if n < 1 then 1 else n in
  let len = (Unix.LargeFile.stat filename).Unix.LargeFile.st_size in
  let size = len /! Int64.of_int n in
  let ic = open_in filename in
  (* Position just after the line containing the current offset, or None
     when the end of the file is reached first. *)
  let next_boundary () =
    try
      ignore (input_line ic);
      Some (LargeFile.pos_in ic)
    with End_of_file -> None in
  let rec loop acc start =
    if Int64.compare start len >= 0 then acc
    else
      let stop = start +! size in
      if Int64.compare stop len >= 0 then (start, len) :: acc
      else begin
        LargeFile.seek_in ic stop;
        match next_boundary () with
        | Some p -> loop ((start, p) :: acc) p
        | None -> (start, len) :: acc
      end in
  let ranges =
    try loop [] 0L
    with e -> close_in_noerr ic; raise e in
  close_in ic;
  List.rev ranges

open Str_util

(* Word-splitting scratch state shared by every call to [proc_line]. *)
let b = make_word_info 1024

(* [proc_line ~max read str] processes one log line and returns the updated
   byte offset within the chunk: [read] plus the length of [str] plus one
   for the newline.  Raises Exit, before touching any table, when [read]
   has already reached [max].

   Only lines with at least 11 words whose word 5 is the quoted GET method
   are considered.  Word 0 is used as the client, word 6 as the URI, word 8
   as the status, word 9 as the byte count and word 10 as the referrer
   (presumably the combined log format -- confirm against the input).
   Status 200 records the hit with its byte count (0 when the field is not
   a number), 304 records it with 0 bytes, 404 counts the URI in [s404s],
   and any other status is ignored. *)
let proc_line ~max read str =
  if Int64.compare read max >= 0 then raise Exit;
  find_words b str;
  if num_words b >= 11 && word b str 5 = "\"GET" then begin
    let client = word b str 0 and u = word b str 6 in
    let status = word b str 8 and ref = word b str 10 in
    match status with
    | "200" ->
        let bytes =
          try Int64.of_string (word b str 9)
          with Failure _ -> Int64.zero in
        record client u bytes ref
    | "304" -> record client u 0L ref
    | "404" -> incr_count s404s u
    | _ -> ()
  end;
  read +! Int64.of_int (String.length str + 1) (* the newline *)

(* [proc_chunk (start, stop)] clears the global tables, processes the lines
   of the file named by Sys.argv.(1) that begin within the byte range
   [start, stop), and returns the contents of the five tables as
   association lists, in the order u_hits, u_bytes, s404s, clients, refs.
   The input channel is closed before returning, and also when an
   unexpected exception escapes. *)
let proc_chunk (start, stop) =
  H.clear u_bytes;
  List.iter H.clear [u_hits; s404s; clients; refs];
  let to_l h = H.fold (fun k v l -> (k, !v) :: l) h [] in
  let max = stop -! start in
  let ic = open_in_bin Sys.argv.(1) in
  let read = ref 0L in
  begin
    try
      LargeFile.seek_in ic start;
      (* Left by End_of_file from input_line or Exit from proc_line. *)
      while true do
        read := proc_line ~max !read (input_line ic)
      done
    with
    | End_of_file | Exit -> close_in ic
    | e -> close_in_noerr ic; raise e
  end;
  (to_l u_hits, to_l u_bytes, to_l s404s, to_l clients, to_l refs)

(* [merge results] folds the association lists returned by [proc_chunk]
   into the global tables, summing the counters of keys that are already
   present and inserting the others. *)
let merge (u_hits', u_bytes', s404s', clients', refs') =
  let add f newh oldh =
    List.iter
      (fun (k, v) ->
         try
           let old = Hashtbl.find oldh k in
           old := f !old v
         with Not_found -> Hashtbl.add oldh k (ref v))
      newh in
  add (+!) u_bytes' u_bytes;
  List.iter
    (fun (a, b) -> add (+) a b)
    [u_hits', u_hits; s404s', s404s; clients', clients; refs', refs]

(* Entry point.  Command line: LOGFILE [WORKERS [NCHUNKS]].
   WORKERS defaults to 1 and NCHUNKS defaults to WORKERS. *)
let () =
  if Array.length Sys.argv < 2 then begin
    eprintf "usage: %s LOGFILE [WORKERS [NCHUNKS]]\n" Sys.executable_name;
    exit 2
  end;
  (* Order by the first component; ties are broken by the reversed string
     order of the second. *)
  let compare_pairs compare (a1, b1) (a2, b2) =
    match compare a1 a2 with
    | 0 -> String.compare b2 b1 (* reverse *)
    | r -> r in
  let shrink s =
    if String.length s > 60 then String.sub s 0 60 ^ "..." else s in
  let r1 ?n label hash =
    report ?n (compare_pairs compare)
      (fun (n, s) -> printf " %10d: %s\n" n (shrink s))
      label hash in
  let mega = 1024. *. 1024. in
  let r2 ?n label hash =
    report ?n (compare_pairs Int64.compare)
      (fun (n, s) -> printf " %9.1fM: %s\n" (Int64.to_float n /. mega) (shrink s))
      label hash in
  (* Argument [i] as an integer, or [default] when it is absent, not a
     number, or below 1.  Zero workers would make the batching loop below
     spin forever, and zero chunks is a division by zero. *)
  let positive_arg i default =
    if i >= Array.length Sys.argv then default
    else
      try
        let v = int_of_string (Array.get Sys.argv i) in
        if v < 1 then default else v
      with Failure _ -> default in
  let workers = positive_arg 2 1 in
  let nchunks = positive_arg 3 workers in
  (* Handle the chunks in batches of [workers]: every chunk of a batch is
     handed to Parallel.invoke first, then the returned thunks are forced
     and their results merged (presumably each invocation runs
     concurrently -- see the Parallel module). *)
  let rec loop = function
    | [] -> ()
    | l ->
        let h, tl = List.take workers l, List.drop workers l in
        List.iter
          (fun f -> merge (f ()))
          (List.map (Parallel.invoke proc_chunk) h);
        loop tl in
  loop (chunks nchunks (Array.get Sys.argv 1));
  let len = H.length in
  printf "%d resources, %d 404s, %d clients\n\n"
    (len u_hits) (len s404s) (len clients);
  r1 "URIs by hit" u_hits;
  r2 "URIs by bytes" u_bytes;
  r1 "404s" s404s;
  r1 "client addresses" clients;
  r1 "referrers" refs