Google Groups

Re: best and fastest way to read lines from a file?


Ashish Agarwal Oct 1, 2007 2:53 PM
Posted in group: fa.caml
Try this:

(* test.ml *)
(** [lines fname] opens [fname] and returns a lazy stream of its lines,
    each one read on demand with [input_line].

    The underlying channel is closed as soon as the end of the file is
    reached, so a fully consumed stream does not leak a file descriptor.
    A stream that is abandoned before its end keeps the channel open.

    @raise Sys_error if [fname] cannot be opened. *)
let lines fname =
  let cin = open_in fname in
  (* Records that the channel was closed, so the generator never reads
     from it again even if it is invoked after the end of the stream. *)
  let closed = ref false in
  let next _ =
    if !closed then None
    else
      try Some (input_line cin)
      with End_of_file ->
        closed := true;
        (* Best-effort close: a failure here must not mask the normal
           end-of-stream signal. *)
        close_in_noerr cin;
        None
  in
    Stream.from next

(* Driver: drain the line stream of the input file, bumping [count] once
   per line, then report the total on standard output. *)
let count = ref 0
let strm = lines "your-input-file.txt"
let () = Stream.iter (fun _line -> count := !count + 1) strm
let () = print_endline ("Done: " ^ string_of_int !count)

On my machine (using some other file I had with 637114 lines)...
$ time ./test.py
Done:  637114

real    0m0.509s
user    0m0.412s
sys     0m0.096s

$ ocamlopt test.ml
$ time ./a.out
Done: 637114

real    0m0.305s
user    0m0.236s
sys     0m0.067s

On Oct 1, 5:28 pm, YC <yinso.c...@gmail.com> wrote:
> Hi all -
>
> Newbie question: I'm wondering what's the most efficient way to read in a
> file line by line?  I wrote a routine in both Python and OCaml that reads a
> file with 345K lines and counts them, and was surprised that the Python
> code ran roughly 3x faster.
>
> I thought the speed should be equivalent and/or somewhat in OCaml's favor,
> given this is an IO-bound comparison, but perhaps Python's simple for
> loop has a read-ahead buffer built in, and perhaps OCaml's input channel is
> unbuffered, but I'm not sure how to write buffered code that can do a
> line-by-line read.
>
> Any insight is appreciated, thanks ;)
>
> yc
>
> Python code:
> # test.py
> #!/usr/bin/python
>
> file = <345k-line.txt>
> count = 0
> for line in open (file, "r"):
>     count = count + 1
> print "Done: ", count
>
> OCaml code:
> (* test.ml *)
> let rec line_count filename =
>   let f = open_in filename in
>   let rec loop file count =
>     try
>       ignore (input_line file);
>       loop file (count + 1)
>     with
>       End_of_file -> count
>   in
>     loop f 0;;
>
> let count = line_count <345k-line.txt> in
>     Printf.printf "Done: %d" count;;
>
> Test
> $ time ./test.py
> Done: 345001
>
> real    0m0.416s
> user   0m0.101s
> sys    0m0.247s
>
> $ ocamlopt -o test test.ml
> $ time ./test
> Done: 345001
> real    0m1.483s
> user   0m0.631s
> sys    0m0.685s
>
> _______________________________________________
> Caml-list mailing list. Subscription management:http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
> Archives:http://caml.inria.fr
> Beginner's list:http://groups.yahoo.com/group/ocaml_beginners
> Bug reports:http://caml.inria.fr/bin/caml-bugs