The point is that I read an article by Moura et al. whose title is "A
functional language to implement the divide-and-conquer delaunay
triangulation algorithm" (Applied mathematics and computation, 2005,
vol. 168, no1, pp. 178-191). The author works with OCAML. Then I
decided to give OCAML a try. However, OCAML turned out to be 20 times
slower than Bigloo or Stalin. Although this results is compatible with
the findings of Dr. Moura, I believe that it arises from my lack of
experience with OCAML. I used Bigarray, since I am dealing with more
than 2000000 elements (much more). I am posting a small benchmark that
shows the same behavior as the original programs, i.e., Scheme is 20
times faster than OCAML. I would be happy if one could tell me what I
did wrong. I am working with OCAML 3.10.2 since the new OCAML 3.11.0
is not available for Windows. By the way, C, Clean, Bigloo and Stalin
all have compatible speed and memory usage. In my machine, the
benchmark produce a result after 2 minutes
Here is how I compile OCAML
>ocamlopt bigarray.cmxa -ccopt -O3 -unsafe matone.ml -o matone.exe
Here is the benchmark:
matone 5000
Here is how I compile Scheme:
bigloo -Obench -farithmetic -O3 matb.scm -o matb.exe
Here is how I run the Scheme code:
matb 5000
Here is the OCAML program:
open Bigarray;;
let make_system r=
let c= r+1 in
let m= Array1.create float64 c_layout (r * c) in
let xx= ref 0.0 and s= ref 0.0 in
for i= 0 to r-1 do
s := 0.0;
for j=0 to r-1 do
xx := Random.float 100.0;
s := s.contents +. xx.contents;
Array1.set m (i*c+j) xx.contents
done;
Array1.set m (i*c+r) s.contents
done;
m
;;
let iii= ref 0;;
let swapit m c k l=
let t = ref 0.0 in
iii := iii.contents + 1;
for j=0 to c-1 do
t := Array1.get m (k*c+j);
Array1.set m (k*c+j) (Array1.get m (l*c+j));
Array1.set m (l*c+j) t.contents
done
;;
let find_max m c k idx=
let i= ref idx in
for l= k+1 to c-2 do
if (abs_float (Array1.get m (l*c+k))) >
(abs_float (Array1.get m (i.contents * c + k))) then
i := l
done;
if i.contents <> k then swapit m c k i.contents
;;
let solvit mm r c=
let rat = ref 0.0 and tx= ref 0.0 and mkk= ref 0.0 in
for k=0 to r-2 do
find_max mm c k k;
mkk := Array1.get mm (c*k + k);
for i= k+1 to r-1 do
rat := (Array1.get mm (i*c + k)) /. mkk.contents;
for j=k to c-1 do
Array1.set mm (i*c+j) ( (Array1.get mm (i*c+j)) -.
rat.contents *. (Array1.get mm
(k*c + j)))
done
done
done;
for i=r-1 downto 0 do
tx := 0.0;
for j=i+1 to r-1 do
tx := tx.contents -. (Array1.get mm (i*c + j)) *.
(Array1.get mm (j*c + r))
done;
Array1.set mm (i*c+r) ((( Array1.get mm (i*c + r)) +.
tx.contents) /.
(Array1.get mm (i*c + i)))
done
;;
let nels () =
try
int_of_string Sys.argv.(1)
with
_ -> 5000
;;
let r= nels () in
let c= (r+1) in
let mm= make_system r in
(
solvit mm r c;
for i= 0 to (min (r-1) 10) do
Printf.printf "%8.3f " (Array1.get mm (i*c+r))
done;
print_newline();
Printf.printf "Number of swaps= %d\n" iii.contents;
Printf.printf "\nOcaml Time= %f\n" (Sys.time())
);;
Here is the Scheme program:
;; Compile: bigloo -Obench -farithmetic -O3 matb.scm -o matb.exe
(module example
(main main)
(option (set! *genericity* #f))
(extern
(macro printf::int (::string ::double) "printf")
(clock::int () "clock") ))
(define iii 0)
(define-macro ($ v c i j)
`(f64vector-ref ,v (+fx (*fx ,i ,c) ,j) ) )
(define-macro ($! v c i j val)
`(f64vector-set! ,v (+fx (*fx ,i ,c) ,j) ,val))
(define-macro (!= x y)
`(not (=fx ,x ,y)))
(define (prt m r c)
(do ( (i 0 (+fx i 1)) ) ( (>=fx i r) )
(newline)
(do ((j 0 (+fx j 1)) ) ( (>=fx j c) )
(printf " %4.3f " ($ m c i j) ) )))
(define (make-system r)
(let* ( (c (+fx r 1))
(m (make-f64vector (*fx r c) 0.0 ))
(xx 0.0)
(s 0.0) )
(do ( (i 0 (+fx i 1)) ) ( (>=fx i r) m)
(set! s 0.0)
(do ((j 0 (+fx j 1) ) ) ( (>=fx j r) ($! m c i j s) )
(set! xx (fixnum->flonum (random 3873)))
(set! s (+fl s xx))
($! m c i j xx )) ) ))
(define (swapit m c k l)
(let ((t 0.0))
(set! iii (+fx iii 1))
(do ( (j 0 (+fx j 1)) ) ( (>=fx j c) )
(set! t ($ m c k j ) )
($! m c k j ($ m c l j) )
($! m c l j t) ) ) )
(define (find-max m c k i)
(do ( (l (+fx k 1) (+fx l 1)) )
( (>=fx l (-fx c 1)) (when (!= i k) (swapit m c k i )))
(when (>fl (absfl ($ m c l k)) (absfl ($ m c i k)) )
(set! i l) ) ))
(define (solvit m r)
(let ( (c (+fx r 1))
(rat 0.0)
(mkk 0.0))
(do ( (k 0 (+fx k 1)) ) ( (>=fx k (-fx r 1)))
(find-max m c k k)
(set! mkk ($ m c k k) )
(do ( ( i (+fx k 1)(+fx i 1)) ) ( (>=fx i r))
(set! rat (/fl ($ m c i k) mkk ))
(do ( (j k (+fx j 1))) ( (>=fx j c) )
($! m c i j (-fl ($ m c i j)
(*fl rat ($ m c k j ) ) ) )
)
)
)
(do ( (i (-fx r 1) (-fx i 1) ) ) ((<fx i 0) m)
(do ( (j (+fx i 1) (+fx j 1))
(tx 0.0 (-fl tx (*fl ($ m c i j)
($ m c j r )) )) )
( (>=fx j r)
($! m c i r
(/fl (+fl ($ m c i r ) tx)
($ m c i i)) ) ) ))
)
)
(define (elms argv)
(cond ( (<fx (length argv) 2) 2000)
( (string->number (cadr argv)) (string->number (cadr argv)) )
(else 2000)))
(define (main argv)
(let* ( (r (elms argv)) (c (+fx r 1)) (m (solvit (make-system r)
r) ) )
(do ( (i 0 (+fx i 1))) ( (>=fx i (min r 10)) )
(printf " %4.3f " ($ m c i r) ) )
(newline) (display "Bigloo time= ") (display (clock))
(newline) (print "Number of swaps= " iii)) )
> The point is that I read an article by Moura et al. whose title is "A
> functional language to implement the divide-and-conquer delaunay
> triangulation algorithm" (Applied mathematics and computation, 2005,
> vol. 168, no1, pp. 178-191). The author works with OCAML. Then I
> decided to give OCAML a try. However, OCAML turned out to be 20 times
> slower than Bigloo or Stalin.
If you are comparing compiled Ocaml code to compiled scheme code and
the Ocmal executable is 20 times slower, you are doing *something*
wrong.
> I believe that it arises from my lack of
> experience with OCAML.
Agreed.
> I used Bigarray, since I am dealing with more
> than 2000000 elements (much more). I am posting a small benchmark that
> shows the same behavior as the original programs, i.e., Scheme is 20
> times faster than OCAML. I would be happy if one could tell me what I
> did wrong. I am working with OCAML 3.10.2 since the new OCAML 3.11.0
> is not available for Windows.
3.10.2 is fine.
> Here is how I compile OCAML
>
>>ocamlopt bigarray.cmxa -ccopt -O3 -unsafe matone.ml -o matone.exe
I really don't think the "-ccopt -O3" will do anything here.
> Here is the OCAML program:
I've looked at this code and it looks like C code transliterated
into Ocaml. Idiomatic Ocaml code rarely uses references and rarely
uses for loops.
I don't have time to look at this in detail, but I will point out
a few things.
> open Bigarray;;
>
> let make_system r=
> let c= r+1 in
> let m= Array1.create float64 c_layout (r * c) in
> let xx= ref 0.0 and s= ref 0.0 in
> for i= 0 to r-1 do
> s := 0.0;
> for j=0 to r-1 do
> xx := Random.float 100.0;
> s := s.contents +. xx.contents;
> Array1.set m (i*c+j) xx.contents
> done;
> Array1.set m (i*c+r) s.contents
> done;
> m
> ;;
Most Ocaml programmers use "!var" to access the data in a reference
rather than var.contents.
In the above, you have xx as a float ref, but the float isn't used as
a reference. Remove the "let xx = ref 0.0" and have the following in
the inner loop:
for j=0 to r-1 do
let xx = Random.float in
s := s.contents +. xx;
Array1.set m (i*c+j) xx
done;
and similar throughout the program. This removes a number of pointer
de-references and makes the code far easier for the compile to
optimize.
> let swapit m c k l=
> let t = ref 0.0 in
> iii := iii.contents + 1;
> for j=0 to c-1 do
> t := Array1.get m (k*c+j);
> Array1.set m (k*c+j) (Array1.get m (l*c+j));
> Array1.set m (l*c+j) t.contents
> done
> ;;
Surely this swapit can be done with the blit function of
the Bigarray module.
Also it looks like you are trying to manipulate a 2 D array
(ie Matrix) but use storage for a 1 D array. Why not use the
Array2 module in the Bigarray module? Then if that is a row
swap operation, there is probably a much easier way to do
it.
>
> let find_max m c k idx=
> let i= ref idx in
> for l= k+1 to c-2 do
> if (abs_float (Array1.get m (l*c+k))) >
> (abs_float (Array1.get m (i.contents * c + k))) then
> i := l
> done;
> if i.contents <> k then swapit m c k i.contents
> ;;
You are repeating work here dereferenceing the array multiple. times
Instead of just storing the array index, store the value as well so
that the for loop becomes:
for l= k+1 to c-2 do
let test_value = abs_float (Array1.get m (l*c+k)) in
if test_value > stored_value then
( i := l ;
stored_value := test_value ;
) ;
done
Looking back over what I have written, it seems that part of the
problem is due to the Bigarray module. This module was a hack to
get around limitation on memory size on 32 bit systems.
If you are able to get access to a 64 bit system you will almost
certainly be able to use regular arrays instead of Bigarrays. I'm
pretty sure regular arrays are easier for the Ocaml compile to
optimize. You also get the advantage of the extra registers on
x86_64 which the Ocaml compiler makes good use of.
HTH,
Erik
--
-----------------------------------------------------------------
Erik de Castro Lopo
-----------------------------------------------------------------
The National Multiple Sclerosis Society of America recently started an
advertising campaign with the slogan "MS: It's not a software company".
Seasoned IT professionals will have no trouble telling the two MS's
apart. One is a debilitating and surprisingly widespread affliction
that renders the sufferer barely able to perform the simplest task.
The other is a disease.
(* ocamlopt bigarray.cmxa -unsafe -ccopt -O3 matone.ml -o matone.exe
*)
open Bigarray;;
type farray = (float, float64_elt, c_layout) Array1.t
let make_system r=
let c= r+1 in
let m= Array1.create float64 c_layout (r * c) in
let s= ref 0.0 in
for i= 0 to r-1 do
s := 0.0;
for j=0 to r-1 do
let xx = Random.float 100.0 in
( s := s.contents +. xx;
m.{i*c+j} <- xx)
done;
m.{i*c+r} <- s.contents
done;
m
;;
let iii= ref 0;;
let swapit (m:farray) c k l=
let t = ref 0.0 in
iii := iii.contents + 1;
for j=0 to c-1 do
t := Array1.get m (k*c+j);
m.{k*c+j} <- m.{l*c+j};
m.{l*c+j} <- t.contents
done
;;
let find_max (m:farray) c k idx=
let i= ref idx in
for l= k+1 to c-2 do
if (abs_float m.{l*c+k}) >
(abs_float m.{i.contents * c + k}) then
i := l
done;
if i.contents <> k then swapit m c k i.contents
;;
let solvit (mm: farray) r c=
let rat = ref 0.0 and tx= ref 0.0 and mkk= ref 0.0 in
for k=0 to r-2 do
find_max mm c k k;
mkk := mm.{c*k + k};
for i= k+1 to r-1 do
rat := mm.{i*c + k} /. mkk.contents;
for j=k to c-1 do
mm.{i*c+j} <- ( mm.{i*c+j} -.
rat.contents *. mm.{k*c + j})
done
done
done;
for i=r-1 downto 0 do
tx := 0.0;
for j=i+1 to r-1 do
tx := tx.contents -. mm.{i*c + j} *. mm.{j*c + r}
done;
mm.{i*c+r} <- (mm.{i*c + r} +. tx.contents) /.
mm.{i*c + i}
done
;;
let nels () =
try
int_of_string Sys.argv.(1)
with
_ -> 5000
;;
let r= nels () in
let c= (r+1) in
let mm= make_system r in
(
solvit mm r c;
for i= 0 to (min (r-1) 10) do
Printf.printf "%8.3f " mm.{i*c+r}
Indeed it will not. I would also advise against using -unsafe. In
particular, it provides no speedup here.
>> Here is the OCAML program:
>
> I've looked at this code and it looks like C code transliterated
> into Ocaml. Idiomatic Ocaml code rarely uses references and rarely
> uses for loops.
References and for loops are ideal for numerical computations like this
because it is the only way to get ocamlopt to do some optimizations.
However, they must not be overused (which is what you are alluding to, I
believe).
> Looking back over what I have written, it seems that part of the
> problem is due to the Bigarray module. This module was a hack to
> get around limitation on memory size on 32 bit systems.
Yes. By far the biggest problem is the dereferencing of polymorphic
bigarrays, which incurs a generic run-time lookup in OCaml that is very
slow compared to ordinary array lookup.
Simply adding type annotations reduces the time taken from 46s to 7.1s
> If you are able to get access to a 64 bit system you will almost
> certainly be able to use regular arrays instead of Bigarrays. I'm
> pretty sure regular arrays are easier for the Ocaml compile to
> optimize. You also get the advantage of the extra registers on
> x86_64 which the Ocaml compiler makes good use of.
Indeed. Using 64-bit instead and converting the bigarrays to ordinary arrays
reduces the time from 7.1s to 3.3s, which is 14x faster than the original.
Incidentally, switching to 64-bit gives a substantial performance
improvement in many languages.
Regarding the remaining optimizations, there is an OCaml Journal article
that describes the process of porting and optimizing the SciMark2 benchmark
to OCaml and that includes very similar code to this:
http://www.ffconsultancy.com/products/ocaml_journal/?u
--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
http://www.ffconsultancy.com/?u
> I received many suggestions to improve my OCAML program. The most
> important one was type declaration for the array; this helps the
> compiler inline array references. In any case, now the OCAML code is
> only three times slower than the Bigloo code. I hope to receive more
> suggestions, in order to improve the OCAML code until it is as fast as
> the Bigloo code. Here is the new OCAML code, after adding type
> declarations to all array arguments:
>
>
> (* ocamlopt bigarray.cmxa -unsafe -ccopt -O3 matone.ml -o matone.exe
> *)
As mentioned elsewhere `-ccopt -O3' is pointless, ocaml doesn't
compile through C. Secondly with 3.10 every (big)array access is
bounds checked, with 3.11 you can avoid that with either
Bigarray.ArrayN.unsafe_[gs]et or -unsafe option. Furthermore ocaml and
bigloo compiled code prints different things when invoked with, say,
./matone 500, are you sure that those two things are equivalent at all?
[..snip..]
--
mailto:av1...@comtv.ru
The array used in the benchmark is randomly generated. This means that
theoretically one language could get a slightly more favorable array
than the other, an array that does not require too many swaps.
However, I ran the benchmar many times, and noticed that both
languages get about the same number of swaps. In any case, after
reading your article, I required Bigloo to check array bonds. Its
speed dropped 50%. You are right, with array checks, Bigloo slows down
to something closer to the OCAML speed. Now Bigloo is less than two
times faster than OCAML. BTW, porting the program and the compiler to
a 64 bit machine does speed up OCAML, but it speeds up Bigloo too.
No array checks:
D:\stalin>bigloo -Obench -O3 matb.scm -o matb.exe
0
D:\stalin>matb 2000
1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Bigloo time= 14093
Number of swaps= 1992
With array checks:
D:\stalin>bigloo -O3 matb.scm -o matb.exe
libdir:(. d:\bigloo\lib\bigloo\3.1b) - /d/bigloo/lib/bigloo/3.1b
0
D:\stalin>matb 2000
1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Bigloo time= 26843
Number of swaps= 1987
Ocaml 3.10.2 (3.11.0 still not available for Windows):
D:\stalin\ocaml>ocamlopt bigarray.cmxa matone.ml -o matone.exe
D:\stalin\ocaml>matone 2000
1.000 1.000 1.000 1.000 1.000 1.000 1.000
1.000 1.000
1.000 1.000
Number of swaps= 1993
Ocaml Time= 48.093000
D:\stalin\ocaml>
> On 9 dez, 12:27, m...@pulsesoft.com wrote:
>> phi50...@yahoo.ca writes:
[..snip..]
What stops your from building it yourself?
> D:\stalin\ocaml>ocamlopt bigarray.cmxa matone.ml -o matone.exe
>
> D:\stalin\ocaml>matone 2000
> 1.000 1.000 1.000 1.000 1.000 1.000 1.000
> 1.000 1.000
> 1.000 1.000
> Number of swaps= 1993
>
> Ocaml Time= 48.093000
>
> D:\stalin\ocaml>
>
>
--
mailto:av1...@comtv.ru
I tried to build it myself, of course, but failed. Maybe you (or
somebody else) could tell me what I did wrong. Here are the steps that
I followed:
1 -- I edited d:/msys/1.0/etc/profile adding: export FLEXLINKFLAGS="-
nocygpath -L /d/msys/MinGW/lib"
Since I am like Skinner's doves, I also tried all combinations of
paths
and / \
2 -- The configure step ended without much ado. However, there was a
message saying that labltk would not be support. I have tk inside
MinGW, but it was not able to find it. Maybe you can tell me why.
3 -- make world ended with an error:
libcamlrun.a(main.o):main.c:(.text+0x26): undefined reference to
`caml_expand_command_line'
libcamlrun.a(sys.o):sys.c:(.text+0x4ea): undefined reference to
`caml_win32_random_seed'
libcamlrun.a(startup.o):startup.c:(.text+0x936): undefined reference
to `caml_signal_thread'
libcamlrun.a(signals_byt.o):signals_byt.c:(.text+0x46): undefined
reference to `caml_win32_signal'
libcamlrun.a(signals_byt.o):signals_byt.c:(.text+0xab): undefined
reference to `caml_win32_signal'
collect2: ld returned 1 exit status
make[2]: *** [ocamlrun] Error 1
make[2]: Leaving directory `/home/ocaml-3.11.0/byterun'
make[1]: *** [coldstart] Error 2
make[1]: Leaving directory `/home/ocaml-3.11.0'
make: *** [world] Error 2
Any hints?
> On 9 dez, 14:00, m...@pulsesoft.com wrote:
>> phi50...@yahoo.ca writes:
>> > On 9 dez, 12:27, m...@pulsesoft.com wrote:
>> >> phi50...@yahoo.ca writes:
>>
>> [..snip..]
>>
>>
>>
[..snip..]
> I tried to build it myself, of course, but failed. Maybe you (or
> somebody else) could tell me what I did wrong. Here are the steps that
> I followed:
I don't believe i've ever built OCaml with mingw. So, unfortunatelly,
can not help you there. However people who contributed this method of
getting OCaml work on Windows frequent the OCaml mailing list which
you can write to asking for guidance.
I think you have four options here:
a. Wait for someone more knowledgable to answer you here
b. Ask on OCaml ML
c. Try to build it with Cygwin
d. Try to build it with MS tools (Compiler/assembler/linker are part of
freely available DDK)
[..snip..]
--
mailto:av1...@comtv.ru