[Caml-list] We should all be forking

Jon Harrop

unread,

Jun 5, 2007, 2:21:28 PM6/5/07

to Caml List

Following on from our "why not fork" discussion, here is my most elegant
concurrent ray tracer written in vanilla OCaml.

This program simply runs four processes (forking off three) in parallel to
improve performance when 2-4 CPUs are available (increase "d" if you have >4
CPUs).

This isn't as sophisticated as the JoCaml version but it does double
performance (taking the time down from 4s to 2s) on my dual-core machine,
making it faster than any language that uses a concurrent GC. I don't know
about you guys, but this is a complete revelation for me!

I believe the performance relies upon the Linux kernel lazily copying the
process. Does OSX also do that?

Anyway, here is the code:

let delta = sqrt epsilon_float

type vec = {x:float; y:float; z:float}
let ( *| ) s r = {x = s *. r.x; y = s *. r.y; z = s *. r.z}
let ( +| ) a b = {x = a.x +. b.x; y = a.y +. b.y; z = a.z +. b.z}
let ( -| ) a b = {x = a.x -. b.x; y = a.y -. b.y; z = a.z -. b.z}
let dot a b = a.x *. b.x +. a.y *. b.y +. a.z *. b.z
let length r = sqrt(dot r r)
let unitise r = 1. /. length r *| r

type scene =
Sphere of vec * float
| Group of vec * float * scene * scene * scene * scene * scene

let ray_sphere {x=dx; y=dy; z=dz} {x=vx; y=vy; z=vz} r =
let disc = vx *. vx +. vy *. vy +. vz *. vz -. r *. r in
if disc < 0. then infinity else
let b = vx *. dx +. vy *. dy +. vz *. dz in
let b2 = b *. b in
if b2 < disc then infinity else
let disc = sqrt(b2 -. disc) in
let t1 = b -. disc in
if t1 > 0. then t1 else b +. disc

let ray_sphere' {x=ox; y=oy; z=oz} {x=dx; y=dy; z=dz} {x=cx; y=cy; z=cz} r =
let vx = cx -. ox and vy = cy -. oy and vz = cz -. oz in
let vv = vx *. vx +. vy *. vy +. vz *. vz in
let b = vx *. dx +. vy *. dy +. vz *. dz in
let disc = b *. b -. vv +. r *. r in
disc >= 0. && b +. sqrt disc >= 0.

type hit = {l: float; nx: float; ny: float; nz: float}

let rec intersect ({x=dx; y=dy; z=dz} as dir) hit = function
Sphere ({x=cx; y=cy; z=cz} as center, radius) ->
let l' = ray_sphere dir center radius in
if l' >= hit.l then hit else
let x = l' *. dx -. cx in
let y = l' *. dy -. cy in
let z = l' *. dz -. cz in
let il = 1. /. sqrt(x *. x +. y *. y +. z *. z) in
{l = l'; nx = il *. x; ny = il *. y; nz = il *. z}
| Group (center, radius, a, b, c, d, e) ->
let l' = ray_sphere dir center radius in
if l' >= hit.l then hit else
let f h s = intersect dir h s in
f (f (f (f (f hit a) b) c) d) e

let rec intersect' orig dir = function
Sphere (center, radius) -> ray_sphere' orig dir center radius
| Group (center, radius, a, b, c, d, e) ->
let f s = intersect' orig dir s in
ray_sphere' orig dir center radius && (f a || f b || f c || f d || f e)

let neg_light = unitise { x = 1.; y = 3.; z = -2. }

let rec ray_trace dir scene =
let hit = intersect dir {l=infinity; nx=0.; ny=0.; nz=0.} scene in
if hit.l = infinity then 0. else
let n = {x = hit.nx; y = hit.ny; z = hit.nz} in
let g = dot n neg_light in
if g < 0. then 0. else
if intersect' (hit.l *| dir +| delta *| n) neg_light scene then 0. else
g

let fold5 f x a b c d e = f (f (f (f (f x a) b) c) d) e

let rec create level c r =
let obj = Sphere (c, r) in
if level = 1 then obj else
let a = 3. *. r /. sqrt 12. in
let rec bound (c, r) = function
Sphere (c', r') -> c, max r (length (c -| c') +. r')
| Group (_, _, v, w, x, y, z) -> fold5 bound (c, r) v w x y z in
let aux x' z' = create (level - 1) (c +| {x=x'; y=a; z=z'}) (0.5 *. r) in
let w = aux (-.a) (-.a) and x = aux a (-.a) in
let y = aux (-.a) a and z = aux a a in
let c, r = fold5 bound (c +| {x=0.; y=r; z=0.}, 0.) obj w x y z in
Group (c, r, obj, w, x, y, z)

let level, n =
try int_of_string Sys.argv.(1), int_of_string Sys.argv.(2) with _ -> 9, 512

let scene = create level { x = 0.; y = -1.; z = 4. } 1. and ss = 4

module String = struct
include String

let init n f =
if n=0 then "" else
let s = String.make n (f 0) in
for i=1 to n-1 do
s.[i] <- f i
done;
s
end

let raster y =
y, String.init n
(fun x ->
let g = ref 0. in
for dx = 0 to ss - 1 do
for dy = 0 to ss - 1 do
let aux x d = float x -. float n /. 2. +. float d /. float ss in
let dir = unitise {x = aux x dx; y = aux y dy; z = float n } in
g := !g +. ray_trace dir scene
done
done;
let g = 0.5 +. 255. *. !g /. float (ss*ss) in
char_of_int (int_of_float g))

let rasters d i =
Array.init ((n+d-i)/d) (fun j -> raster(d*j+i))

let invoke (f : 'a -> 'b) x : unit -> 'b =
let input, output = Unix.pipe() in
match Unix.fork() with
| 0 ->
Unix.close input;
let output = Unix.out_channel_of_descr output in
Marshal.to_channel output (try `Res(f x) with e -> `Exn e) [];
exit 0
| _ ->
Unix.close output;
let input = Unix.in_channel_of_descr input in
fun () ->
match Marshal.from_channel input with
| `Res x -> x
| `Exn e -> raise e

let ( |> ) x f = f x

let map (f : 'a -> 'b) a : 'b array =
let n = Array.length a in
let aux i x =
if i<n-1 then invoke f x else
let f_x = f x in
fun () -> f_x in
Array.mapi aux a |>
Array.map (fun f -> f())

let () =
let d = 4 in
let sort a = Array.sort compare a; a in
let image =
Array.init d (fun i -> i) |>
map (rasters d) |>
Array.to_list |>
Array.concat |>
sort |>
Array.map snd in
Printf.printf "P5\n%d %d\n255\n" n n;
for y = n - 1 downto 0 do
print_string image.(y)
done

--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
OCaml for Scientists
http://www.ffconsultancy.com/products/ocaml_for_scientists/?e

_______________________________________________
Caml-list mailing list. Subscription management:
http://yquem.inria.fr/cgi-bin/mailman/listinfo/caml-list
Archives: http://caml.inria.fr
Beginner's list: http://groups.yahoo.com/group/ocaml_beginners
Bug reports: http://caml.inria.fr/bin/caml-bugs

Jonathan Bryant

unread,

Jun 5, 2007, 2:28:18 PM6/5/07

to Jon Harrop, Caml List

>
> I believe the performance relies upon the Linux kernel lazily
> copying the
> process. Does OSX also do that?

The man page under OSX doesn't mention anything about copy-on-write.
That doesn't necessarily mean anything, though...

--Jonathan

Joel Reymont

unread,

Jun 5, 2007, 2:33:38 PM6/5/07

to Jon Harrop, Caml List

On Jun 5, 2007, at 7:13 PM, Jon Harrop wrote:

> This isn't as sophisticated as the JoCaml version but it does double
> performance (taking the time down from 4s to 2s) on my dual-core
> machine,

I must have missed the JoCaml version. Where is it?

Thanks, Joel

--
http://topdog.cc - EasyLanguage to C# translator
http://wagerlabs.com - Blog

Jon Harrop

unread,

Jun 5, 2007, 2:54:09 PM6/5/07

to Joel Reymont, Caml List

On Tuesday 05 June 2007 19:32:25 Joel Reymont wrote:
> On Jun 5, 2007, at 7:13 PM, Jon Harrop wrote:
> > This isn't as sophisticated as the JoCaml version but it does double
> > performance (taking the time down from 4s to 2s) on my dual-core
> > machine,
>
> I must have missed the JoCaml version. Where is it?
>
> Thanks, Joel

http://jocaml.inria.fr/#examples

There is also a JoCaml mailing list for JoCaml-related ramblings, and it could
do with more rambling. ;-)

--
Dr Jon D Harrop, Flying Frog Consultancy Ltd.
OCaml for Scientists
http://www.ffconsultancy.com/products/ocaml_for_scientists/?e

_______________________________________________

Chris King

unread,

Jun 5, 2007, 3:14:28 PM6/5/07

to Jon Harrop, Caml List

On 6/5/07, Jon Harrop <j...@ffconsultancy.com> wrote:
> Following on from our "why not fork" discussion, here is my most elegant
> concurrent ray tracer written in vanilla OCaml.
>
> This program simply runs four processes (forking off three) in parallel to
> improve performance when 2-4 CPUs are available (increase "d" if you have >4
> CPUs).

I'm curious, have you considered trying camlp3l [1] to parallelize
your raytracer? It uses high-level constructs ("skeletons") to
describe parallelizable functions, e.g. for a raytracer you can
essentially just say "farm out each row to a different computing
node". It uses marshalling over TCP/IP sockets, so can be distributed
if you like. The forking you just do beforehand in a shell script
(though it gives an example where you do it in O'Caml too).

[1] http://camlp3l.inria.fr/

Joel Reymont

unread,

Jun 5, 2007, 3:21:32 PM6/5/07

to Chris King, Caml List

I'm gonna go bonkers soon. Suppose I know and quite like Erlang but
dislike its syntax and performance on non-IO things. I'm also quite
fond of OCaml.

Should I go with JoCaml? Camlp3l? Why?

Thanks, Joel

On Jun 5, 2007, at 8:13 PM, Chris King wrote:

> I'm curious, have you considered trying camlp3l [1] to parallelize
> your raytracer? It uses high-level constructs ("skeletons") to
> describe parallelizable functions, e.g. for a raytracer you can
> essentially just say "farm out each row to a different computing
> node".

--

http://topdog.cc - EasyLanguage to C# translator
http://wagerlabs.com - Blog

_______________________________________________

Erik de Castro Lopo

unread,

Jun 5, 2007, 5:04:49 PM6/5/07

to caml...@inria.fr

Joel Reymont wrote:

> I'm gonna go bonkers soon. Suppose I know and quite like Erlang but
> dislike its syntax and performance on non-IO things. I'm also quite
> fond of OCaml.

I'm very keen on Ocaml and started learning Erlang for the concurrency.
However, Erlang's syntax, dynamic typing and "everything runs in the
erl shell" thing started to annoy me.

So JoCaml is currently looking *really* good.

> Should I go with JoCaml? Camlp3l? Why?

I think we let Jon Harrop and others (maybe even me) thrash this out
on a number of examples and then enjoy the benefits:-).

Erik
--
-----------------------------------------------------------------
Erik de Castro Lopo
-----------------------------------------------------------------
"I do have a policy of mandatory evisceration where the crime of
whitespace in filenames is perpetrated on my systems. This has
inspired a new range of designer colostomy bag covers in corduroy,
velvet, and the ever popular denim."
-- jdub on the SLUG mailing list

Christopher Cramer

unread,

Jun 5, 2007, 6:32:15 PM6/5/07

to caml...@inria.fr

Jon Harrop:

> I believe the performance relies upon the Linux kernel lazily copying
> the process. Does OSX also do that?

It's called copy-on-write and I would be surprised if OSX didn't also
do it.

The only way to start a new process is to fork, so even if you're just
running another program you fork first, and then replace the process
image with the new program with exec. If the fork had to copy the entire
process image before just throwing it away upon exec, I think Unix,
which is based around a philosophy of piping between multiple processes,
would have abandoned fork a long time ago. Then again, there is vfork,
so I guess they almost did abandon it at one point.

This method doesn't work well at all on Windows, as I understand
it.

BTW your code looks a lot nicer than mine. The |> is brilliant, I think.
I'm a little surprised you cut the speed in half without using select.
And looking at your code I just realized I have a bug in mine...

Eric Cooper

unread,

Jun 5, 2007, 6:55:24 PM6/5/07

to caml...@inria.fr

On Tue, Jun 05, 2007 at 03:30:47PM -0700, Christopher Cramer wrote:
> If the fork had to copy the entire process image before just
> throwing it away upon exec, I think Unix, which is based around a
> philosophy of piping between multiple processes, would have
> abandoned fork a long time ago. Then again, there is vfork, so I
> guess they almost did abandon it at one point.

vfork was introduced in BSD Unix when support for page-based virtual
memory was added for the DEC VAX. There was an earlier Bell Labs
port of Unix to the VAX that just mimicked the PDP-11's segment
registers. In that scheme, I believe text segments were shared, so
that only the data segment had to be copied on fork, but copy-on-write
is the way to go on any architecture with page tables.

--
Eric Cooper e c c @ c m u . e d u

Brian Hurt

unread,

Jun 5, 2007, 7:54:42 PM6/5/07

to Jon Harrop, Caml List

On Tue, 5 Jun 2007, Jon Harrop wrote:

>
> Following on from our "why not fork" discussion, here is my most elegant
> concurrent ray tracer written in vanilla OCaml.
>
> This program simply runs four processes (forking off three) in parallel to
> improve performance when 2-4 CPUs are available (increase "d" if you have >4
> CPUs).
>
> This isn't as sophisticated as the JoCaml version but it does double
> performance (taking the time down from 4s to 2s) on my dual-core machine,
> making it faster than any language that uses a concurrent GC. I don't know
> about you guys, but this is a complete revelation for me!

This actually doesn't surprise me. Big linear algebra problems- like ray
tracing- are almost embarassingly parallel. This is why the supercomputer
market died a decade and a half ago- it was just too easy to take the
problems supercomputers were designed to handle and spread them out over
an cluster of workstations or pcs. What needs to be shared in the ray
tracing example? The model, obviously. And the results have to be
collected. And what subset of the problem (which collection of rays) need
to be worked on. I note that the big-iron database machine (aka
mainframe) market is alive and well. Some problems don't parallelize
quite so easily.

My point here is to caution against making too much soup from this lone
oyster. Simply because this program, and a large class of other programs
like it, parallelize easily, doesn't mean all will.

On the other hand, for programs that do parallelize this way, JoCaml looks
like a wonderful gift.

Brian

Oliver Bandel

unread,

Jun 6, 2007, 5:30:22 AM6/6/07

to caml...@inria.fr

On Tue, Jun 05, 2007 at 03:30:47PM -0700, Christopher Cramer wrote:

> Jon Harrop:
> > I believe the performance relies upon the Linux kernel lazily copying
> > the process. Does OSX also do that?
>
> It's called copy-on-write and I would be surprised if OSX didn't also
> do it.
>
> The only way to start a new process is to fork, so even if you're just
> running another program you fork first, and then replace the process
> image with the new program with exec. If the fork had to copy the entire
> process image before just throwing it away upon exec, I think Unix,
> which is based around a philosophy of piping between multiple processes,
> would have abandoned fork a long time ago. Then again, there is vfork,
> so I guess they almost did abandon it at one point.
>

[...]

vfork is only (!!!) for a fork-exec combination.

So, be aware: do not use vfork, if you don't exec right after it!

Ciao,
Oliver

Shawn W.

unread,

Jun 6, 2007, 1:51:18 PM6/6/07

to caml...@yquem.inria.fr

On Jun 6, 2007, at 2:28 AM, Oliver Bandel wrote:
>
> vfork is only (!!!) for a fork-exec combination.
>
> So, be aware: do not use vfork, if you don't exec right after it!
>

Don't use it even then! The only functions a vfork()ed child process
can call are an exec*() one or _exit() (And then I think only execv()
and execve() are safe). It can't return from the function vfork() was
called in. It can't safely modify variables.

In other words, it can't do any error handling to speak of if the
exec fails. Sure, it's faster than a fork() (Even on a OS that does
copy-on-right memory pages -- which most modern ones do) because you
don't copy process table structures.... but it's way too fragile when
something goes wrong. Which will happen, sooner or later.

--
Shawn W.
sha...@speakeasy.org

Richard Jones

unread,

Jun 7, 2007, 8:05:40 AM6/7/07

to Jon Harrop, Caml List

How close are we to real-time raytracing then? Given that 4 & even 8
core machines may become commonplace soon.

Rich.

--
Richard Jones
Red Hat

Luc Maranget

unread,

Jun 14, 2007, 6:11:09 AM6/14/07

to Richard Jones, Caml List

>
> How close are we to real-time raytracing then? Given that 4 & even 8
> core machines may become commonplace soon.
>
> Rich.
>
> --
> Richard Jones
> Red Hat

For your information there is now a more realistic ray tracer in JoCaml.
<http://jocaml.inria.fr/#examples>
Speedups can be impressive, but no real-time yet, even on
a 2x11x2Ghz grid.

The JoCaml ray tracer is a distributed/concurrent version of
the Camls'R Us entry at ICFP 2000 programming contest.

--
Luc Maranget