Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

Small ray-tracer for Chicken Scheme

254 views
Skip to first unread message

WJ

unread,
Apr 4, 2015, 11:46:35 PM4/4/15
to
[ Can anyone explain why Chicken is so slow?
The Racket version runs 17 times as fast as this.
]



;; Ray tracer for Chicken Scheme.
;; Creates graphic file "junk-trash.ppm".
;; Converted from the original in C at
;; https://wiki.cs.auckland.ac.nz/enggen131/index.php/User:Dols008
;;
;; Compile with
;; csc -optimize-level 3 ray-trace.scm

(use extras)
(use srfi-99) ;; records (must install egg)
(use srfi-42) ;; do-ec (must install egg)


;; Output image dimensions in pixels; the image is square.
(define-constant Width 400)
(define-constant Height Width)

(print Width " x " Height )
;; Wall-clock start time; used at the end to report total render time.
(define Start-Time (current-seconds))

;; Return n multiplied by itself.
(define (square n)
  (* n n))

;; Element-wise sum of two numeric lists.
(define (add list1 list2)
  (map (lambda (a b) (+ a b)) list1 list2))

;; Element-wise difference of two numeric lists (list1 - list2).
(define (sub list1 list2)
  (map (lambda (a b) (- a b)) list1 list2))

;; Multiply every element of seq by the scalar n.
(define (scale seq n)
  (map (lambda (x) (* x n)) seq))

;; Element-wise (Hadamard) product of two numeric lists.
(define (mul list1 list2)
  (map (lambda (a b) (* a b)) list1 list2))

;; Inner (dot) product of two numeric lists; stops at the shorter list.
;; Rewritten as an accumulating loop: the original
;; (apply + (mul list1 list2)) allocated one intermediate list per
;; call, and dot is called many times per pixel in the hot ray loop.
(define (dot list1 list2)
  (let loop ((a list1) (b list2) (acc 0))
    (if (or (null? a) (null? b))
        acc
        (loop (cdr a) (cdr b) (+ acc (* (car a) (car b)))))))

;; Squared Euclidean length of the numeric list v, i.e. v . v.
(define (squared-length v)
  (apply + (map * v v)))

;; Return v scaled to unit length.
;; NOTE(review): divides by zero for a zero-length vector — the
;; callers here appear never to pass one, but confirm before reuse.
(define (normal v)
  (let ((len (sqrt (apply + (map * v v)))))
    (map (lambda (c) (/ c len)) v)))


;; SRFI-99 records; #t #t auto-names the constructor and predicate.
;; Ray: origin pos and direction dir.  The parenthesized fields get
;; mutators (Ray-dir-set! is used to reuse one ray object per pixel).
(define-record-type Ray #t #t (pos) (dir))

;; Point light: position and RGB color, both 3-element lists.
(define-record-type Light #t #t pos color)

;; Sphere: center pos, radius, RGB color, specular shine weight,
;; and reflect weight for the bounced ray's color contribution.
(define-record-type Sphere #t #t
pos radius color shine reflect)

;; Distance along ray to its first intersection with sphere,
;; or 0 if the ray misses.  Geometric method: project the vector to
;; the sphere's center onto the ray direction, then compare the
;; perpendicular offset against the radius.
(define (ray-hit-sphere sphere ray)
  (let* ((to-center (sub (Sphere-pos sphere) (Ray-pos ray)))
         (proj (dot to-center (Ray-dir ray)))
         (nearest (add (Ray-pos ray) (scale (Ray-dir ray) proj)))
         (offset (sub nearest (Sphere-pos sphere)))
         (offset-sq (squared-length offset))
         (radius-sq (square (Sphere-radius sphere))))
    (if (> offset-sq radius-sq)
        0
        (- proj (sqrt (- radius-sq offset-sq))))))



;; Blinn-Phong shading for one light: diffuse term from the surface
;; normal, plus a specular highlight using the half-vector between
;; the directions to the eye and to the light (exponent 64).
;; Returns an RGB contribution list.
(define (calc-lighting pos norm ray sphere light)
  (let* ((to-light (normal (sub (Light-pos light) pos)))
         (lambert (max (dot to-light norm) 0))
         (diffuse-col (scale (Light-color light) lambert))
         (to-eye (sub (Ray-pos ray) pos))
         (half-vec (normal (add to-eye to-light)))
         (highlight (expt (max (dot half-vec norm) 0) 64))
         (spec-col (scale (Light-color light) highlight)))
    (add (mul (Sphere-color sphere) diffuse-col)
         (scale spec-col (Sphere-shine sphere)))))



;; Scene storage: fixed-size global vectors, populated by build-scene.
(define NUM_SPHERES 7)
(define NUM_LIGHTS 3)
(define spheres (make-vector NUM_SPHERES ))
(define lights (make-vector NUM_LIGHTS ))



;; Populate the global spheres and lights vectors.
(define (build-scene)
  ;; Spheres 0-4: small red spheres along an arc (theta in radians).
  (do ((i 0 (+ i 1)))
      ((= i 5))
    (let* ((theta (* 0.4 (- i 2)))
           (pos (list (* 3 (sin theta)) (* -3 (cos theta)) 5.0)))
      (vector-set! spheres i
                   (make-Sphere pos 1 '(0.8 0.1 0.1) 0.2 0))))
  ;; Spheres 5-6: two large, fully reflective spheres.
  (vector-set! spheres 5
               (make-Sphere '(-3 1 5) 2 '(1 1 0.99) 0.5 1.0))
  (vector-set! spheres 6
               (make-Sphere '(3 -3 15) 8 '(0.75 0.5 0) 0.5 1.0))
  ;; One white light plus two dim green side lights.
  (vector-set! lights 0 (make-Light '(2 2 1) '(1 1 1)))
  (vector-set! lights 1 (make-Light '(-4 0 5) '(0.1 0.5 0.1)))
  (vector-set! lights 2 (make-Light '(4 0 5) '(0.1 0.5 0.1))))



;; Maximum number of reflection bounces traced per primary ray.
(define MAX_RECURSION_DEPTH 2)


;; Trace ray through the scene and return its color as an (r g b)
;; list of intensities (not yet clamped to [0,1]).  depth counts
;; reflection bounces; recursion stops at MAX_RECURSION_DEPTH.
(define (trace ray depth )
;; hit: index of the nearest intersected sphere, or #f if none.
;; dist: distance along the ray to that intersection.
(let ((hit #f)
(color (list 0 0 0))
(dist 0))
;; Find the closest sphere in front of the ray (d > 0 means a hit).
(do-ec (:range i NUM_SPHERES)
(let ((d (ray-hit-sphere (vector-ref spheres i) ray)))
(when (> d 0)
(when (or (not hit) (< d dist))
(set! dist d)
(set! hit i)))))
(if hit
;; pos: intersection point; norm: unit surface normal there.
(let* ((pos (add (Ray-pos ray) (scale (Ray-dir ray) dist)))
(norm (normal (sub pos
(Sphere-pos (vector-ref spheres hit))))))
;; Accumulate the shading contribution of every light.
(do-ec (:range i NUM_LIGHTS)
(set! color (add color
(calc-lighting pos norm ray (vector-ref spheres hit)
(vector-ref lights i)))))
;; Recurse along the mirror-reflected direction
;; d - 2(d.n)n and blend the result in, weighted by the
;; sphere's reflectivity.
(when (< depth MAX_RECURSION_DEPTH)
(let ((reflect-ray (make-Ray pos
(sub (Ray-dir ray)
(scale norm (* 2 (dot (Ray-dir ray) norm)))))))
(set! color
(add color
(scale (trace reflect-ray (+ 1 depth))
(Sphere-reflect (vector-ref spheres hit)))))))
color)
;; Miss: dim gradient background derived from the ray direction.
(scale (add (Ray-dir ray) '(1 1 1)) 0.125))))


;; Shared primary ray (its direction is overwritten for each pixel),
;; the current pixel color, and the image accumulator: pixels are
;; consed on, so the finished list is in reverse scanline order.
(define r (make-Ray '(0 0 0) '(0 0 0)))
(define color '(0 0 0))
(define image '())

(build-scene)

(print "-----Rendering------")


;; Render loop: for every pixel, aim the shared ray r through the
;; image plane at z = 0.5, trace it, clamp each channel to at most 1,
;; and prepend the pixel to image.
(do-ec (:range y Height)
(begin
(when (zero? (modulo y (floor (+ 1 (/ Height 20)))))
(display "#")) ; Progress indicator.
(do-ec (:range x Width)
(begin
;; Direction through pixel (x, y); the y term is negated so the
;; image is not rendered upside down.
(Ray-dir-set! r
(normal
(list (- (/ x Width) 0.5)
(- (- (/ y Height) 0.5))
0.5)))
(set! color (trace r 0))
;; Clamp channels to 1 before the later scaling to bytes.
(set! color
(map (lambda (c) (min c 1)) color))
(set! image (cons color image))))))

(print)
(print (- (current-seconds) Start-Time) " seconds")

;; Write the image as a binary PPM (P6): text header, then three
;; bytes per pixel.  image was built back-to-front, so reverse it.
(let ((port (open-output-file "junk-trash.ppm" #:binary)))
(for-each
(cut display <> port)
(list "P6\n" Width " " Height "\n255\n"))
(do-ec (:list trio (reverse image))
(for-each
(cut write-byte <> port)
;; Scale each channel from [0,1] to an integer byte 0..255.
(map (lambda (x) (inexact->exact (round (* x 255.0)))) trio)))
(close-output-port port))

Daniel Leslie

unread,
Apr 5, 2015, 11:42:06 PM4/5/15
to
I ran it through perf.

Looks like you're really putting Chicken's GC through its paces, about half the cycles are spent on marking things for GC. My guess is that you're blowing through the stack.

Your trace function isn't tail-recursive and performs a slew of temporary allocations that would stay in scope. Chicken's GC isn't so great at being intelligent about cleaning things up that won't be touched again, despite staying in scope, and runs all of its allocations in the stack. So it's probably spending a lot of time shuffling stuff out of the stack and into the heap to make up for the absence of stuff falling out of scope.

That's just my late-evening, post-easter-wine guess. :)

-Dan

Daniel Leslie

unread,
Apr 5, 2015, 11:44:17 PM4/5/15
to
> Chicken's GC isn't so great at being intelligent about cleaning things up that won't be touched again, despite staying in scope, and runs all of its allocations in the stack.

Er, this should have been written, "Chicken's GC isn't so great at being intelligent about cleaning things up that won't be touched again if they don't fall out of scope."

;)

-Dan

WJ

unread,
Apr 6, 2015, 12:36:05 AM4/6/15
to
Daniel Leslie wrote:


> I ran it through perf.
>
> Looks like you're really putting Chicken's GC through its
> paces, about half the cycles are spent on marking things for
> GC. My guess is that you're blowing through the stack.
>
> Your trace function isn't tail-recursive and performs a slew
> of temporary allocations that would stay in scope. Chicken's
> GC isn't so great at being intelligent about cleaning things
> up that won't be touched again,

Thanks for the feedback.

Even though trace isn't tail-recursive,
MAX_RECURSION_DEPTH is set to a low value.
I tried setting it even lower and got these
timings:

;; 95 seconds
;; (define MAX_RECURSION_DEPTH 2)
;; 73 seconds
;; (define MAX_RECURSION_DEPTH 1)
;; 67 seconds
(define MAX_RECURSION_DEPTH 0)

On the same machine, the Racket version runs in about
5.3 seconds with MAX_RECURSION_DEPTH set to 2.
Interpreted Gauche runs its version in about 64 seconds
(MAX_RECURSION_DEPTH 2).

The Larceny benchmarks that you linked to earlier show
interpreted Gauche beating compiled Chicken at several
tasks:

http://www.larcenists.org/benchmarksGenuineR7Linux.html

I can't help but conclude that the implementors of Chicken
don't know much about generating fast code.

This is very disappointing.

jeffrey.ma...@gmail.com

unread,
Apr 6, 2015, 9:30:37 AM4/6/15
to
This appears to be a variant of a benchmark that was discussed about a decade ago:

http://www.ffconsultancy.com/languages/ray_tracer/

From that page:

The fastest languages are C++ and OCaml in 64-bit and C++ and Stalin-compiled Scheme in 32-bit.

In order to achieve competitive run-time performance, the Stalin Scheme compiler clearly does an incredible job of optimising unspecialised code and can even beat all of the other implementations in this benchmark with only a few low-level optimisations.

That page does not reference the 64 bit version of Stalin, which the following does:

http://comp.lang.scheme.narkive.com/mLuRFCAD/64-bit-stalin-and-the-ray-tracer

From that page:

Once again, Stalin-compiled Scheme is one of the fastest languages (between
C++ and OCaml).

John Cowan

unread,
Apr 7, 2015, 7:55:12 AM4/7/15
to
On Monday, April 6, 2015 at 12:36:05 AM UTC-4, WJ wrote:

> The Larceny benchmarks that you linked to earlier show
> interpreted Gauche beating compiled Chicken at several
> tasks:
>
> http://www.larcenists.org/benchmarksGenuineR7Linux.html
>
> I can't help but conclude that the implementors of Chicken
> don't know much about generating fast code.

Chicken's R7RS mode currently requires full numeric tower
support, which is not yet optimized. Chicken can do a much
better job in its default fixnum-flonum-only mode. (Note
that the full tower is not required by R7RS-small; see
<http://trac.sacrideo.us/wg/wiki/NumericTower>.)

In addition, small vectors will probably run faster than
small lists on essentially all Schemes.

--
John Cowan http://www.ccil.org/~cowan co...@ccil.org
At times of peril or dubitation,
Perform swift circular ambulation,
With loud and high-pitched ululation.

cesu...@gmail.com

unread,
Apr 7, 2015, 10:41:18 AM4/7/15
to
John Cowan wrote:
> In addition, small vectors will probably run faster than
> small lists on essentially all Schemes.

That's not true. When access is sequential, as in the code
WJ posted, lists will probably be faster than vectors until
their size is large enough for caching and garbage collection
to matter.

For Chicken's R7RS mode, sequential access on lists of length
less than 100 is faster than exactly the same access pattern
on vectors. On the machine I use for benchmarking, for lists
of length 3 as in WJ's code, list access is about three times
as fast as vector access.

For Chibi as well, lists of length less than 100 are faster
than vectors, although the difference becomes smaller as the
length increases.

For Larceny and Gauche, sequential access on lists of length
less than 8 is faster than vectors of the same length. For
Petit Larceny, the breakeven length is 5.

Kawa was the only system I benchmarked in which vectors gave
faster sequential access than lists.

You'd expect lists of length 3 might take longer to allocate
than vectors of length 3, and that was true in Larceny, but
there was very little difference in Chicken, Kawa, Gauche,
and Petit Larceny. In Chibi, allocating lists of length 3
was twice as fast as allocating vectors of the same length.

Will

WJ

unread,
Apr 7, 2015, 11:48:51 AM4/7/15
to
John Cowan wrote:

> > http://www.larcenists.org/benchmarksGenuineR7Linux.html
> >
> > I can't help but conclude that the implementors of Chicken
> > don't know much about generating fast code.
>
> Chicken's R7RS mode currently requires full numeric tower
> support, which is not yet optimized. Chicken can do a much
> better job in its default fixnum-flonum-only mode.

So far as I know, I'm not using Chicken in R7RS mode;
at least, I'm making no attempt to do so.

I added this to the ray-tracer source:

(define (fact n) (if (< n 2) 1 (* n (fact (- n 1)))))
(print (fact 22))

The output was:

1.12400072777761e+021

So it's not using the full numeric tower.

The Larceny benchmarks alluded to earlier show Gauche
beating Chicken in

geometricMean
deriv:10000000
destruc:600:50:4000
destruc:600:50:4000
triangl:22:1:50
tak:40:20:11:1
fib:40:5
fibfp:35.0:10
sum:10000:200000
sumfp:1000000.0:500
fft:65536:100
mbrot:75:1000
mbrotZ:75:1000
nucleic:50
pi:50:500:50:2
pnpoly:1000000
ray:50
simplex:1000000
ack:3:12:2
array1:1000000:500
string:500000:25
sum1:25
cat:50
wc:inputs/bib:50
read1:2500
maze:20:7:10000
mazefun:11:11:10000
nqueens:13:10
paraffins:23:10
quicksort:10000:2500
nboyer:5:1
perm20:10:2:1
bv2string:1000:1000:100

And Racket runs the ray-tracer 17 times as fast as
Chicken.

Either the Chicken implementors don't care about speed
or they don't know how to achieve speed.

--
"Infant Innocence" by A. E. Housman

The Grizzly Bear is huge and wild; He has devoured the infant child.
The infant child is not aware He has been eaten by the bear.

Stephen Eilert

unread,
Apr 7, 2015, 12:35:05 PM4/7/15
to
On Tuesday, April 7, 2015 at 12:48:51 PM UTC-3, WJ wrote:

> And Racket runs the ray-tracer 17 times as fast as
> Chicken.
>
> Either the Chicken implementors don't care about speed
> or they don't know how to achieve speed.

That may or may not be the case. My guess is that it is a pathological case for the specific GC implementation used by Chicken. There's an ongoing discussion.

You, on the other hand, either don't know or don't care how to achieve politeness.


-- Stephen

jeffrey.ma...@gmail.com

unread,
Apr 7, 2015, 5:40:01 PM4/7/15
to
Raw benchmark numbers are only part of the answer. The more interesting question is why? Which the benchmark numbers alone don't answer. One needs analysis for that. Do you have an explanation for why accessing vectors is slower than lists? I'm not interested in the answer for interpreters, only compilers that generate native code, either ahead of time, or jit (after factoring out the jit cost), perhaps by way of intermediate languages of any form (C, LLVM, .NET, JVM, ...).

cesu...@gmail.com

unread,
Apr 7, 2015, 8:44:33 PM4/7/15
to
jeffrey.ma...@gmail.com wrote:

> Raw benchmark numbers are only part of the answer. The more
> interesting question is why?

Agreed.

> Do you have an explanation for why accessing vectors is slower
> than lists?

Yes.

> I'm not interested in the answer for interpreters, only compilers
> that generate native code, either ahead of time, or jit (after
> factoring out the jit cost), perhaps by way of intermediate
> languages of any form (C, LLVM, .NET, JVM, ...).

Then you're interested in Larceny, Petit Larceny, Chicken, and Kawa.
Larceny JIT-compiles directly to native code; the others compile to
C or JVM code, which are then compiled to native code. With the
possible exception of Kawa, my benchmarks excluded compile time.

All four generate safe code, all four are geared toward separate
compilation of libraries, and all four support generic arithmetic.
Safety and generic arithmetic are relevant here.

Sequential traversal of a list involves, for each list element:

* one null check
* one pair check
* two load instructions

(Some of the less optimizing compilers might perform two pair checks
per list element instead of just one. I know Larceny performs just
one.)

Sequential traversal of a vector involves, for each vector element:

* one fixnum check on the vector index
* one comparison of the vector index against the vector size
* one load instruction
* one addition (to compute the next index)
* one overflow check on the result of that addition

(Some of the less optimizing compilers might perform additional
operations per vector element, such as checking to make sure the
vector is still a vector, fetching the vector size on each access,
checking to make sure the incremented vector index is non-negative,
or checking to determine the index being incremented is a fixnum
rather than a bignum or (in the case of Chicken) a flonum.)

Furthermore, the vector traversal has more startup cost than the
list traversal because you have to check that the vector is indeed
a vector and fetch its length. That, together with the cache and
gc considerations, explains why there's a crossover point in Larceny
and Petit Larceny.

The pair check probably costs exactly the same as the fixnum check,
and the null check should cost no more than the overflow check (and
might cost less), so the question becomes whether the second
(dependent) load costs more than the comparison and addition.

If all loads hit the L1 (or even L2) cache, and the allocation rate
is substantially lower than the traversal rate, then sequential
traversal of short lists should be very competitive with sequential
traversal of short vectors. That hypothesis has been supported by
benchmarking.

In unsafe code, vectors would probably be faster across the board.

Will

jeffrey.ma...@gmail.com

unread,
Apr 7, 2015, 9:09:35 PM4/7/15
to
> > Do you have an explanation for why accessing vectors is slower
> > than lists?
>
> Yes.

Interesting analysis, Thanks!

> Safety and generic arithmetic are relevant here.
> In unsafe code, vectors would probably be faster across the board.

The metalevel conclusion is that there are several different factors that impact the question of whether lists or vectors are faster, among them choice of language, choice of implementation, and choice of potential options for control of safety. Some implementations might do various kinds of inference to eliminate some of the type checks and perhaps even the bounds checks. Some languages and language design choices might facilitate or hinder the ability to do such. In some situations, people might care less about safety than performance. Or vice versa. Ditto for generic arithmetic. One must be careful about giving generic answers without the conditions under which they hold.

Daniel Leslie

unread,
Apr 9, 2015, 12:48:55 PM4/9/15
to
You can be fairly certain of Kawa excluding compilation time by running the tests at least once before running them again to test for performance.

-Dan

cesu...@gmail.com

unread,
Apr 9, 2015, 5:51:33 PM4/9/15
to
Good point. (Daniel's suggestion would delay the measurements
until after Java HotSpot(TM) VM's mixed mode has decided it is
worthwhile to use the JIT-compiler.)

Adding to my earlier explanation of why short lists are often
faster than short vectors, and illustrating some points made
by Jeffrey, here is the source code I benchmarked:

;; Return #t iff x is an element of the proper list y, compared
;; with eq?; #f otherwise.  Tail-recursive linear search.
(define (search-list x y)
(cond ((null? y)
#f)
((eq? x (car y))
#t)
(else
(search-list x (cdr y)))))

;; Return #t iff x is an element of the vector y, compared with eq?;
;; #f otherwise.  Indexed linear search via a named let.
(define (search-vector x y)
(let ((n (vector-length y)))
(let loop ((i 0))
(cond ((= i n)
#f)
((eq? x (vector-ref y i))
#t)
(else
(loop (+ i 1)))))))

In Larceny, the loop within search-list turns into the following
x86 machine code, to which I have added comments showing slightly
higher-level virtual machine code from which the x86 code was then
generated. (The check pseudo-instruction branches to an exception
sequence if the preceding safety check comes out false.)

00000018 80FA0A cmp dl,0xa ; reg 2 ; y
; op2 null?
0000001B 7506 jnz 0x23 ; branchf 0x23
0000001D BB02000000 mov ebx,0x2 ; const #f
00000022 C3 ret ; return

00000023 8D4207 lea eax,[edx+0x7] ; reg 2 ; y
00000026 A807 test al,0x7 ; op1 pair?
00000028 751C jnz 0x46 ; check 0x46,(2 0 0)
0000002A 8B7AFF mov edi,[edx-0x1] ; reg 2 ; y
; op1 car:pair
0000002D 39F9 cmp ecx,edi ; op2 eq?,1
0000002F 7506 jnz 0x37 ; branchf 0x37
00000031 BB06000000 mov ebx,0x6 ; const #t
00000036 C3 ret ; return

00000037 8B5203 mov edx,[edx+0x3] ; reg 2 ; y
; op2 cdr:pair
; setreg 2
0000003A FF4D08 dec dword [ebp+0x8] ; branch 0x18
0000003D 75D9 jnz 0x18
0000003F FF5518 call dword [ebp+0x18] ; (timer exception)

In the common case, that inner loop executes 11 machine instructions.
The inner loop for search-vector normally executes 20 instructions:

00000018 8B5D48 mov ebx,[ebp+0x48] ; reg 4 ; i
0000001B 39FB cmp ebx,edi ; op2 =:fix:fix,3
0000001D 7506 jnz 0x25 ; branchf 0x25
0000001F BB02000000 mov ebx,0x2 ; const #f
00000024 C3 ret ; return

00000025 8B5AFD mov ebx,[edx-0x3] ; reg 2 ; y
00000028 C1EB08 shr ebx,byte 0x8 ; op1 vector-length:vec
0000002B 895D7C mov [ebp+0x7c],ebx ; setreg 17
0000002E 8B5D48 mov ebx,[ebp+0x48] ; reg 4 ; i
00000031 3B5D7C cmp ebx,[ebp+0x7c] ; op2 <:fix:fix,17
00000034 7D5B jnl 0x91 ; check 0x91,(4 2 0)
00000036 8B4548 mov eax,[ebp+0x48] ; reg 4 ; i
; setreg eax ; TEMP
00000039 8B5C0201 mov ebx,[edx+eax+0x1] ; reg 2 ; y
; op2 vector-ref:trusted,eax
0000003D 895D7C mov [ebp+0x7c],ebx ; setreg 17
00000040 3B4D7C cmp ecx,[ebp+0x7c] ; reg 1 ; x
; op2 eq?,17
00000043 7506 jnz 0x4b ; branchf 0x4b
00000045 BB06000000 mov ebx,0x6 ; const #t
0000004A C3 ret ; return

0000004B 8B5D48 mov ebx,[ebp+0x48] ; reg 4 ; i
0000004E 8D5B04 lea ebx,[ebx+0x4] ; op2imm +:imm:idx,1
00000051 895D7C mov [ebp+0x7c],ebx ; setreg 17
00000054 895D48 mov [ebp+0x48],ebx ; setreg 4
00000057 FF4D08 dec dword [ebp+0x8] ; branch 0x18
0000005A 75BC jnz 0x18
0000005C FF5518 call dword [ebp+0x18] ; (timer exception)

Even though (vector-length y) is computed outside of the loop,
and remains available in register 3 (x86 register edi), Larceny
is recomputing that length every time through the loop, at the
cost of three machine instructions. Other inefficiencies can be
seen in both inner loops, including the cost of using only the
IA32 architecture's limited register set. If Larceny's compiler
were using the larger x64 register set, both loops should run
faster.

In my opinion, the main reason to use an optimizing compiler
is that it lets you focus on your programming problem instead
of obsessing about the micro-efficiency of your code. As the
machine code above reveals, there isn't much a programmer could
do to make these inner loops run faster. To make them faster,
you'd have to use a better compiler or sacrifice some safety.

Will
0 new messages