gforth-fast on RISC-V, using these definitions:
: exch1 over @ over @ >r swap ! r> swap ! ;
: exch2 over @ over @ swap rot ! swap ! ;
: exch3 dup @ 2 pick @ rot ! swap ! ;
Here is what SEE-CODE produces for them (it sometimes guesses the
wrong word name, but the native code shown is correct):
EXCH1: EXCH2: EXCH3:
over over dup
ld s1,$8(s8) ld s1,$8(s8) #0
addi s10,s10,8 addi s10,s10,8 sd s7,$0(s8)
@ @ ld s7,$0(s7)
ld s1,$0(s1) ld s1,$0(s1) addi s8,s8,-8
addi s10,s10,8 addi s10,s10,8 addi s10,s10,10
over over third
mv s3,s7 mv s3,s7 ld a5,$10(s8)
addi s10,s10,8 addi s10,s10,8 addi s10,s10,8
@ @ addi s8,s8,-8
ld s3,$0(s3) ld s3,$0(s3) sd s7,$8(s8)
addi s10,s10,8 addi s10,s10,8 mv s7,a5
>r swap noop
addi s2,s2,-8 mv a5,s1 addi s8,s8,8
addi s10,s10,8 addi s10,s10,8 mv s1,s7
sd s3,$0(s2) mv s1,s3 ld s7,$0(s8)
swap mv s3,a5 ld s1,$0(s1)
addi s8,s8,8 rot addi s10,s10,8
mv s3,s7 mv a5,s7 rot
ld s7,$0(s8) addi s10,s10,8 ld s3,$8(s8)
addi s10,s10,8 mv s7,s1 addi s10,s10,8
! mv s1,s3 addi s8,s8,8
sd s1,$0(s3) mv s3,a5 !
addi s10,s10,8 ! sd s1,$0(s3)
r> sd s1,$0(s3) addi s10,s10,8
ld s1,$0(s2) addi s10,s10,8 swap
addi s10,s10,8 swap ld s1,$8(s8)
addi s2,s2,8 ld s1,$8(s8) addi s10,s10,8
swap addi s10,s10,8 addi s8,s8,8
addi s8,s8,8 addi s8,s8,8 !
mv s3,s7 ! sd s7,$0(s1)
ld s7,$0(s8) sd s7,$0(s1) addi s10,s10,8
addi s10,s10,8 addi s10,s10,8 noop
! noop ld s7,$8(s8)
sd s1,$0(s3) ld s7,$8(s8) addi s8,s8,8
addi s10,s10,8 addi s8,s8,8 ld a6,$0(s2)
;s ld a6,$0(s2) addi s2,s2,8
ld a6,$0(s2) addi s2,s2,8 addi s10,a6,$8
addi s2,s2,8 addi s10,a6,$8 ld a4,$-8(s10)
addi s10,a6,$8 ld a4,$-8(s10) jr a4
ld a4,$-8(s10) jr a4
jr a4
84 bytes 80 bytes 88 bytes
Interesting: as far as the native code is concerned, all three have
the same number of instructions (31), even though their sizes in
bytes differ slightly.
Now Aarch64:
EXCH1: EXCH2: EXCH3:
noop over noop
str x27, [x25],#-0x8 ldr x21, [x25,#0x8] str x27, [x25],#-0x8
ldr x27, [x25,#0x10] add x26, x26, #0x8 ldr x27, [x25,#0x8]
add x26, x26, #0x8 @ add x26, x26, #0x8
@ ldr x21, [x21,#0x0] @
ldr x27, [x27,#0x0] add x26, x26, #0x8 ldr x27, [x27,#0x0]
add x26, x26, #0x8 over add x26, x26, #0x8
over mov x24, x27 third
ldr x21, [x25,#0x8] add x26, x26, #0x8 mov x0, x25
add x26, x26, #0x8 @ sub x25, x25, #0x8
@ ldr x24, [x24,#0x0] add x26, x26, #0x8
ldr x21, [x21,#0x0] add x26, x26, #0x8 ldr x0, [x0,#0x10]
add x26, x26, #0x8 swap str x27, [x25,#0x8]
>r str x27, [x25],#-0x8 mov x27, x0
sub x22, x22, #0x8 add x26, x26, #0x8 @
add x26, x26, #0x8 mov x27, x24 ldr x27, [x27,#0x0]
str x21, [x22,#0x0] rot add x26, x26, #0x8
swap ldr x24, [x25,#0x8]! rot
ldr x21, [x25,#0x8]! add x26, x26, #0x8 mov x0, x25
add x26, x26, #0x8 ! mov x21, x27
! add x26, x26, #0x8 add x25, x25, #0x10
add x26, x26, #0x8 str x21, [x24,#0x0] add x26, x26, #0x8
str x27, [x21,#0x0] swap ldp x27, x24, [x0,#0x8]
r> ldr x21, [x25,#0x8]! !
ldr x27, [x22],#0x8 add x26, x26, #0x8 add x26, x26, #0x8
add x26, x26, #0x8 ! str x21, [x24,#0x0]
swap add x26, x26, #0x8 swap
ldr x21, [x25,#0x8]! str x27, [x21,#0x0] ldr x21, [x25,#0x8]!
add x26, x26, #0x8 noop add x26, x26, #0x8
! mov x0, x25 !
add x26, x26, #0x8 add x25, x25, #0x8 add x26, x26, #0x8
str x27, [x21,#0x0] ldr x27, [x0,#0x8] str x27, [x21,#0x0]
noop ldr x26, [x22],#0x8 noop
mov x0, x25 add x26, x26, #0x8 mov x0, x25
add x25, x25, #0x8 ldur x1, [x26,#-0x8] add x25, x25, #0x8
ldr x27, [x0,#0x8] br x1 ldr x27, [x0,#0x8]
ldr x26, [x22],#0x8 ldr x26, [x22],#0x8
add x26, x26, #0x8 add x26, x26, #0x8
ldur x1, [x26,#-0x8] ldur x1, [x26,#-0x8]
br x1 br x1
116 bytes 104 bytes 124 bytes
Stack caching works well on these two architectures for EXCH2,
because we have many variants of OVER, @, SWAP, ROT, and !. It works
less well for EXCH3, because we have only one variant of THIRD: using
THIRD means that the TOS is in a register and the rest of the stack
is in memory immediately before and after THIRD. All the code around
it has to live with this constraint and, as a result, is also longer,
in particular the ROT.
Now AMD64 (where stack caching does not work so well):
EXCH1: EXCH2: EXCH3:
noop noop dup
mov [r14],rbx mov [r14],rbx #0
sub r14,$08 sub r14,$08 sub r14,$08
mov rbx,$10[r14] mov rbx,$10[r14] mov $08[r14],rbx
add r15,$08 add r15,$08 mov rbx,[rbx]
@ @ add r15,$10
mov rbx,[rbx] mov rbx,[rbx] third
add r15,$08 add r15,$08 mov rax,$10[r14]
noop noop sub r14,$08
mov [r14],rbx mov [r14],rbx add r15,$08
sub r14,$08 sub r14,$08 mov $08[r14],rbx
mov rbx,$10[r14] mov rbx,$10[r14] mov rbx,rax
add r15,$08 add r15,$08 @
@ @ mov rbx,[rbx]
mov rbx,[rbx] mov rbx,[rbx] add r15,$08
add r15,$08 add r15,$08 rot
>r swap mov rdx,$08[r14]
add r14,$08 mov rax,$08[r14] mov rax,$10[r14]
sub r13,$08 add r15,$08 mov $08[r14],rbx
mov $00[r13],rbx mov $08[r14],rbx add r15,$08
add r15,$08 mov rbx,rax mov $10[r14],rdx
mov rbx,[r14] rot mov rbx,rax
swap mov rdx,$08[r14] !
mov rax,$08[r14] mov rax,$10[r14] mov rax,$08[r14]
add r15,$08 mov $08[r14],rbx add r14,$10
mov $08[r14],rbx add r15,$08 add r15,$08
mov rbx,rax mov $10[r14],rdx mov [rbx],rax
! mov rbx,rax mov rbx,[r14]
mov rax,$08[r14] ! swap
add r14,$10 mov rax,$08[r14] mov rax,$08[r14]
add r15,$08 add r14,$10 add r15,$08
mov [rbx],rax add r15,$08 mov $08[r14],rbx
mov rbx,[r14] mov [rbx],rax mov rbx,rax
noop mov rbx,[r14] !
mov [r14],rbx swap #0
sub r14,$08 mov rax,$08[r14] mov rax,$08[r14]
mov rbx,$00[r13] add r15,$08 add r14,$10
add r15,$08 mov $08[r14],rbx add r13,$08
add r13,$08 mov rbx,rax mov [rbx],rax
swap ! mov r10,-$08[r13]
mov rax,$08[r14] #0 mov rbx,[r14]
add r15,$08 mov rax,$08[r14] lea r15,$08[r10]
mov $08[r14],rbx add r14,$10 mov rcx,-$08[r15]
mov rbx,rax add r13,$08 jmp ecx
! mov [rbx],rax
#0 mov r10,-$08[r13]
mov rax,$08[r14] mov rbx,[r14]
add r14,$10 lea r15,$08[r10]
add r13,$08 mov rcx,-$08[r15]
mov [rbx],rax jmp ecx
mov r10,-$08[r13]
mov rbx,[r14]
lea r15,$08[r10]
mov rcx,-$08[r15]
jmp ecx
162 bytes 147 bytes 129 bytes
Let's look at some sophisticated compilers:
VFX64:
EXCH1 EXCH2 EXCH3
MOV RDX, [RBP] MOV RDX, [RBP] MOV RDX, 0 [RBX]
MOV RDX, 0 [RDX] MOV RDX, 0 [RDX] MOV RCX, [RBP]
MOV RCX, 0 [RBX] MOV RCX, 0 [RBX] MOV RCX, 0 [RCX]
PUSH RCX MOV 0 [RBX], RDX MOV 0 [RBX], RCX
MOV 0 [RBX], RDX MOV RBX, [RBP] MOV RBX, [RBP]
POP RBX MOV 0 [RBX], RCX MOV 0 [RBX], RDX
MOV RDX, [RBP] MOV RBX, [RBP+08] MOV RBX, [RBP+08]
MOV 0 [RDX], RBX LEA RBP, [RBP+10] LEA RBP, [RBP+10]
MOV RBX, [RBP+08] RET/NEXT RET/NEXT
LEA RBP, [RBP+10] 29 bytes 29 bytes
RET/NEXT
31 bytes
EXCH1 suffers from VFX not analysing the return stack: the >R ... R>
pair is compiled as a real PUSH/POP.
lxf:
EXCH1: EXCH2: EXCH3:
mov eax , [ebp] mov eax , [ebp] mov eax , ebx
mov eax , [eax] mov eax , [eax] mov eax , [eax]
mov ecx , ebx mov ecx , ebx mov ecx , [ebp]
mov ecx , [ecx] mov ecx , [ecx] mov ecx , [ecx]
mov [ebx] , eax mov [ebx] , eax mov [ebx] , ecx
mov ebx , [ebp] mov ebx , [ebp] mov ebx , [ebp]
mov [ebx] , ecx mov [ebx] , ecx mov [ebx] , eax
mov ebx , [ebp+4h] mov ebx , [ebp+4h] mov ebx , [ebp+4h]
lea ebp , [ebp+8h] lea ebp , [ebp+8h] lea ebp , [ebp+8h]
ret near ret near ret near
There is no return stack overhead here, but lxf emits a reg-reg MOV
that VFX avoids.
iForth:
EXCH1: EXCH2: EXCH3:
pop rbx pop rbx mov rbx, [rsp] qword
pop rdi pop rdi push [rbx] qword
mov rax, [rdi] qword mov rax, [rbx] qword mov rbx, [rsp #16 +] qword
mov rdx, [rbx] qword mov rdx, [rdi] qword pop rdi
mov [ebx] dword, rax mov [ebx] dword, rdx pop rax
mov [edi] dword, rdx mov [edi] dword, rax mov rdx, [rbx] qword
; ; mov [eax] dword, rdx
pop rbx
mov [ebx] dword, rdi
;
iForth does not keep the TOS in a register across word boundaries and
uses RSP as the data stack pointer. Apparently it implements 2 PICK
by first dumping the whole stack into memory.
- anton
--
M. Anton Ertl
http://www.complang.tuwien.ac.at/anton/home.html
comp.lang.forth FAQs:
http://www.complang.tuwien.ac.at/forth/faq/toc.html
New standard:
http://www.forth200x.org/forth200x.html
EuroForth 2021:
https://euro.theforth.net/2021