Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

fasta bench

12 views
Skip to first unread message

MelissA

unread,
Apr 22, 2012, 5:49:32 PM4/22/12
to
This time I wrote fasta benchmark, and stole
some code from compilers ;)
For example division by constant implemented
by shift and multiply and loop that is faster
if iterates whole range instead of partial
range with conditional exit ;)

Implemented from this file:
http://shootout.alioth.debian.org/u64q/program.php?test=fasta&lang=gcc&id=4

Results on q6600 @ 2.4GHz (gfortran is fastest this time)

[bmaxa@bmaxa fasta]$ time ./fasta 25000000 > /dev/null

real 0m4.654s
user 0m4.648s
sys 0m0.003s
[bmaxa@bmaxa fasta]$ time ./fastac 25000000 > /dev/null

real 0m4.974s
user 0m4.956s
sys 0m0.015s
[bmaxa@bmaxa fasta]$ time ./fastaf 25000000 > /dev/null

real 0m4.537s
user 0m4.519s
sys 0m0.015s
[bmaxa@bmaxa fasta]$ time java fasta 25000000 > /dev/null

real 0m5.359s
user 0m5.346s
sys 0m0.024s
[bmaxa@bmaxa fasta]$ time java fasta 25000000 > /dev/null

real 0m5.336s
user 0m5.356s
sys 0m0.016s


Program follows:

; fasta.asm
format ELF64 executable 3

STDIN equ 0
STDOUT equ 1
STDERR equ 2
WIDTH equ 60
BUFFSIZ equ 16384

struc aminoacid{
.p dd ?
.c db ?
.filler db 3 dup(?)
}

macro sys_exit err
{
mov eax,60
mov rdi,err
syscall
}

macro sys_write fd, buf, size
{
mov eax,1
mov rdi, fd
mov rsi, buf
mov rdx, size
syscall
}

macro rand max{
IM = 139968
IA = 3877
IC = 29573
; r12 hold multiply value
xor rdx,rdx
mov rax,r15
imul rax,IA
add rax, IC
mov rsi,rax
mul r12
shr rdx,15
imul rdx,IM
sub rsi,rdx
mov r15,rsi
cvtsi2ss xmm0,rsi
mulss xmm0,[RCP]
}

virtual at 0
amino aminoacid
end virtual

entry main

segment executable
main:
cmp qword[rsp],2
jl .begin
mov rdi,[rsp+16] ; argv[1]
call a2qw
mov [count],rax
.begin:
mov rbx,iub
mov rcx,iublen
call acumulate_probabilities
mov rbx,homosapiens
mov rcx,homosapienslen
call acumulate_probabilities

sys_write STDOUT,hsalu,hsalusz
mov rax, alusz
mov rbx, alu
mov rdx,[count]
imul rdx,2
call repeat_fasta

sys_write STDOUT,IUB,IUBsz
mov rbx, iub
mov rcx, iublen
mov rdx, [count]
imul rdx,3
call random_fasta

sys_write STDOUT,hsf,hsfsz
mov rbx, homosapiens
mov rcx, homosapienslen
mov rdx, [count]
imul rdx,5
call random_fasta
sys_exit 0

; rcx len, rbx aminoacid buffer
acumulate_probabilities:
pxor xmm0,xmm0
.L0:
addss xmm0,[rbx+amino.p]
movss [rbx+amino.p],xmm0
add rbx,8
dec rcx
jnz .L0
ret

; rax buffer size, rbx input buffer, rdx count
repeat_fasta:
sub rsp,BUFFSIZ*2
mov rsi,rbx
mov r8, rax
mov r9,BUFFSIZ
cld
mov rcx,WIDTH
mov rdi,rsp
.L0:
movsb
dec rdx
jz .print_exit
dec rcx
jz .L3
.L00:
dec r9
jz .print
.L1:
dec r8
jnz .L0
mov rsi,rbx
mov r8,rax
jmp .L0
.L2:
add rsp,BUFFSIZ*2
ret
.L3:
mov byte[rdi],0xa
inc rdi
mov rcx,WIDTH
dec r9
jz .print1
jmp .L00
.print1:
mov r10, BUFFSIZ+1
jmp .print2
.print:
mov r10, BUFFSIZ
.print2:
mov rbp,rsp
push rax rdi rsi rcx rdx
sys_write STDOUT,rbp,r10
pop rdx rcx rsi rdi rax
mov r9, BUFFSIZ
mov rdi,rsp
jmp .L1
.print_exit:
mov byte[rdi],0xa
mov r8,rdi
sub r8,rsp
inc r8
sys_write STDOUT,rsp,r8
jmp .L2

; rbx input buffer, rcx buflen, rdx count
random_fasta:
mov r14,rcx
sub rsp,BUFFSIZ*2
mov rcx,WIDTH
mov r12,4318579316753219217
mov r15,[last]
mov rdi,rsp
mov r8,BUFFSIZ
movss xmm1,[rbx]
.L0:
push rdx
rand 1.0
pop rdx
mov rax,1
xor r13,r13
mov r10,r14
.L1: ; this is faster than simply iterate until found ?
comiss xmm0,[rbx+rax*8+amino.p-8]
lea r11,[r13+1]
cmovae r13,r11
inc rax
dec r10
jnz .L1
.L2:
lea rsi,[rbx+r13*8+amino.c]
movsb
dec rdx
jz .print_exit
dec rcx
jz .L4
.L22:
dec r8
jz .print
jmp .L0
.L3:
mov [last],r15
add rsp,BUFFSIZ*2
ret
.L4:
mov byte[rdi],0xa
inc rdi
mov rcx, WIDTH
dec r8
jz .print1
jmp .L22
.print1:
mov r9,BUFFSIZ+1
jmp .print2
.print:
mov r9,BUFFSIZ
.print2:
mov r10,rsp
push rax rcx rdi rsi rdx
sys_write STDOUT,r10,r9
pop rdx rsi rdi rcx rax
mov r8,BUFFSIZ
mov rdi,rsp
jmp .L0
.print_exit:
mov byte[rdi],0xa
mov r8,rdi
sub r8,rsp
inc r8
sys_write STDOUT,rsp,r8
jmp .L3
; rax qw to convert, rdi output buf returns rcx bufsz
qw2a:
lea r10 , [rsp-1]
xor rcx,rcx
mov r8,10
.L0:
xor rdx,rdx
div r8
mov r9b,[table+rdx]
mov [r10],r9b
dec r10
inc rcx
test rax,rax
jnz .L0

lea rsi,[r10+1]
cld
.L1:
movsb
cmp rsi,rsp
jne .L1

ret
; rdi input buf (null terminated), return rax digit
a2qw:
cld
xor eax,eax
mov rcx,20
repne scasb
sub rdi,2
inc rcx
mov rdx,1
.L0:
xor ebx,ebx
mov bl,byte [rdi]
sub bl,0x30
imul rbx,rdx
add rax, rbx
imul rdx,10
dec rdi
inc rcx
cmp rcx,20
jne .L0
ret

segment readable writeable

iub dd 0.27
db 'a',0,0,0

dd 0.12
db 'c',0,0,0

dd 0.12
db 'g',0,0,0

dd 0.27
db 't',0,0,0

dd 0.02
db 'B',0,0,0

dd 0.02
db 'D',0,0,0

dd 0.02
db 'H',0,0,0

dd 0.02
db 'K',0,0,0

dd 0.02
db 'M',0,0,0

dd 0.02
db 'N',0,0,0

dd 0.02
db 'R',0,0,0

dd 0.02
db 'S',0,0,0

dd 0.02
db 'V',0,0,0

dd 0.02
db 'W',0,0,0

dd 0.02
db 'Y',0,0,0
iublen = ($-iub) / 8

homosapiens dd 0.3029549426680
db 'a' ,0,0,0
dd 0.1979883004921
db 'c' ,0,0,0
dd 0.1975473066391
db 'g' ,0,0,0
dd 0.3015094502008
db 't' ,0,0,0
homosapienslen = ($-homosapiens) / 8

hsalu db '>ONE Homo sapiens alu',0xa
hsalusz = $-hsalu
IUB db '>TWO IUB ambiguity codes',0xa
IUBsz = $-IUB
hsf db '>THREE Homo sapiens frequency',0xa
hsfsz = $-hsf
table db '0123456789'
alu db 'GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG'
db 'GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA'
db 'CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT'
db 'ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA'
db 'GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG'
db 'AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC'
db 'AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA'
alusz = $-alu

align 8
count dq 1
last dq 42
RCP dd 921680564; 1/139968

0 new messages