Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

thread ring benchmark (asm linux)

9 views
Skip to first unread message

MelissA

unread,
Apr 27, 2012, 2:55:51 PM4/27/12
to
This time I have tried to implement the thread-ring benchmark,
based on this program:
http://shootout.alioth.debian.org/u64q/program.php?test=threadring&lang=gpp&id=1

Program creates 503 threads which unlock each other 50000000 times.

When affinity is set to a single CPU the program of course runs faster,
but I'm interested whether someone knows what trick
functional languages use in order to be so much faster than
the conventional futex approach.

I have implemented signaling from thread to thread via futex.
It is faster than a mutex from the pthreads library, but
it is slower than the functional languages ;)

Compile the program with
fasm ring.asm
link with
gcc ring.o -o ring

The program follows:
;ring.asm
format ELF64

; Number of quadwords in the CPU affinity mask (see cpu_set_t).
NUMQ equ 4
; Size of the affinity mask in bytes, passed to the sched_*affinity syscalls.
CPUSETSIZE = NUMQ * 8
; Byte distance between the stack tops of consecutive ring threads.
STACKSIZE = 4096
; Number of threads (ring nodes) created by main.
NUMTHREADS = 503

; Operation codes placed in rsi for the futex syscall (number 202).
FUTEX_WAIT equ 0
FUTEX_WAKE equ 1
FUTEX_PRIVATE_FLAG equ 128

; CPU affinity mask: NUMQ uninitialized quadwords (CPUSETSIZE bytes in total).
struc cpu_set_t{
.bits dq NUMQ dup(?)
}

; One ring node: three dwords, 12 bytes in total (code indexes the
; `threads` array with a stride of 12).
;   .mutex   - futex word; 0 means the node may run, 1 means it must wait
;   .node_id - zero-based index of this node
;   .next_id - index of the node that receives the token next
struc Thread{
.mutex rd 1
.node_id rd 1
.next_id rd 1
}

; futex_acquire sema
; Blocks until the dword at address `sema` has been switched from 0 to 1
; by this thread. `sema` is a register or an address constant.
; After every failed attempt the thread sleeps in
; futex(FUTEX_WAIT | FUTEX_PRIVATE_FLAG) while the word still reads 1,
; then tries again.
; Clobbers: rax, rbx, rcx, rdx, rsi, rdi, r10, r11, r15, flags.
macro futex_acquire sema{
local .retry,.owned
        mov     r15, sema               ; r15 = futex address for the whole macro
.retry:
        xor     eax, eax                ; expected value: 0
        mov     ebx, 1                  ; replacement value: 1
        lock cmpxchg [r15], ebx
        jz      .owned                  ; ZF set: the 0 -> 1 swap happened
        mov     rdi, r15                ; uaddr
        mov     esi, FUTEX_WAIT or FUTEX_PRIVATE_FLAG
        mov     edx, 1                  ; sleep only if the word still equals 1
        xor     r10d, r10d              ; timeout = NULL
        mov     eax, 202                ; futex syscall number
        syscall
        jmp     .retry
.owned:
}

; futex_release sema
; Clears the dword at address `sema` to 0 and wakes at most one thread
; sleeping on it via futex(FUTEX_WAKE | FUTEX_PRIVATE_FLAG).
; `sema` is a register or an address constant. The address is copied into
; rdi before any other register is written, so the macro stays correct
; whichever register the caller passes (including rax, rsi or rdx).
; Clobbers: rax, rcx, rdx, rsi, rdi, r11, flags.
macro futex_release sema{
        mov     rdi, sema               ; uaddr, captured before anything is clobbered
        lock and dword [rdi], 0         ; atomically clear the futex word
        mov     esi, FUTEX_WAKE or FUTEX_PRIVATE_FLAG
        mov     edx, 1                  ; wake at most one waiter
        mov     eax, 202                ; futex syscall number
        syscall
}

; acquire sema
; Busy-waits until the dword at `sema` has been switched from 0 to 1.
; Never enters the kernel. Clobbers: eax, ebx, flags.
macro acquire sema{
local .spin
        mov     ebx, 1                  ; value stored on success
.spin:
        xor     eax, eax                ; expected value: 0
        lock cmpxchg [sema], ebx
        jnz     .spin                   ; ZF clear: the word was not 0, try again
}

; release sema
; Counterpart of `acquire`: atomically clears the dword at `sema` to 0.
; Performs no wake-up; modifies flags only.
macro release sema{
lock and dword[sema],0
}

; sys_sched_getaffinity set
; Issues the sched_getaffinity syscall with pid argument 0, a mask length of
; CPUSETSIZE bytes and `set` as the mask buffer. Result is left in rax.
; Clobbers: rax, rcx, rdx, rsi, rdi, r11.
SYS_SCHED_GETAFFINITY = 204
macro sys_sched_getaffinity set{
        mov     eax, SYS_SCHED_GETAFFINITY
        mov     esi, CPUSETSIZE         ; mask length in bytes
        xor     edi, edi                ; pid argument = 0
        mov     rdx, set                ; mask buffer
        syscall
}

; sys_sched_setaffinity set
; Issues the sched_setaffinity syscall with pid argument 0, a mask length of
; CPUSETSIZE bytes and `set` as the mask to apply. Result is left in rax.
; Clobbers: rax, rcx, rdx, rsi, rdi, r11.
SYS_SCHED_SETAFFINITY = 203
macro sys_sched_setaffinity set{
        mov     eax, SYS_SCHED_SETAFFINITY
        mov     esi, CPUSETSIZE         ; mask length in bytes
        xor     edi, edi                ; pid argument = 0
        mov     rdx, set                ; mask to apply
        syscall
}

; Flag bits for the clone syscall.
CLONE_VM equ 0x00000100
CLONE_FS equ 0x00000200
CLONE_FILES equ 0x00000400
CLONE_SIGHAND equ 0x00000800
CLONE_THREAD equ 0x00010000

; Flag combination used for every thread this program creates.
CLONE_RING_FLAGS = CLONE_VM or CLONE_FS or CLONE_FILES or CLONE_SIGHAND or CLONE_THREAD
SYS_CLONE = 56

; sys_clone stack
; Issues the clone syscall with CLONE_RING_FLAGS and `stack` as the new
; stack pointer. rax receives the syscall result; the caller distinguishes
; parent and child by testing it for zero.
; Clobbers: rax, rcx, rsi, rdi, r11.
macro sys_clone stack{
        mov     eax, SYS_CLONE
        mov     edi, CLONE_RING_FLAGS   ; 32-bit move zero-extends into rdi
        mov     rsi, stack              ; stack pointer for the new thread
        syscall
}

; Option bits for the wait4 syscall.
__WNOTHREAD equ 0x20000000
__WALL equ 0x40000000
__WCLONE equ 0x80000000

SYS_WAIT4 = 61

; sys_wait pid
; Issues the wait4 syscall for `pid` with option __WALL and zero for both
; the status and the rusage pointer. Result is left in rax.
; Clobbers: rax, rcx, rdx, rsi, rdi, r10, r11.
macro sys_wait pid{
        mov     eax, SYS_WAIT4
        mov     edi, pid
        xor     esi, esi                ; status pointer = 0
        mov     edx, __WALL
        xor     r10d, r10d              ; rusage pointer = 0
        syscall
}

; sys_gettid
; Issues the gettid syscall; the result is left in rax.
; Clobbers: rax, rcx, r11.
SYS_GETTID = 186
macro sys_gettid{
        mov     eax, SYS_GETTID
        syscall
}

; sys_exit err
; Issues the exit syscall with status `err` (an immediate or a 64-bit register).
; start_thread uses it to end a thread after its entry function returns.
SYS_EXIT = 60
macro sys_exit err{
        mov     eax, SYS_EXIT
        mov     rdi, err                ; exit status
        syscall
}

; sys_exit_group err
; Issues the exit_group syscall with status `err`
; (an immediate or a 64-bit register).
SYS_EXIT_GROUP = 231
macro sys_exit_group err{
        mov     eax, SYS_EXIT_GROUP
        mov     rdi, err                ; exit status
        syscall
}

; Time value made of two quadwords, initialised from the struc arguments;
; sys_nanosleep takes the address of such a record.
struc timespec tv_sec,tv_nsec{
.tv_sec dq tv_sec ; seconds
.tv_nsec dq tv_nsec ; nanoseconds (it must be in range 0 to 999999999)
}

; sys_nanosleep spec
; Issues the nanosleep syscall with `spec` as the requested-time pointer and
; zero for the second (remaining-time) pointer. Result is left in rax.
; Clobbers: rax, rcx, rsi, rdi, r11.
SYS_NANOSLEEP = 35
macro sys_nanosleep spec{
        mov     eax, SYS_NANOSLEEP
        mov     rdi, spec               ; requested time
        xor     esi, esi                ; second pointer = 0
        syscall
}

; Thread layout placed at address 0, so oThread.mutex, oThread.node_id and
; oThread.next_id evaluate to plain field offsets. No bytes are emitted.
virtual at 0
oThread Thread
end virtual

section '.text' executable align 16
public main
extrn printf
extrn atoi

;-----------------------------------------------------------------------
; int main(int argc, char **argv)
; ABI:  System V AMD64
; In:   edi = argc, rsi = argv
; Out:  eax = 0 on success, 1 if a thread could not be created
;
; Sets the number of token passes (argv[1] if given, otherwise 1000),
; pins the process to CPU 0, creates NUMTHREADS ring threads, starts the
; ring by releasing node 0 and then sleeps on `leavesema` until the
; thread that sees the token reach zero signals completion.
;
; Register roles: rbx = loop index, r12 = current Thread record.
; rbx and r15 are also overwritten by the futex_acquire macro, so all
; three callee-saved registers are saved here. The three pushes also
; bring rsp to a 16-byte boundary for the calls below.
;-----------------------------------------------------------------------
main:
        push    rbx
        push    r12
        push    r15

        mov     qword [token], 1000     ; default number of passes
        cmp     edi, 2                  ; argc is an int: compare 32 bits only
        jl      .begin
        mov     rdi, [rsi+8]            ; argv[1]
        call    atoi
        movsxd  rax, eax                ; atoi returns int; upper half of rax is undefined
        mov     [token], rax
.begin:
        ; setting affinity to single cpu speeds up program
        mov     qword [myset.bits], 1
        sys_sched_setaffinity myset

        xor     ebx, ebx                ; rbx = index of the node being created
.spawn:
        imul    r12, rbx, 12            ; 12 = size of one Thread record
        lea     r12, [threads+r12]      ; r12 = &threads[rbx]

        mov     rdi, r12
        mov     rsi, rbx
        call    init_ring_thread

        lea     rdx, [rbx+1]
        imul    rdx, rdx, STACKSIZE
        lea     rdx, [stacks+rdx]       ; top of this thread's stack
        mov     rsi, ring_thread
        mov     rdi, r12
        call    start_thread
        test    eax, eax
        js      .fail                   ; negative result: clone failed, the ring would hang

        inc     rbx
        cmp     rbx, NUMTHREADS
        jb      .spawn

        lea     rsi, [threads+oThread.mutex]
        futex_release rsi               ; let's start ring

        futex_acquire leavesema         ; wait for exit

        xor     eax, eax
.out:
        pop     r15
        pop     r12
        pop     rbx
        ret

.fail:
        mov     eax, 1
        jmp     .out

;-----------------------------------------------------------------------
; start_thread
; In:   rsi = function the new thread runs
;       rdi = argument handed to that function in rdi
;       rdx = top of the new thread's stack
; Out:  eax = result of the clone syscall (pid of the new thread)
;
; The argument and the function pointer are stored in the two top slots
; of the new stack; the child pops them right after clone returns 0.
; When the function returns, the child ends itself with sys_exit 0.
; Clobbers (parent): rax, rcx, rdx, rsi, rdi, r11.
;-----------------------------------------------------------------------
start_thread:
        mov     [rdx-8], rdi            ; slot 1: argument
        mov     [rdx-16], rsi           ; slot 0: function pointer
        sub     rdx, 16                 ; child starts with rsp at slot 0

        sys_clone rdx
        test    eax, eax
        jnz     .parent

        ; child: running on the new stack
        pop     rsi
        pop     rdi
        call    rsi
        sys_exit 0

.parent:
        ret

; Thread layout based at rdi, so rThread.mutex, rThread.node_id and
; rThread.next_id address the fields of the record rdi points to.
; No bytes are emitted.
virtual at rdi
rThread Thread
end virtual

;-----------------------------------------------------------------------
; init_ring_thread
; In:   rdi = pointer to a Thread record
;       esi = index of that record
; Stores the index in node_id and the successor index in next_id; the
; successor of the last node (NUMTHREADS - 1) is node 0.
; The mutex field is left untouched.
; Clobbers: rsi, flags.
;-----------------------------------------------------------------------
init_ring_thread:
        mov     [rThread.node_id], esi
        add     esi, 1                  ; candidate successor
        cmp     esi, NUMTHREADS
        jne     .store
        xor     esi, esi                ; wrap around to node 0
.store:
        mov     [rThread.next_id], esi
        ret

;-----------------------------------------------------------------------
; ring_thread
; In:   rdi = pointer to this thread's Thread record
;
; Thread body. Each round it waits on its own mutex, then either
;   - decrements `token` and releases the successor's mutex, or
;   - if `token` is already 0, prints node_id + 1, releases `leavesema`
;     to wake main, and returns.
; rdi is saved around the futex macros because they overwrite it.
; The initial 8-byte adjustment keeps rsp 16-byte aligned at the printf call.
;-----------------------------------------------------------------------
ring_thread:
        sub     rsp, 8
.wait_turn:
        push    rdi
        lea     rsi, [rThread.mutex]
        futex_acquire rsi
        pop     rdi

        cmp     qword [token], 0
        je      .finished
        dec     qword [token]

        mov     ebx, [rThread.next_id]
        lea     rbx, [rbx+rbx*2]        ; next_id * 3
        lea     rbx, [threads+rbx*4+oThread.mutex] ; &threads[next_id].mutex (12-byte records)

        push    rdi
        futex_release rbx
        pop     rdi

        jmp     .wait_turn

.finished:
        mov     esi, [rThread.node_id]
        inc     esi                     ; node numbers are printed starting from 1
        xor     eax, eax                ; variadic call: no vector registers used
        mov     rdi, msgnode
        call    printf
        futex_release leavesema         ; signal exit to main thread
        add     rsp, 8
        ret


section '.data' writeable align 16

; Affinity mask that main fills in and passes to sys_sched_setaffinity.
myset cpu_set_t
; printf format used by ring_thread for the final node number.
msgnode db '%u',0xa,0
align 8
; NUMTHREADS Thread records, each starting as mutex=1, node_id=0, next_id=0;
; init_ring_thread fills in the two id fields.
threads dd NUMTHREADS dup(1,0,0)
; Not referenced by any code in this file.
sema dd 0
; Starts at 1, so main sleeps in futex_acquire until ring_thread releases it.
leavesema dd 1

section '.bss' writeable align 16
; Stack area for the ring threads. main gives thread i the stack top
; stacks + (i+1)*STACKSIZE, so the area is indexed in bytes and needs
; exactly STACKSIZE*NUMTHREADS bytes (`rb`); reserving with `rd` would
; allocate four times as much as is ever addressed.
stacks rb STACKSIZE*NUMTHREADS
; Remaining number of token passes; written by main before the ring starts
; and afterwards only by the thread whose turn it is.
token rq 1




0 new messages