how resolve all the problems about threads

av

unread,

Dec 3, 2006, 9:47:41 AM12/3/06

to

this is how i, a little newbie resolve the problem of read-write
a region of common memory between threads:

0) the first thread that have to read-write a region of memory

1) Block that range of memory so that if one or more other thread
want to read or write that memory going in sleep

2) Read - write that memory

3) Opening the memory for read - write of all the threads

4) Goto 0

David Schwartz

unread,

Dec 3, 2006, 10:37:58 PM12/3/06

to

av wrote:

That really doesn't make any sense at all. There are so many reasons
why it's hard to know where to start.

DS

av

unread,

Dec 4, 2006, 2:44:22 AM12/4/06

to

i mean some of this kind (don't know if it is ok) for shared resuorces

Can i have right?
From namsw manual:

`CMPXCHG' compares its destination (first) operand to the value in
`AL', `AX' or `EAX' (depending on the operand size of the
instruction). If they are equal, it copies its source (second)
operand into the destination and sets the zero flag. Otherwise, it
clears the zero flag and copies the destination register to AL, AX
or EAX.
The destination can be either a register or a memory location. The
source is a register.

`CMPXCHG' is intended to be used for atomic operations in
multitasking or multiprocessor environments. To safely update a
value in shared memory, for example, you might load the value into
`EAX', load the updated value into `EBX', and then execute the
instruction `LOCK CMPXCHG [value],EBX'. If `value' has not changed
since being loaded, it is updated with your desired new value, and
the zero flag is set to let you know it has worked. (The `LOCK'
prefix prevents another processor doing anything in the middle of
this operation: it guarantees atomicity.) However, if another
processor has modified the value in between your load and your
attempted store, the store does not happen, and you are notified of
the failure by a cleared zero flag, so you can go round and try
again.
"

/* int CanIHaveResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N
CanIHaveResource:
push ecx
xor eax, eax
mov ecx, [esp+8]
lock CMPXCHG dword[SharedValue], ecx
jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4

/* int FreeResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N
FreeResource:
push ecx
mov eax, [esp+8]
mov ecx, 0
lock CMPXCHG dword[SharedValue], ecx
jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4
--------------
in thread1
int a;

l0:
a=CanIHaveResource(1);
if(a==0) {Sleep(10); goto l0;}
use shared resource
FreeResource(1);
--------------

--------------
in thread2
int a;

l0:
a=CanIHaveResource(2);
if(a==0) {Sleep(10); goto l0;}
use shared resource
FreeResource(2);
--------------
--------------
in thread3
int a;

l0:
a=CanIHaveResource(3);
if(a==0) {Sleep(10); goto l0;}
use shared resource
FreeResource(3);
--------------

etc etc

av

unread,

Dec 4, 2006, 5:04:22 AM12/4/06

to

On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:

unsigned head=0;
unsigne tail=0;
unsigned arr[128];
unsigned SharedValue=0;
unsigned in_uso=0;

/* int CanIHaveResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N
CanIHaveResource:
push ecx
xor eax, eax
mov ecx, [esp+8]
lock CMPXCHG dword[SharedValue], ecx
jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4

/* int FreeResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N
FreeResource:
push ecx
mov eax, [esp+8]
mov ecx, 0

lock CMPXCHG dword[in_uso], ecx

jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4

/* int CanIHaveResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N
CanIHaveResource1:

push ecx
xor eax, eax
mov ecx, [esp+8]

lock CMPXCHG dword[in_uso], ecx

jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4

/* int FreeResource(int NassociatoAlThread)
/* 0c, 4ra, 8P_N

FreeResource1:

push ecx
mov eax, [esp+8]
mov ecx, 0

lock CMPXCHG dword[in_uso], ecx

jz .1
mov eax, 1
clc
jmp .2
.1: mov eax, 0
stc
.2:
pop ecx
ret 4

// FIFO?
unsigned put(unsigned i)
{unsigned j;
j=(tail+1)%128;
if(j==head) return 0;
tail=j; arr[j-1]=i;
return 1;
}

int get(unsigned* i)
{if(tail==head) return 0;
head=(head+1)%128;
*i=arr[head];
return 1;
}

int is_in(unsigned v)
{int i, j;
if(head==tail) return 0;
for(i=head, j=i%128; j!=tail; ++i, j=i%128)
if(arr[j]==v) return 1;
return 0;
}

int is_first(unsigned v)
{if(head==tail) return 0;
if(arr[head]==v) return 1;
return 0;
}

int is_full(void)
{return (tail+1)%128==head;}

int is_empy(void)
{return head==tail;}

unsigned usa_risorsa(unsigned r)
{unsigned v, j;
l0:
a=CanIHaveResource(r);
if(a==0) {Sleep(20); goto l0;}
if( is_full()) v=0;
else if(is_first(r)
{ if( CanIHaveResource1(r) )
{
get(&j); v=1;
FreeResource(r);
usa_risorsa_condivisa();
FreeResource1(r);
goto l2;
}
else v=0;
}
else {if(!is_in(r))
{if(put(r)==0) v=0;}
}
FreeResource(r);
l2:
return v;
}

--------------
in thread1
while( usa_risorsa(1) == 0) Sleep(10);
--------------
in thread2
while( usa_risorsa(2) == 0) Sleep(10);
--------------
in thread3
while( usa_risorsa(3) == 0) Sleep(10);
--------------
etc, etc

av

unread,

Dec 4, 2006, 5:21:34 AM12/4/06

to

On Mon, 04 Dec 2006 11:04:22 +0100, av wrote:
>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:

so what is the answer?

*i=arr[head];
head=(head+1)%128;
return 1;
}

int is_in(unsigned v)
{int i, j;
if(head==tail) return 0;

for(i=head, j=i; j!=tail; ++i, j=i%128)

if(arr[j]==v) return 1;
return 0;
}

int is_first(unsigned v)
{if(head==tail) return 0;
if(arr[head]==v) return 1;
return 0;
}

int is_full(void)
{return (tail+1)%128==head;}

int is_empy(void)
{return head==tail;}

unsigned usa_risorsa(unsigned r)
{unsigned v, j;
l0:
a=CanIHaveResource(r);

if(a==0) {Sleep(10); goto l0;}

if( is_full()) v=0;
else if(is_first(r)
{ if( CanIHaveResource1(r) )
{
get(&j); v=1;
FreeResource(r);

/* here i use the shared resource */
/* but "FIFO" an "in_uso" are shared between threads too*/

usa_risorsa_condivisa();

FreeResource1(r);
goto l2;
}
else v=0;
}

else {if(!is_in(r)) put(r);

v=0;
}
FreeResource(r);
l2:
return v;
}

--------------
in thread1
while( usa_risorsa(1) == 0) Sleep(40);
--------------
in thread2
while( usa_risorsa(2) == 0) Sleep(40);
--------------
in thread3
while( usa_risorsa(3) == 0) Sleep(40);
--------------
etc, etc

Joe Seigh

unread,

Dec 4, 2006, 7:06:18 AM12/4/06

to

av wrote:
> On Mon, 04 Dec 2006 11:04:22 +0100, av wrote:
>
>>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:
>
>
> so what is the answer?
>

What was the question?

[...]

And how is all that [...] an improvement over current art?

--
Joe Seigh

When you get lemons, you make lemonade.
When you get hardware, you make software.

Steve Watt

unread,

Dec 5, 2006, 3:48:04 PM12/5/06

to

In article <rjs7n2psa4orpgt7t...@4ax.com>, av <av@ala.a> wrote:
>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:

[ snip ]

Congratulations, you have reinvented the spinlock. A very suboptimal
version of it, no less.

Please explain what you think is revolutionary about it.
--
Steve Watt KD6GGD PP-ASEL-IA ICBM: 121W 56' 57.5" / 37N 20' 15.3"
Internet: steve @ Watt.COM Whois: SW32-ARIN
Free time? There's no such thing. It just comes in varying prices...

av

unread,

Dec 6, 2006, 12:07:28 PM12/6/06

to

On Tue, 5 Dec 2006 20:48:04 +0000 (UTC), Steve Watt wrote:
>In article <rjs7n2psa4orpgt7t...@4ax.com>, av <av@ala.a> wrote:
>>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:
>[ snip ]
>Congratulations, you have reinvented the spinlock. A very suboptimal
>version of it, no less.

i'm a not knower, what "spinlock" is?
only now i have some time with test code.

>Please explain what you think is revolutionary about it.

i have to do some test. in 99% of case i will have wrong
but seems i not fear "threads"

av

unread,

Dec 6, 2006, 12:07:40 PM12/6/06

to

On Mon, 04 Dec 2006 07:06:18 -0500, Joe Seigh wrote:

>av wrote:
>> On Mon, 04 Dec 2006 11:04:22 +0100, av wrote:
>>
>>>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:
>>
>>
>> so what is the answer?
>>
>
>What was the question?

can you write down the routines that minimize time and CPU cycles and
allow using one resource by many treads (in concurrences) functions?

>[...]
>
>And how is all that [...] an improvement over current art?

so what is the current art?

David Schwartz

unread,

Dec 6, 2006, 5:38:56 PM12/6/06

to

av wrote:

> On Tue, 5 Dec 2006 20:48:04 +0000 (UTC), Steve Watt wrote:
> >In article <rjs7n2psa4orpgt7t...@4ax.com>, av <av@ala.a> wrote:
> >>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:
> >[ snip ]
> >Congratulations, you have reinvented the spinlock. A very suboptimal
> >version of it, no less.

> i'm a not knower, what "spinlock" is?
> only now i have some time with test code.

A "spinlock" is the most basic form of mutual exclusion without
scheduling. It simply allows only one thread at a time to access an
object and ensures a synchronized view of memory for changes and views
made while the object is owned by a thread.

> >Please explain what you think is revolutionary about it.
>
> i have to do some test. in 99% of case i will have wrong
> but seems i not fear "threads"

Your implementation is terrible. For one thing, it requires a locked
operation even if there is no contention, which ensures you always get
about the worst performance possible. Worse, it doesn't actually ensure
a synchronized view of memory.

It takes an incredibly detailed understanding of both the software and
hardware involved to write threading primitives that work, much less
work well. And you cannot get them to work by testing them. Threading
code can fail in ways that are too subtle to be replicated reliably by
typical testing methods. So bad code will slip through if you don't
understand what the requirements of writing good multithreaded code
are.

Learn what's out there and how to use it first.

DS

Markus Elfring

unread,

Dec 7, 2006, 12:14:39 PM12/7/06

to

> i'm a not knower, what "spinlock" is?

Did you read any articles about its technical details?
http://en.wikipedia.org/wiki/Spinlock
http://en.wikipedia.org/wiki/Synchronization_%28computer_science%29

Regards,
Markus

av

unread,

Dec 11, 2006, 10:32:44 AM12/11/06

to

On 6 Dec 2006 14:38:56 -0800, David Schwartz wrote:
>av wrote:
>> On Tue, 5 Dec 2006 20:48:04 +0000 (UTC), Steve Watt wrote:
>> >In article <rjs7n2psa4orpgt7t...@4ax.com>, av <av@ala.a> wrote:
>> >>On Mon, 04 Dec 2006 08:44:22 +0100, av wrote:
>> >[ snip ]
>> >Congratulations, you have reinvented the spinlock. A very suboptimal
>> >version of it, no less.
>
>> i'm a not knower, what "spinlock" is?
>> only now i have some time with test code.
>
>A "spinlock" is the most basic form of mutual exclusion without
>scheduling. It simply allows only one thread at a time to access an
>object and ensures a synchronized view of memory for changes and views
>made while the object is owned by a thread.
>
>> >Please explain what you think is revolutionary about it.
>>
>> i have to do some test. in 99% of case i will have wrong
>> but seems i not fear "threads"
>
>Your implementation is terrible. For one thing, it requires a locked
>operation even if there is no contention, which ensures you always get
>about the worst performance possible. Worse, it doesn't actually ensure
>a synchronized view of memory.

so you say that changing only "DaiRisorsa256" function you can write
something more fast than below with 49 threads that compete for the
same resource (13 seconds for doing 499 inc in each thread for the
same array)

>It takes an incredibly detailed understanding of both the software and
>hardware involved to write threading primitives that work, much less
>work well. And you cannot get them to work by testing them. Threading
>code can fail in ways that are too subtle to be replicated reliably by
>typical testing methods.

yes you speak very well but
can you point out a function for the thread like
uns __stdcall adda(uns* a)
{if(*a>500) return 0;
++th1[*a]; return 1;
}
that can make "DaiRisorsa256" function in crisis
thank you

>So bad code will slip through if you don't
>understand what the requirements of writing good multithreaded code
>are.

>Learn what's out there and how to use it first.
>
>DS

i don't have the book for now so i write

#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define P printf
#define uns unsigned

/* This seems ok like a C++ file */
#ifdef __cplusplus
extern "C" {
#endif
int __stdcall
DaiRisorsa256(uns Ntherad, char* funz, void* par, uns* res);
#ifdef __cplusplus
}
#endif

char a[500]={0};

int thr[500]={0}; // risorsa condivisa senza blocchi (non ok)
int th1[500]={0}; // risorsa condivisa tramite la funzione (ok?)

uns __stdcall adda(uns* a)
{if(*a>500) return 0;
++th1[*a]; return 1;
}

void printArray(int* a, uns len)
{uns i;
P("Array=");
for(i=0; i<len; ++i) P("%i ", a[i]);
P("\n");
}

DWORD __stdcall ThreadFunc( void* lpParam )
{uns n, res, par, res1;
n= *(uns*) lpParam;
Sleep(3000);
if(n>=500) {P("Errore + di 500 threads\n"); return 0;}
else ++thr[n];
for(par=1, res1=0; par<490; ++par, res1+=res)
while( DaiRisorsa256(n, (char*)adda, &par, &res)==0 )
{Sleep(100); thr[n]|=0x80000000;}
return res1;
}

int main(void)
{uns n, res, par=1;
time_t ti, tf;
DWORD dwThreadId[200], dwThrdParam;
HANDLE hThread[200];
DWORD lpExitCode =0;

P("Inizio ...\n"); ti=time(0);
for(n=1; n<50; ++n)
{ dwThrdParam=n;
hThread[n] = CreateThread(
NULL, // no security attributes
0, // use default stack size
ThreadFunc, // thread function
&dwThrdParam, // argument to thread function
0, // use default creation flags
&dwThreadId[n]); // returns the thread
identifier
if(hThread[n] == NULL)
{// ErrorExit( "CreateThread failed." );
P("CreateThread failed.\n");
exit(1);
}
P("hThread=0x%x dwThreadId=0x%x\n",
(unsigned)hThread[n], (unsigned) dwThreadId[n]);
}

for(n=1; n<50; ++n)
{li:;
Sleep(200);
GetExitCodeThread(hThread[n],
&lpExitCode // address to receive
termination status
);
if(lpExitCode==STILL_ACTIVE) goto li;
P("return%u=%u ", n, lpExitCode);
CloseHandle( hThread[n] );
}
Sleep(500);
printArray(thr, 50);
printArray(th1, 50);
tf=time(0);
P("DeltaT=%f\n", difftime(tf, ti));
exit(0);
}

section _DATA public use32 class=DATA

; nasmw -fobj thisfile.asm

global LockDword , UnlockDword , putInFIFO , getFromFIFO , lenFIFO
global isInFIFO , printFIFO , isFirstInFIFO , meno1FromFIFO
global DaiRisorsa , DaiRisorsa256

extern _printf, Sleep

%define _P _printf

daiRisorsa_var dd 0 , 0
daiRisorsa_var1 dd 0 , 0

fifo_head dd 0 , 0
fifo_tail dd 0 , 0
fifo_vals times 258 dd 0

dummy_val dd 0 , 0 , 0 , 0

section _TEXT public use32 class=CODE

; int LockDword(uns* mem, uns NThread)
; Se effettua il lock ritorna 1(CF==0)
; Se non effettua il lock ritorna 0(CF==1)
; 0c, 4r, 8ra, 12P_mem, 16P_NThread
LockDword:
push ecx
push edx
%define @mem [esp+12]
%define @NThread [esp+16]
mov edx, @mem
xor eax, eax
cmp dword[edx], 0
jne .1
mov ecx, @NThread
lock cmpxchg dword[edx] , ecx
jnz .1
mov eax, 1
clc
jmp short .2
.1:
xor eax, eax
stc
.2:
%undef @mem
%undef @NThread
pop edx
pop ecx
ret 8

; void UnlockDword(uns* mem)
; 0a, 4ra, 8P_mem
; UnlockDword: <a| a=[s+8];< 0;> D[a]|>a; ret 4

; void UnlockDword(uns* mem)
; 0r, 4a, 8ra, 12P_mem
UnlockDword:
push eax
push edx
mov eax, [esp+12]
xor edx, edx
lock xchg [eax] , edx
pop edx
pop eax
ret 4

; int putInFIFO(uns* FIFOarray, uns* head,
; uns* tail, uns fifosize, uns val)
; mette "val" nella fifo
; se tutto ok ritorna 1(CF==0) altrimenti 0(CF==1)
; Fifoarray è il vettore,
; head e tail sono puntatori a indici di tale vettore
; maxVal è la dimensione massima della fifo
; 0i, 4r, 8c, 12b, 16ra, 20P_FIFO,
; 24P_Head, 28P_Tail, 32P_maxval, 36P_val
putInFIFO:
push ebx
push ecx
push edx
push esi
%define @fifo [esp+20]
%define @head [esp+24]
%define @tail [esp+28]
%define @fifosize [esp+32]
%define @val [esp+36]
cmp dword @fifosize, 0
jle .0
mov ecx, @tail
mov ebx, @fifo
mov eax, [ecx]
inc eax
cmp eax, @fifosize
jbe .a
mov eax, 0
.a:
mov esi, @head
cmp eax, [esi]
jne .1
.0:
xor eax, eax
stc
jmp short .f
.1:
mov edx, [ecx]
mov esi, @val
mov [ebx+4*edx], esi
mov [ecx], eax
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
%undef @val
pop esi
pop edx
pop ecx
pop ebx
ret 20

; int getFromFIFO(uns* FIFOarray, uns* head,
; uns* tail, uns fifosize, uns* pval)
; Fifoarray è il vettore, head e tail sono
; puntatori a indici di tale vettore
; maxVal è la dimensione massima della fifo
; 0i, 4r, 8c, 12b, 16ra, 20P_FIFO,
; 24P_Head, 28P_Tail, 32P_maxval, 36P_pval
getFromFIFO:
push ebx
push ecx
push edx
push esi
%define @fifo [esp+20]
%define @head [esp+24]
%define @tail [esp+28]
%define @fifosize [esp+32]
%define @pval [esp+36]
cmp dword @fifosize, 0
jle .0
mov ebx, @head
mov esi, @tail
mov eax, [ebx]
cmp eax, [esi]
jne .1
.0:
xor eax, eax
stc
jmp short .f
.1:
mov edx, @pval
mov ecx, @fifo
push dword [ecx+4*eax]
pop dword [edx]
inc eax
cmp eax, @fifosize
jbe .2
mov eax, 0
.2:
mov [ebx], eax
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
%undef @pval
pop esi
pop edx
pop ecx
pop ebx
ret 20

; int meno1FromFIFO(uns* FIFOarray, uns* head,
; uns* tail, uns fifosize)
; Fifoarray è il vettore, head e tail sono
; puntatori a indici di tale vettore
; maxVal è la dimensione massima della fifo
; 0r, 4c, 8ra, 12P_FIFO, 16P_Head, 20P_Tail, 24P_fifosize
meno1FromFIFO:
push ecx
push edx
%define @fifo [esp+12]
%define @head [esp+16]
%define @tail [esp+20]
%define @fifosize [esp+24]
cmp dword @fifosize, 0
jle .0
mov edx, @head
mov ecx, @tail
mov eax, [edx]
cmp eax, [ecx]
jne .1
.0:
xor eax, eax
stc
jmp short .f
.1:
inc eax
cmp eax, @fifosize
jbe .2
mov eax, 0
.2:
mov [edx], eax
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
pop edx

pop ecx
ret 16

; int isInFIFO(uns* FIFOarray, uns head,
; uns tail, uns fifosize, uns val)
; 0i, 4r, 8c, 12b, 16ra, 20P_FIFO,
; 24P_Head, 28P_Tail, 32P_maxval, 36P_val
isInFIFO:
push ebx
push ecx
push edx
push esi
%define @fifo [esp+20]
%define @head [esp+24]
%define @tail [esp+28]
%define @fifosize [esp+32]
%define @val [esp+36]
cmp dword @fifosize, 0
jle .0
mov eax, @head
mov esi, @tail
cmp eax, esi
jne .1
.0:
xor eax, eax
stc
jmp short .f
.1: ; in *b=head_index
mov ecx, @fifo
mov edx, @val
jmp short .3
.2:
inc eax
cmp eax, @fifosize
jbe .a
mov eax, 0
.a:
cmp eax, esi
je .0
.3:
cmp edx, [ecx+4*eax]
jne .2
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
%undef @val
pop esi
pop edx
pop ecx
pop ebx
ret 20

; int printFIFO(uns* FIFOarray, uns head, uns tail, uns fifosize)
; 0j, 4i, 8r, 12c, 16ra, 20P_FIFO, 24P_Head, 28P_Tail, 32P_maxval
printFIFO:
push ecx
push edx
push esi
push edi
sub esp, 8
%define @fifo [esp+28]
%define @head [esp+32]
%define @tail [esp+36]
%define @fifosize [esp+40]
cmp dword @fifosize, 0
jle .0
mov edi, @head
mov esi, @tail
cmp edi, esi
jne .1
.0:
mov eax, esp
mov dword[eax], "FIFO"
mov dword[eax+4], "=0"
push eax
call _P
add esp, 4
xor eax, eax
stc
jmp short .f
.1:
mov ecx, @fifo
mov dword[esp], "%u "
mov dword[esp+4], 0
jmp short .3
.2:
inc edi
mov eax, edi
cmp eax, @fifosize
jbe .a
mov eax, 0
.a:
mov edi, eax
cmp esi, eax
je .4
.3:
mov edx, [ecx+4*edi]
mov eax, esp
push edx
push eax
call _P
add esp, 8
inc dword[esp+4]
jmp short .2
.4:
mov eax, [esp+4]
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
add esp, 8
pop edi
pop esi
pop edx
pop ecx
ret 16

; 0 1 2 3 4 5 6 7 8
; ^ > ^
; int lenFIFO(uns head, uns tail, uns fifosize)
; 0ra, 4P_head, 8P_tail, 12P_fifosize
lenFIFO:
mov eax, [esp+8]
sub eax, [esp+4]
jge .f
add eax, [esp+12]
inc eax
.f:
ret 12

; int isFirstInFIFO(uns* FIFOarray, uns head, uns tail, uns val)
; 0r, 4c, 8ra, 12P_FIFO, 16P_Head, 20P_Tail, 24P_val
isFirstInFIFO:
push ecx
push edx
%define @fifo [esp+12]
%define @head [esp+16]
%define @tail [esp+20]
%define @val [esp+24]
mov eax, @head
mov edx, @tail
cmp eax, edx
jne .1
.0:
xor eax, eax
stc
jmp short .f
.1:
mov ecx, @fifo
mov edx, @val
cmp dword[ecx+4*eax], edx
jne .0
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @val
pop edx
pop ecx
ret 16

; 0r, 4r, 8a, 12ra, 16P
Lp:
push eax
push edx
push edx
mov edx, [esp+16] ; D*a&=0xFF;
mov eax, esp
mov [eax], edx
push eax
call _P
add esp, 4
pop edx
pop edx
pop eax
ret 4

; >=Pentium
; int
; daiRisorsa(uns* fifo,uns* head,uns* tail,uns fifosize,
; uns NThread, char* funz, uns* par, uns* res)
; int isFirstInFIFO(uns* FIFOarray, uns head, uns tail, uns val)
; int getFromFIFO(uns* FIFOarray, uns* head,
; uns* tail, uns fifosize, uns* pval)
; 0k,4j,8i,12r,16c,20b,24ra,28P_FIFO,
; 32P_Head,36P_Tail,40P_fifosize,44P_Nthr
daiRisorsa:
push ebx
push ecx
push edx
push esi
push edi
push ebp
; 0 4 8 12
%define @fifo [esp+28]
%define @head [esp+32]
%define @tail [esp+36]
%define @fifosize [esp+40]
; 16 20 24
%define @Nthr [esp+44]
%define @funz [esp+48]
%define @par [esp+52]
mov esi, @Nthr
lea ebp, [esp+28]
xor edi, edi
xor ebx, ebx
cmp dword @fifo, 0
je .e
cmp dword @head, 0
je .e
cmp dword @tail, 0
je .e
cmp dword @fifosize, 0
jle .e
mov ecx, dword[ebp+4]
mov edx, dword[ebp+8]
mov eax, [ecx]
mov ecx, [edx]
push dword[ebp+12]
push ecx
push eax
call lenFIFO
cmp eax, @fifosize
jne .0 ; FIFO non piena
.e:
xor eax, eax
stc
jmp .f
.0:
push esi
push daiRisorsa_var
call LockDword
jz .1
RDTSC
and eax, 0x7f
push eax
call Sleep
jmp short .0
.1:
push dword[ebp+16]
push dword[ebp+12]
push dword[ebp+8]
push dword[ebp+4]
push dword[ebp]
call putInFIFO
push daiRisorsa_var
call UnlockDword ; errore nell'inserimento
cmp eax, 0
je .e
mov ecx, dword[ebp+4]
mov edx, dword[ebp+8]
mov eax, [ecx]
mov ecx, [edx]
.2:
push dword[ebp+16]
push ecx
push eax
push dword[ebp]
call isFirstInFIFO
jz .3
RDTSC
and eax, 0x7f
push eax
call Sleep
jmp short .2
.3:
push esi
push daiRisorsa_var1
call LockDword
jz .4
RDTSC
and eax, 0x7f
push eax
call Sleep
jmp short .3
.4:
cmp dword[ebp+24], 0
je .a
push dword [ebp+24]
call dword[ebp+20]
mov edx, [ebp+28]
mov [edx], eax
jmp short .b
.a:
call dword[ebp+20]
mov edx, [ebp+28]
mov [edx], eax
.b:
push daiRisorsa_var1
call UnlockDword
.5:
push esi
push daiRisorsa_var
call LockDword
jz .6
RDTSC
and eax, 0x7f
push eax
call Sleep
jmp short .5
.6:
push dword[ebp+12]
push dword[ebp+8]
push dword[ebp+4]
push dword[ebp+20]
call meno1FromFIFO
push daiRisorsa_var
call UnlockDword
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @fifosize
%undef @Nthr
%undef @funz
%undef @par
pop ebp
pop edi
pop esi
pop edx
pop ecx
pop ebx
ret 32

; int
;DaiRisorsa256(uns Ntherad, char* funzione, uns* par, uns* res)
; 0i, 4r, 8c, 12ra, 16P_Nthread, 20P_funz, 24P_par
DaiRisorsa256:
push ecx
push edx
push esi
%define @Nthr [esp+16]
%define @funz [esp+20]
%define @par [esp+24]
mov eax, @Nthr
mov ecx, @funz
mov edx, @par
mov esi, [esp+28]
push esi
push edx
push ecx
push eax
push 256
push fifo_tail
push fifo_head
push fifo_vals
call daiRisorsa
%undef @Nthr
%undef @funz
%undef @par
pop esi
pop edx
pop ecx
ret 16

pentium1
1 0 0 3 1 1 0 2 1 0 1 2 0 0 3 0 0 3 1 0 2 1 1 0 2 1 1 0 2
2 1 0 2 1 1 1
DeltaT=13.000000

amdAthon
0 0 0 0 0 0 0 10 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 9 0 0
0 0 0 0 0 0 9 0 0 0 0 5
DeltaT=13.000000

why they are the same time?

av

unread,

Dec 11, 2006, 11:43:24 AM12/11/06

to

On Mon, 11 Dec 2006 16:32:44 +0100, av wrote:
> extern _printf, Sleep
>
> %define _P _printf

[]

Here for to be ok should be _P==_printf
has to preserve ecx, edx too (but it seems printf does't do it)
but here this function "printFIFO" is not used

David Schwartz

unread,

Dec 11, 2006, 12:58:39 PM12/11/06

to

av wrote:

> so you say that changing only "DaiRisorsa256" function you can write
> something more fast than below with 49 threads that compete for the
> same resource (13 seconds for doing 499 inc in each thread for the
> same array)

That type of benchmarking makes no sense. You have to test threading
primitives under realistic conditions or you wind up picking *very* bad
primitives. You also have to consider CPU usage, not just wall time.

DS

av

unread,

Dec 12, 2006, 12:27:06 PM12/12/06

to

On Mon, 11 Dec 2006 16:32:44 +0100, av wrote:

>On 6 Dec 2006 14:38:56 -0800, David Schwartz wrote:
>>av wrote:
>>Your implementation is terrible. For one thing, it requires a locked
>>operation even if there is no contention, which ensures you always get
>>about the worst performance possible. Worse, it doesn't actually ensure
>>a synchronized view of memory.

if it is so terrible and horribble, why it take in a pentium1 to do
49*(49+1000)= 51401 incs in share memory in 49 threads 1s? (and 4s if
there is Sleep(3000) in the thread function "ThreadFunc")
for amdAthlon4 it take 3s for doing 49*(49+1000) incs in the shared
memory in 49 threads with in each thread "Sleep(3000)" in the routine
"ThreadFunc"

#include <windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define P printf
#define uns unsigned

/* This seems ok like a C++ file */
#ifdef __cplusplus
extern "C" {
#endif

int __stdcall
DaiRisorsa256(uns Ntherad, char* funz, void* par, uns* res);

#ifdef __cplusplus
}
#endif

char a[500]={0};

int thr[500]={0};

int th1[500]={0};

uns __stdcall adda(uns* a)

{if(*a>500) return 0;
++th1[*a]; return 1;
}

void printArray(int* a, uns len)
{uns i;
P("Array=");
for(i=0; i<len; ++i) P("%i ", a[i]);
P("\n");
}

// nthreads=n 49*(49+1000)= 51401 incs

DWORD __stdcall ThreadFunc( void* lpParam )

{uns n, res, par, res1, val, i;
n= *(uns*) lpParam;
Sleep(3000); // with this they are 4s otherwise it is 1s

if(n>=500) {P("Errore + di 500 threads\n"); return 0;}
else ++thr[n];

for(par=1, res1=0; par<50; ++par, res1+=res)

while( DaiRisorsa256(n, (char*)adda, &par, &res)==0 )
{Sleep(100); thr[n]|=0x80000000;}

par=1;
for(i=0; i<1000; ++i)

while( DaiRisorsa256(n, (char*)adda, &par, &res)==0 )
{Sleep(100); thr[n]|=0x80000000;}

return res1;
}

int main(void)
{uns n, res, par;

time_t ti, tf;
DWORD dwThreadId[200], dwThrdParam;
HANDLE hThread[200];
DWORD lpExitCode =0;

P("Inizio ...\n"); ti=time(0);
for(n=1; n<50; ++n)
{ dwThrdParam=n;
hThread[n] = CreateThread(
NULL, // no security attributes
0, // use default stack size
ThreadFunc, // thread function
&dwThrdParam, // argument to thread function
0, // use default creation flags
&dwThreadId[n]); // returns the thread identifier
if(hThread[n] == NULL)
{// ErrorExit( "CreateThread failed." );
P("CreateThread failed.\n");
exit(1);
}
P("hThread=0x%x dwThreadId=0x%x\n",
(unsigned)hThread[n], (unsigned) dwThreadId[n]);
}

for(n=1; n<50; ++n)
{li:;

Sleep(0);

GetExitCodeThread(hThread[n],
&lpExitCode
// address to receive termination status
);
if(lpExitCode==STILL_ACTIVE) goto li;
P("return%u=%u ", n, lpExitCode);
CloseHandle( hThread[n] );
}

printArray(thr, 50);
printArray(th1, 50);
tf=time(0);
P("DeltaT=%f\n", difftime(tf, ti));
exit(0);
}

---------------------------------------------

section _DATA public use32 class=DATA

; nasmw -fobj thisfile.asm

global LockDword , UnlockDword , putInFIFO , getFromFIFO , lenFIFO

global isInFIFO , isFirstInFIFO , meno1FromFIFO
global DaiRisorsa , DaiRisorsa256
extern Sleep

daiRisorsa_var dd 0 , 0

fifo_head dd 0 , 0

; 0 1 2 3 4 5 6 7 8
; ^ > ^
; int lenFIFO(uns head, uns tail, uns fifosize)
; 0ra, 4P_head, 8P_tail, 12P_fifosize
lenFIFO:
mov eax, [esp+8]
sub eax, [esp+4]
jge .f
add eax, [esp+12]
inc eax
.f:
ret 12

;int

;isFirstInFIFO(uns* FIFOarray, uns* head, uns* tail, uns val)

; 0r, 4c, 8ra, 12P_FIFO, 16P_Head, 20P_Tail, 24P_val
isFirstInFIFO:
push ecx
push edx
%define @fifo [esp+12]
%define @head [esp+16]
%define @tail [esp+20]
%define @val [esp+24]
mov eax, @head
mov edx, @tail

mov ecx, [eax]
cmp ecx, [edx]

jne .1
.0:
xor eax, eax
stc
jmp short .f
.1:

mov eax, @fifo
mov edx, @val
cmp dword[eax+4*ecx], edx

jne .0
mov eax, 1
clc
.f:
%undef @fifo
%undef @head
%undef @tail
%undef @val
pop edx
pop ecx
ret 16

; >=Pentium
; int
; daiRisorsa(uns* fifo,uns* head,uns* tail,uns fifosize,
; uns NThread, char* funz, uns* par, uns* res)

cmp esi, 0
je .e
cmp dword[ebp+20], 0
je .e
cmp dword[ebp+28], 0
je .e

.2:
push dword[ebp+16]

push dword[ebp+8]
push dword[ebp+4]
push dword[ebp]

call isFirstInFIFO
jz .3
RDTSC
and eax, 0x7f
push eax
call Sleep
jmp short .2
.3:

cmp dword[ebp+24], 0

je .4

push dword [ebp+24]
call dword[ebp+20]
mov edx, [ebp+28]
mov [edx], eax

jmp short .5
.4:

call dword[ebp+20]
mov edx, [ebp+28]
mov [edx], eax

; Ntrhead!=0, funzione!=0, res!=0

Array=0 0 2 0 2 0 1 1 2 0 2 1 1 0 2 1 1 0 2 1 0 2 1 0 0 3 1 1
0 2 1 1 1 0 2 1 0 2 1 1 0 2 1 0 2 1 1 1 0 2
Array=0 49049 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49
49 49 49
49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49
49 49 49
DeltaT=4.000000

Array=0 0 0 0 0 0 0 0 0 0 0 0 12 0 0 0 0 0 0 0 0 9 0 0 0 0 0
0 0 0 9 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 9 1
Array=0 49049 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49
49 49 49
49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49 49
49 49 49
DeltaT=3.000000

David Schwartz

unread,

Dec 12, 2006, 6:18:39 PM12/12/06

to

av wrote:

> On Mon, 11 Dec 2006 16:32:44 +0100, av wrote:

> >On 6 Dec 2006 14:38:56 -0800, David Schwartz wrote:

> >>av wrote:

> >>Your implementation is terrible. For one thing, it requires a locked
> >>operation even if there is no contention, which ensures you always get
> >>about the worst performance possible. Worse, it doesn't actually ensure
> >>a synchronized view of memory.

> if it is so terrible and horribble,

It is, for the two reasons I stated above.

> why it take in a pentium1 to do
> 49*(49+1000)= 51401 incs in share memory in 49 threads 1s? (and 4s if
> there is Sleep(3000) in the thread function "ThreadFunc")
> for amdAthlon4 it take 3s for doing 49*(49+1000) incs in the shared
> memory in 49 threads with in each thread "Sleep(3000)" in the routine
> "ThreadFunc"

That's not how you benchmark synchronization primitives. You have to
look at multiple modern CPUs, and you have to look at how much CPU is
wasted too. You have to test the impact of the synchronization code on
other code running on other CPUs because FSB efficiency is important
too.

Your code synchronizes by saturating the FSB. That is a disaster.

DS

av

unread,

Dec 13, 2006, 12:56:58 AM12/13/06

to

i don't know what "FSB" should be, but if a multi-cpu system is in
crisis where a pentium1 system with 1 alone cpu is not: it could be
something wrong with "FSB"

David Schwartz

unread,

Dec 13, 2006, 4:09:27 AM12/13/06

to

av wrote:

> >DS

You have no idea what you're talking about. An hour or two of research
would do you a lot of good.

DS

av

unread,

Dec 13, 2006, 12:11:32 PM12/13/06

to

On 13 Dec 2006 01:09:27 -0800, David Schwartz wrote:
> i

>> i don't know what "FSB" should be, but if a multi-cpu system is in
>> crisis where a pentium1 system with 1 alone cpu is not: it could be
>> something wrong with "FSB"

i agree with me

>You have no idea what you're talking about. An hour or two of research
>would do you a lot of good.

so i have to search what?
it is enough to search "FSB" with ???? word in Google?

David Schwartz

unread,

Dec 14, 2006, 1:50:18 AM12/14/06

to

av wrote:

This may be a useful start:
http://en.wikipedia.org/wiki/Front_side_bus

Imagine if you have a lock that you acquire like this:

#define LOCKED 0
#define UNLOCKED 1
int spinlock=UNLOCKED;

void lock(void)
{
while(InterlockedExchange(&spinlock, LOCKED)!=UNLOCKED)
/* do nothing */;
}
void unlock(void)
{
spinlock=UNLOCKED;
}

Here 'InterlockedExchange' is roughly equivalent to:
int InterlockedExchange(int &j, int value)
{
int ret;
lock_fsb();
ret=*j;
*j=value;
unlock_fsb();
return ret;
}

Now imagine you have one thread that has returned from 'lock' and holds
the lock. It's trying to get some work done so that it can release the
lock. Now imagine two other threads have called 'lock' and are waiting
for the first thread to call 'unlock'.

Each of those two threads is keeping the FSB locked most of the time.
Also, the '*j=value' causes the cache line containing 'spinlock' to
bounce back and forth over the FSB between the two CPU's caches.

Fairness will probably result in each of the three threads getting
about 1/3 of the FSB, although in practice it's usually worse. So how
will thread one manage to finish the work it's doing and release the
lock when the other two threads are saturating the FSB with junk?

It takes extensive experience on different types of hardware before
you're likely to be able to write good threading primitives. You have
to have a detailed understanding of what simple instructions make the
hardware actually do.

A 'simple' fix for this might be to change 'lock' like this:

void lock(void)
{
while(1)
{
while( (*(volatile int *)&spinlock) == LOCKED )
/* do nothing */ ;
if(InterlockedExchange(&spinlock, LOCKED)==UNLOCKED) return;
}
}

That may seem better, but it's really not. Consider if one thread is
spinning in that inner 'while' loop while another thread,
hyper-threaded on the same core, is trying to get finished and release
the lock. How will it get finished when the first thread is
monopolizing the core's memory access hardware in that tight spin?

Again, It takes extensive experience on different types of hardware
before you're likely to be able to write good threading primitives. You
have to have a detailed understanding of what simple instructions make
the hardware actually do.

DS

joseph...@gmail.com

unread,

Dec 14, 2006, 5:52:26 AM12/14/06

to

You admit to not knowing what a spinlock is, or an FSB, or indeed, the
current state-of-the-art in multithreading primitives. Yet, you expect
to have invented something equal or better to the state of the art? I
know what those two things are and I'm still a novice in
multithreading.

Stop wasting your time trying to get people to teach you the subject
backwards, starting at low-level implementation and progressing to
high-level understanding. Put your code on a shelf somewhere and find
good learning material.

Where does one find good learning material? Now that is a passable
beginner's question.

av

unread,

Dec 14, 2006, 7:09:25 AM12/14/06

to

On Tue, 12 Dec 2006 18:27:06 +0100, av wrote:
i have some errors here and the "right" version has a strange
behaviour: it seems if there is no sleep in the thread function 1 time
over 4 the program not exit ...
why the wrong version it seems to had run ok?

jnc .1

> RDTSC
> and eax, 0x7f
> push eax
> call Sleep
> jmp short .0
>.1:
> push dword[ebp+16]
> push dword[ebp+12]
> push dword[ebp+8]
> push dword[ebp+4]
> push dword[ebp]
> call putInFIFO
> push daiRisorsa_var
> call UnlockDword
>.2:
> push dword[ebp+16]
> push dword[ebp+8]
> push dword[ebp+4]
> push dword[ebp]
> call isFirstInFIFO
> jz .3

jnc .3

> RDTSC
> and eax, 0x7f
> push eax
> call Sleep
> jmp short .2
>.3:
> cmp dword[ebp+24], 0
> je .4
> push dword [ebp+24]
> call dword[ebp+20]
> mov edx, [ebp+28]
> mov [edx], eax
> jmp short .5
>.4:
> call dword[ebp+20]
> mov edx, [ebp+28]
> mov [edx], eax
>.5:
> push esi
> push daiRisorsa_var
> call LockDword
> jz .6

jnc .6

av

unread,

Dec 14, 2006, 1:35:07 PM12/14/06

to

On Thu, 14 Dec 2006 13:09:25 +0100, av wrote:
>On Tue, 12 Dec 2006 18:27:06 +0100, av wrote:
>i have some errors here and the "right" version has a strange
>behaviour: it seems if there is no sleep in the thread function 1 time
>over 4 the program not exit ...
>why the wrong version it seems to had run ok?

there is some problem for to load windows95 OS, it seems when i push
the ON button for load OS, system read hardisk and "lock" so i can not
load anything. It was my little assembly program for study thread that
cause all this? it is a manifestation of undefinite behaviour that has
broken my old PC? for now i fail