Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

My older asm...

212 views
Skip to first unread message

Chris M. Thomasson

unread,
Dec 3, 2023, 12:38:18 AM12/3/23
to
Fwiw, I found some of my older 686 code I posted about way back in 2006.
The WayBack Machine is pretty nice!

http://web.archive.org/web/20060214112345/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_gcc_asm.html

Here is a MASM version:

http://web.archive.org/web/20060214112539/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_masm_asm.html


A long time ago...


For GCC, assemble with GAS:
____________________

# Copyright 2005 Chris Thomasson


# int np_ac_i686_atomic_dwcas_fence(void *dest, void *cmp, void *xchg)  -- cdecl
# Double-width (64-bit) compare-and-swap on a 32-bit target via LOCK CMPXCHG8B
# (the LOCK prefix gives full-fence semantics on x86).
# Returns 0 in %eax on success; on failure returns 1 and writes the value
# actually observed at *dest back into *cmp (edx:eax is reloaded by cmpxchg8b).
.align 16
.globl np_ac_i686_atomic_dwcas_fence
np_ac_i686_atomic_dwcas_fence:
pushl %esi                              # save callee-saved regs (cdecl)
pushl %ebx
movl 16(%esp), %esi                     # esi = cmp (arg2; +8 for the two pushes)
movl (%esi), %eax                       # edx:eax = expected 64-bit value
movl 4(%esi), %edx
movl 20(%esp), %esi                     # esi = xchg (arg3)
movl (%esi), %ebx                       # ecx:ebx = replacement 64-bit value
movl 4(%esi), %ecx
movl 12(%esp), %esi                     # esi = dest (arg1)
lock cmpxchg8b (%esi)                   # if (*dest == edx:eax) *dest = ecx:ebx
jne np_ac_i686_atomic_dwcas_fence_fail  # ZF clear -> comparison failed
xorl %eax, %eax                         # success: return 0
popl %ebx
popl %esi
ret

np_ac_i686_atomic_dwcas_fence_fail:
movl 16(%esp), %esi                     # esi = cmp (arg2)
movl %eax, (%esi)                       # write observed value (edx:eax) back
movl %edx, 4(%esi)                      #   into *cmp so the caller can retry
movl $1, %eax                           # failure: return 1
popl %ebx
popl %esi
ret




# void/node* ac_i686_stack_mpmc_push_cas(node **head, node *node)  -- cdecl
# Lock-free MPMC stack push using a single-width CAS loop.
# On return %eax holds the previous top-of-stack (the comparand that
# succeeded); callers that ignore it treat this as void.
.align 16
.globl ac_i686_stack_mpmc_push_cas
ac_i686_stack_mpmc_push_cas:
movl 4(%esp), %edx                      # edx = head (arg1)
movl (%edx), %eax                       # eax = current top
movl 8(%esp), %ecx                      # ecx = node to push (arg2)

ac_i686_stack_mpmc_push_cas_retry:
movl %eax, (%ecx)                       # node->next = observed top (offset 0)
lock cmpxchgl %ecx, (%edx)              # if (*head == eax) *head = node
jne ac_i686_stack_mpmc_push_cas_retry   # failed: eax reloaded with new top, retry
ret




# node* np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas(anchor*, hazard**)  -- cdecl
# Lock-free MPMC stack pop protected by SMR (hazard pointers).
# arg1 = 8-byte anchor {head ptr at +0, ABA counter at +4};
# arg2 = this thread's hazard-pointer slot.
# Returns the popped node in %eax, or 0 if the stack was empty.
.align 16
.globl np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas
np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas:
pushl %esi                              # save callee-saved regs (cdecl)
pushl %ebx

np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_reload:
movl 12(%esp), %esi                     # esi = anchor (arg1; +8 for pushes)
movl 4(%esi), %edx                      # edx = ABA counter
movl (%esi), %eax                       # eax = head

np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_retry:
movl 16(%esp), %ebx                     # ebx = hazard slot (arg2)
movl %eax, (%ebx)                       # publish head as this thread's hazard
mfence                                  # #StoreLoad: publish must be visible
cmpl (%esi), %eax                       #   before re-validating the head
jne np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_reload  # head moved: start over
test %eax, %eax
je np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_fail     # empty: return NULL
movl (%eax), %ebx                       # ebx = head->next (safe: head is hazarded)
leal 1(%edx), %ecx                      # ecx = ABA counter + 1
lock cmpxchg8b (%esi)                   # swap {head,aba} -> {next,aba+1}
jne np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_retry   # eax:edx refreshed; retry

# NOTE: despite the name, this label is the shared epilogue -- the success
# path falls through into it as well.  Clearing the hazard on success is
# safe because the node has already been unlinked from the stack.
np_ac_i686_lfgc_smr_stack_mpmc_pop_dwcas_fail:
movl 16(%esp), %esi                     # esi = hazard slot
xorl %ebx, %ebx
movl %ebx, (%esi)                       # clear hazard pointer
popl %ebx
popl %esi
ret                                     # eax = popped node (or 0 if empty)




# node* np_ac_i686_stack_mpmc_pop_dwcas(anchor*)  -- cdecl
# Lock-free MPMC stack pop WITHOUT hazard-pointer protection; relies on the
# {pointer, ABA counter} DWCAS alone.  arg1 = 8-byte anchor {head, counter}.
# Returns the popped node in %eax, or 0 if the stack was empty.
# NOTE(review): without SMR the dereference of head below assumes nodes are
# never freed back to the allocator while pops race -- confirm callers.
.align 16
.globl np_ac_i686_stack_mpmc_pop_dwcas
np_ac_i686_stack_mpmc_pop_dwcas:
pushl %esi                              # save callee-saved regs (cdecl)
pushl %ebx
movl 12(%esp), %esi                     # esi = anchor (arg1; +8 for pushes)
movl 4(%esi), %edx                      # edx = ABA counter
movl (%esi), %eax                       # eax = head

np_ac_i686_stack_mpmc_pop_dwcas_retry:
test %eax, %eax
je np_ac_i686_stack_mpmc_pop_dwcas_fail # empty: return NULL
movl (%eax), %ebx                       # ebx = head->next
leal 1(%edx), %ecx                      # ecx = ABA counter + 1
lock cmpxchg8b (%esi)                   # swap {head,aba} -> {next,aba+1}
jne np_ac_i686_stack_mpmc_pop_dwcas_retry  # eax:edx refreshed by cmpxchg8b

# Shared epilogue: success falls through here; eax = node (or 0 on empty).
np_ac_i686_stack_mpmc_pop_dwcas_fail:
popl %ebx
popl %esi
ret




# node* ac_i686_lfgc_smr_activate(node **hazard, node **shared)  -- cdecl
# SMR (hazard pointer) acquire: load *shared, publish it into the hazard
# slot, then re-validate.  The mfence provides the #StoreLoad ordering that
# SMR requires between the hazard publish and the re-load of *shared.
# Returns the safely-hazarded node in %eax (may be NULL if *shared is NULL).
.align 16
.globl ac_i686_lfgc_smr_activate
ac_i686_lfgc_smr_activate:
movl 4(%esp), %edx                      # edx = hazard slot (arg1)
movl 8(%esp), %ecx                      # ecx = shared location (arg2)

ac_i686_lfgc_smr_activate_reload:
movl (%ecx), %eax                       # eax = *shared
movl %eax, (%edx)                       # publish as hazard pointer
mfence                                  # #StoreLoad before re-validation
cmpl (%ecx), %eax                       # did *shared change meanwhile?
jne ac_i686_lfgc_smr_activate_reload    # yes: publish the new value and retry
ret                                     # eax = hazarded node




# void ac_i686_lfgc_smr_deactivate(node **hazard)  -- cdecl
# SMR release: clear this thread's hazard-pointer slot with a plain store
# (no fence needed to retire a hazard on x86's store-ordered model).
# %eax happens to be 0 on return.
.align 16
.globl ac_i686_lfgc_smr_deactivate
ac_i686_lfgc_smr_deactivate:
movl 4(%esp), %ecx                      # ecx = hazard slot (arg1)
xorl %eax, %eax
movl %eax, (%ecx)                       # *hazard = NULL
ret




# void ac_i686_queue_spsc_push(queue*, node*)  -- cdecl
# Producer side of a single-producer/single-consumer queue.
# Queue layout appears to be {head at +0, tail at +4}; node->next is at +0.
# Safe without LOCK because exactly one producer mutates the tail.
.align 16
.globl ac_i686_queue_spsc_push
ac_i686_queue_spsc_push:
movl 4(%esp), %eax                      # eax = queue (arg1)
movl 8(%esp), %ecx                      # ecx = node (arg2)
movl 4(%eax), %edx                      # edx = queue tail
# sfence may be needed here for future x86
movl %ecx, (%edx)                       # tail->next = node (link in)
movl %ecx, 4(%eax)                      # queue tail = node
ret




# node* ac_i686_queue_spsc_pop(queue*)  -- cdecl
# Consumer side of the SPSC queue (dummy-node scheme: head is a stub).
# Returns the detached old dummy node carrying the popped payload, or NULL
# if the queue is empty.  Offset 12 is the node's `state` field per the
# ac_i686_node_t layout {next, lfgc_next, fp_dtor, state} -- TODO confirm.
.align 16
.globl ac_i686_queue_spsc_pop
ac_i686_queue_spsc_pop:
pushl %ebx                              # ebx is callee-saved (cdecl)
movl 8(%esp), %ecx                      # ecx = queue (arg1; +4 for the push)
movl (%ecx), %eax                       # eax = head (dummy node)
cmpl 4(%ecx), %eax                      # head == tail -> empty
je ac_i686_queue_spsc_pop_failed
movl (%eax), %edx                       # edx = head->next
# lfence may be needed here for future x86
movl 12(%edx), %ebx                     # ebx = next->state (the payload)
movl %edx, (%ecx)                       # head = next (next becomes new dummy)
movl %ebx, 12(%eax)                     # move payload into detached old dummy
popl %ebx
ret                                     # eax = old dummy with popped state

ac_i686_queue_spsc_pop_failed:
xorl %eax, %eax                         # empty: return NULL
popl %ebx
ret




# void ac_i686_mb_fence(void)  -- cdecl
# Full memory barrier (#StoreLoad included) via MFENCE.
.align 16
.globl ac_i686_mb_fence
ac_i686_mb_fence:
mfence
ret




# void ac_i686_mb_naked(void)  -- cdecl
# No-op barrier: a callable compiler-ordering point only; emits no fence.
.align 16
.globl ac_i686_mb_naked
ac_i686_mb_naked:
ret




# word ac_i686_mb_store_fence(word *dest, word value)  -- cdecl
# Full fence (MFENCE) executed BEFORE the plain store, ordering all prior
# accesses ahead of the store.  The stored value is left in %eax.
.align 16
.globl ac_i686_mb_store_fence
ac_i686_mb_store_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl 8(%esp), %eax                      # eax = value (arg2)
mfence                                  # fence, then store
movl %eax, (%ecx)                       # *dest = value
ret




# word ac_i686_mb_store_naked(word *dest, word value)  -- cdecl
# Plain (unfenced) store; prevents compiler reordering only by virtue of
# being an opaque call.  The stored value is left in %eax.
.align 16
.globl ac_i686_mb_store_naked
ac_i686_mb_store_naked:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl 8(%esp), %eax                      # eax = value (arg2)
movl %eax, (%ecx)                       # *dest = value
ret




# word ac_i686_mb_load_fence(word *src)  -- cdecl
# Load followed by a full fence (MFENCE), ordering the load ahead of all
# subsequent accesses.  Returns the loaded value in %eax.
.align 16
.globl ac_i686_mb_load_fence
ac_i686_mb_load_fence:
movl 4(%esp), %ecx                      # ecx = src (arg1)
movl (%ecx), %eax                       # eax = *src
mfence                                  # load, then fence
ret




# word ac_i686_mb_load_naked(word *src)  -- cdecl
# Plain (unfenced) load; opaque-call compiler-ordering only.
# Returns the loaded value in %eax.
.align 16
.globl ac_i686_mb_load_naked
ac_i686_mb_load_naked:
movl 4(%esp), %ecx                      # ecx = src (arg1)
movl (%ecx), %eax                       # eax = *src
ret




# word ac_i686_atomic_xchg_fence(word *dest, word value)  -- cdecl
# Atomic exchange.  XCHG with a memory operand is implicitly LOCKed on x86,
# so this is a full fence as well.  Returns the previous *dest in %eax.
.align 16
.globl ac_i686_atomic_xchg_fence
ac_i686_atomic_xchg_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl 8(%esp), %eax                      # eax = new value (arg2)
xchgl %eax, (%ecx)                      # implicit LOCK; eax = old *dest
ret




# word ac_i686_atomic_xadd_fence(word *dest, word addend)  -- cdecl
# Atomic fetch-and-add with full-fence semantics (LOCK).
# Returns the PRE-add value of *dest in %eax.
.align 16
.globl ac_i686_atomic_xadd_fence
ac_i686_atomic_xadd_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl 8(%esp), %eax                      # eax = addend (arg2)
lock xaddl %eax, (%ecx)                 # *dest += addend; eax = old value
ret




# word ac_i686_atomic_inc_fence(word *dest)  -- cdecl
# Atomic increment with full-fence semantics (LOCK XADD).
# Returns the POST-increment value in %eax (old value + 1).
.align 16
.globl ac_i686_atomic_inc_fence
ac_i686_atomic_inc_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl $1, %eax
lock xaddl %eax, (%ecx)                 # *dest += 1; eax = old value
incl %eax                               # convert to new value
ret




# word ac_i686_atomic_dec_fence(word *dest)  -- cdecl
# Atomic decrement with full-fence semantics (LOCK XADD of -1).
# Returns the POST-decrement value in %eax (old value - 1).
.align 16
.globl ac_i686_atomic_dec_fence
ac_i686_atomic_dec_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl $-1, %eax
lock xaddl %eax, (%ecx)                 # *dest -= 1; eax = old value
decl %eax                               # convert to new value
ret




# word ac_i686_atomic_cas_fence(word *dest, word cmp, word xchg)  -- cdecl
# Single-width atomic compare-and-swap with full-fence semantics (LOCK).
# Returns the value observed at *dest in %eax: equal to cmp iff the swap
# succeeded (cmpxchg reloads %eax with the current value on failure).
.align 16
.globl ac_i686_atomic_cas_fence
ac_i686_atomic_cas_fence:
movl 4(%esp), %ecx                      # ecx = dest (arg1)
movl 8(%esp), %eax                      # eax = comparand (arg2)
movl 12(%esp), %edx                     # edx = desired value (arg3)
lock cmpxchgl %edx, (%ecx)              # if (*dest == eax) *dest = edx
ret                                     # eax = observed prior value
__________________

MitchAlsup

unread,
Dec 3, 2023, 3:21:18 PM12/3/23
to
Do you have a HLL version of these ??

I would like to try esm on them.

Chris M. Thomasson

unread,
Dec 4, 2023, 7:47:04 PM12/4/23
to
On 12/3/2023 12:20 PM, MitchAlsup wrote:
> Do you have a HLL version of these ??
>
> I would like to try esm on them.

Nope. Damn it! However, the wayback machine helped me find some of my
original code. I think I have it on an older hard drive in a damn
warehouse...

Hummm... Afaict, I should be able to port this over into pure C++11, wrt
HLL... ;^) Humm... When I wrote this, well, I had to use asm in order to
try to avoid any potential C++ reordering, link time optimizations aside
for a moment. atomic and membars were not std then... ;^o

Chris M. Thomasson

unread,
Dec 4, 2023, 11:42:01 PM12/4/23
to
On 12/3/2023 12:20 PM, MitchAlsup wrote:
> Do you have a HLL version of these ??
>
> I would like to try esm on them.

Take note of:
___________________
.align 16
.globl ac_i686_lfgc_smr_activate
ac_i686_lfgc_smr_activate:
movl 4(%esp), %edx
movl 8(%esp), %ecx

ac_i686_lfgc_smr_activate_reload:
movl (%ecx), %eax
movl %eax, (%edx)
mfence
cmpl (%ecx), %eax
jne ac_i686_lfgc_smr_activate_reload
ret
___________________

This is an example of where a #StoreLoad style membar is required on an
x86. SMR is Safe Memory Reclamation, or aka Hazard Pointers.

MitchAlsup

unread,
Dec 5, 2023, 2:11:12 PM12/5/23
to
Chris M. Thomasson wrote:

> On 12/3/2023 12:20 PM, MitchAlsup wrote:
>> Do you have a HLL version of these ??
>>
>> I would like to try esm on them.

> Take note of:
> ___________________
> ..align 16
> ..globl ac_i686_lfgc_smr_activate
> ac_i686_lfgc_smr_activate:
> movl 4(%esp), %edx
> movl 8(%esp), %ecx

> ac_i686_lfgc_smr_activate_reload:
> movl (%ecx), %eax
> movl %eax, (%edx)
> mfence
> cmpl (%ecx), %eax
> jne ac_i686_lfgc_smr_activate_reload
> ret
> ___________________

> This is an example of where a #StoreLoad style membar is required on an
> x86. SMR is Safe Memory Reclamation, or aka Hazard Pointers.


esm performs a switch into 1) sequentially consistent at the beginning
of an ATOMIC event, 2) treats each memory reference in the event as
SC, and 3) reverts back to causal consistency after all the memory
references become visible instantaneously. So my ISA covers the
MemBar requirements automagically.

{
1) HW is in a position to know if a ST/LD or LD/LD MemBar is required
at the beginning of the event.
2) Uncacheable STs in the atomic event are performed in processor-order
==memory-order so that cacheable locks covering uncacheable memory bring
no surprises
3) HW is in a position to know if ST/LD or ST/ST MemBar is required after
leaving an event.
}
So software does not have to concern itself with the idiosyncrasies
of the memory model.

Chris M. Thomasson

unread,
Dec 5, 2023, 4:15:46 PM12/5/23
to
Fwiw, the only reason I needed to use mfence in my
ac_i686_lfgc_smr_activate function is to _honor_ ordering wrt the store
followed by a load to another location on i686. Now, fwiw, my friend Joe
Seigh created an interesting algorithm called SMR-RCU, a really neat
hybrid. This would allow me to elude the explicit #StoreLoad membar on
an x86 aka MFENCE or even a dummy LOCK RMW. Fwiw, loading a hazard
pointer does not require any atomic RMW logic...


> {
> 1) HW is in a position to know if a ST/LD or LD/LD MemBar is required
> at the beginning of the event.
> 2) Uncacheable STs in the atomic event are performed in processor-order
> ==memory-order so that cacheable locks covering uncacheable memory bring
> no surprises
> 3) HW is in a position to know if ST/LD or ST/ST MemaBr is required
> after leaving an event.
> }
> So software does not have to concern itself with the idiosyncrasies of
> the memory model.

So, when you get some _really_ free time to burn and you are bored, can
you show me what ac_i686_lfgc_smr_activate would look like in your
system? Can I just get rid of the MFENCE? If I can, well, that implies
sequential consistency.

Do you have a special compiler that can turn std C++11 code into asm
that works wrt your system? Is that why you asked me if I had a HLL
version of it?

Thanks.

MitchAlsup

unread,
Dec 5, 2023, 6:26:09 PM12/5/23
to
Chris M. Thomasson wrote:

> On 12/5/2023 11:09 AM, MitchAlsup wrote:
>> Chris M. Thomasson wrote:
>>
>>> On 12/3/2023 12:20 PM, MitchAlsup wrote:
>>>> Do you have a HLL version of these ??
>>>>
>>>> I would like to try esm on them.
>>
>>> Take note of:
>>> ___________________
>>> ..align 16
>>> ..globl ac_i686_lfgc_smr_activate
>>> ac_i686_lfgc_smr_activate:
>>>    movl 4(%esp), %edx
>>>    movl 8(%esp), %ecx

I cannot tell if the above is 2 LDs or 2 STs (one of the downsides of using MOV rather than LD or ST.)
Since the SP has not been altered at this point you should not be able to do STs to the stack, so I
assume LDs (for whatever reason). If these are arguments (as they appear) these are passed in registers
in My 66000, So I assume these 2 instructions are unnecessary.

>>
>>> ac_i686_lfgc_smr_activate_reload:
>>>    movl (%ecx), %eax
>>>    movl %eax, (%edx)
>>>    mfence
>>>    cmpl (%ecx), %eax

Here, it looks like you are checking that the value you just stored is the same as the value of
the memory container it was loaded from. This is checked by HW in My 66000. {{But I suggest this
sequence is prone to ABA failures since it is based on the bit pattern stored rather than the
fact the memory address was not written.}}

>>>    jne ac_i686_lfgc_smr_activate_reload
>>> ret

My attempt--based on the above realizations.

ac_i686_lfgc_smr_activate:
LD R4,[R2].lock
ST R4,[R1].lock
RET

The .lock on the LD begins the ATOMIC event and initializes the failure point
to ac_i686_lfgc_smr_activate_reload, which does not need a label or a branch;
core makes sure LD access is sequentially consistent with all previously
issued memory references before checking any deeper than the DCache and TLB,
and does not deliver LD.data until this state has been achieved. It is this
waiting that opens up window for a SNOOP to interfere with this sequence.

The .lock on the ST ends the ATOMIC event--so if no interference has been
detected, the event succeeds and tehST is performed, core reverts to causal
consistency--if interference has been detected, ST is cancelled, control
passes back to the initiator (LD.lock) and the event begins anew and afresh.
You get rid of a lot more than just the mfence. See Above.

> Do you have a special compiler that can turn std C++11 code into asm
> that works wrt your system? Is that why you asked me if I had a HLL
> version of it?

I have a 99% functional C compiler that runs many Fortran programs, but
C++ is a way bigger language {constructors, destructors, try-throw-catch,
their version of ATOMICs, threading, .....}

> Thanks.

Chris M. Thomasson

unread,
Dec 5, 2023, 6:44:38 PM12/5/23
to
On 12/5/2023 3:25 PM, MitchAlsup wrote:
> Chris M. Thomasson wrote:
>
>> On 12/5/2023 11:09 AM, MitchAlsup wrote:
>>> Chris M. Thomasson wrote:
>>>
>>>> On 12/3/2023 12:20 PM, MitchAlsup wrote:
>>>>> Do you have a HLL version of these ??
>>>>>
>>>>> I would like to try esm on them.
>>>
>>>> Take note of:
>>>> ___________________
>>>> ..align 16
>>>> ..globl ac_i686_lfgc_smr_activate
>>>> ac_i686_lfgc_smr_activate:
>>>>    movl 4(%esp), %edx
>>>>    movl 8(%esp), %ecx
>
> I cannot tell is the above is 2 LDs or 2 STs (one of the downsides of
> using MOV rather than LD or ST.)
> Since the SP has not been altered at this point you should not be able
> to do STs to the stack, so I
> assume LDs (for whatever reason). If these are arguments (as they
> appear) these are passed in registers
> in My 66000, So I assume these 2 instructions are unnecessary.

Sorry for the quick response! The C prototype for the function
ac_i686_lfgc_smr_activate is:
__________________________
AC_SYS_APIEXPORT ac_i686_node_t* AC_CDECL
ac_i686_lfgc_smr_activate
( ac_i686_lfgc_smr_hazard_t,
ac_i686_node_t** );
__________________________


That resided in the following file:

http://web.archive.org/web/20060214112610/http://appcore.home.comcast.net/appcore/include/cpu/i686/ac_i686_lfgc_smr_h.html

This is using cdecl for the ABI.

Notice that:

ac_i686_lfgc_smr_hazard_t is:

typedef ac_i686_node_t** ac_i686_lfgc_smr_hazard_t;

and ac_i686_node_t is:

/* MUST be four adjacent words */
typedef struct
AC_DECLSPEC_PACKED
ac_i686_node_
{
struct ac_i686_node_ *next;
struct ac_i686_node_ *lfgc_next;
ac_fp_dtor_t fp_dtor;
const void *state;

} ac_i686_node_t;

That can be found here:

http://web.archive.org/web/20060214112519/http://appcore.home.comcast.net/appcore/include/cpu/i686/ac_i686_h.html

I will get back to you! A bit constrained on time right now.








>
>>>
>>>> ac_i686_lfgc_smr_activate_reload:
>>>>    movl (%ecx), %eax
>>>>    movl %eax, (%edx)
>>>>    mfence
>>>>    cmpl (%ecx), %eax
>
> Here, it looks like you are checking that the value you just stored is
> the same as the value of
> the memory container it was loaded from.

It's a store followed by a load to another location. SMR needs this to
be honored.

MitchAlsup

unread,
Dec 5, 2023, 6:51:06 PM12/5/23
to
Chris M. Thomasson wrote:

> On 12/5/2023 3:25 PM, MitchAlsup wrote:
>>
>>>>> ac_i686_lfgc_smr_activate_reload:
>>>>>    movl (%ecx), %eax
>>>>>    movl %eax, (%edx)
>>>>>    mfence
>>>>>    cmpl (%ecx), %eax
>>
>> Here, it looks like you are checking that the value you just stored is
>> the same as the value of
>> the memory container it was loaded from.

> It's a store followed by a load to another location. SMR needs this to
> be honored.

This means I cannot read x86 anymore, so we need a different communication means.

Chris M. Thomasson

unread,
Dec 5, 2023, 7:06:56 PM12/5/23
to
[...]

Basically, SMR needs to load something from location A, store it in
location B, and reload from A and compare it to B. This needs a
StoreLoad relationship. Basically, location B would be on a per-thread
stack in TLS. So, iirc off the top of my head:

<pseudo-code>
______________
smr_reload:
a = atomic_load(&loc_a);

// critical!
atomic_store(&loc_b, a);
membar_storeload();
b = atomic_load(&loc_a);

if (a != b) goto smr_reload;
______________

Where loc_b usually resides in TLS. This is a key aspect of why SMR can
work at all.

Chris M. Thomasson

unread,
Dec 5, 2023, 7:15:35 PM12/5/23
to
Fwiw, I had to create these sensitive algorithms in asm in order for me
to be able to use them in C without it trying any fancy reordering funny
business. Well, back in 2005 when I wrote it there was no std for
atomics and membars wrt C/C++. So, I am using cdecl for the ABI into my
asm. Btw, this is in at&t syntax. Fwiw, here is one in MASM, aka intel
syntax:

https://web.archive.org/web/20060214112539/http://appcore.home.comcast.net/appcore/src/cpu/i686/ac_i686_masm_asm.html

align 16
ac_i686_lfgc_smr_activate PROC
mov edx, [esp + 4]
mov ecx, [esp + 8]

ac_i686_lfgc_smr_activate_reload:
mov eax, [ecx]
mov [edx], eax
mfence
cmp eax, [ecx]
jne ac_i686_lfgc_smr_activate_reload
ret
ac_i686_lfgc_smr_activate ENDP


Chris M. Thomasson

unread,
Dec 5, 2023, 7:26:01 PM12/5/23
to
On 12/5/2023 3:25 PM, MitchAlsup wrote:
> Chris M. Thomasson wrote:
[...]
> I have a 99% functional C compiler that runs many Fortran programs, but
> C++ is a way bigger language {constructors, destructors, try-throw-catch,
> their version of ATOMICs, threading, .....}

A C11 compiler that knows about membars and atomics? Fwiw, check this out:

http://www.smorgasbordet.com/pellesc

[...]

MitchAlsup

unread,
Dec 5, 2023, 9:21:02 PM12/5/23
to
Chris M. Thomasson wrote:

> Basically, SMR needs to load something from location A, store it in
> location B, and reload from A and compare it to B. This needs a
> StoreLoad relationship. Basically, location B would be on a per-thread
> stack in TLS. So, iirc off the top of my head:

{{I still contend this is sensitive to the ABA problem}}

> <pseudo-code>
> ______________
> smr_reload:
> a = atomic_load(&loc_a);

> // critical!
> atomic_store(&loc_b, a);
> membar_storeload();
> b = atomic_load(&loc_a);

> if (a != b) goto smr_reload;
> ______________

> Where loc_b usually resides in TLS. This is a key aspect of why SMR can
> work at all.


smr_reload:
LD R4,[R2].lock
ST R4,[R1].lock
-----------

As before: the LD waits until all older memory references are sequentially ordered,
a HW monitor is initialized and if there is a write, coherent invalidate, (or a
couple of other) accesses to the cache line touched by R2, the ST sill not be performed
and control reverts to the LD.lock instruction, creating a tight loop.

By the time control arrives at the instruction following the ST.lock a == b is guaranteed.

You can insert code to check this, but it is dead code under esm execution model.

Should you want to go somewhere other than smr_reload::

smr_reload:
LD R4,[R2].lock
BI somewhereelse
ST R4,[R1].lock
-----------
somewhereelse::
// control arrives here upon detection of interference.

Using other parts of ISA one could::

smr_reload:
MM R1,R2,#8

and have the guarantee that the memory-to-memory move was ATOMIC so you don't need
the .lock at all but you can't go somewhereelse.

None of the esm solutions are sensitive to the ABA problem.

MitchAlsup

unread,
Dec 5, 2023, 9:26:51 PM12/5/23
to
Chris M. Thomasson wrote:

> On 12/5/2023 3:25 PM, MitchAlsup wrote:
>> Chris M. Thomasson wrote:
> [...]
>> I have a 99% functional C compiler that runs many Fortran programs, but
>> C++ is a way bigger language {constructors, destructors, try-throw-catch,
>> their version of ATOMICs, threading, .....}

> A C11 compiler that knows about membars and atomics? Fwiw, check this out:

Where does it mention a My 66000 ISA target ?? That is the only ISA I am
spending time in.........

> http://www.smorgasbordet.com/pellesc

> [...]

Chris M. Thomasson

unread,
Dec 5, 2023, 9:33:06 PM12/5/23
to
On 12/5/2023 6:18 PM, MitchAlsup wrote:
> Chris M. Thomasson wrote:
>
>> Basically, SMR needs to load something from location A, store it in
>> location B, and reload from A and compare it to B. This needs a
>> StoreLoad relationship. Basically, location B would be on a per-thread
>> stack in TLS. So, iirc off the top of my head:
>
> {{I still contend this is sensitive to the ABA problem}}

Actually, SMR is not sensitive to the ABA problem at all in and of
itself. Fwiw, in fact SMR removes the ABA problem in ABA prone lock-free
algorithms. ABA exists mainly in CAS based algorithms. For instance
LL/SC based algorithms do not need to worry about ABA, but a different
can of worms. Usually pertaining to potential livelock in user programs
that do not take reservation granularity and proper alignment into
account. I found an old post of mine:

https://groups.google.com/g/comp.arch/c/yREvvvKvr6k/m/nRZ5tpLwDNQJ

Way back in 2005. Wow how time flies.
Neither is SMR. If a system is seq_cst, then no membar is needed in the
SMR algorithm. Also, Joe Seighs SMR+RCU hybrid does not need a membar in
SMR even on RMO systems. I need to find an old post from Joe about this.

Chris M. Thomasson

unread,
Dec 5, 2023, 9:35:36 PM12/5/23
to
I was just wondering if your C compiler handles C11? If so, that would
be great!!!!

MitchAlsup

unread,
Dec 6, 2023, 10:02:37 AM12/6/23
to
It handles whatever the current CLANG front end handles.

Chris M. Thomasson

unread,
Dec 6, 2023, 5:40:02 PM12/6/23
to
On 12/5/2023 6:18 PM, MitchAlsup wrote:
Fwiw, check this out:

https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0233r5.pdf

Chris M. Thomasson

unread,
Dec 6, 2023, 8:00:18 PM12/6/23
to
Okay. I will get back to you tonight or sometime tomorrow. I will ask
you to compile some std C11... I want to see what occurs, so to speak. :^)

Chris M. Thomasson

unread,
Dec 6, 2023, 8:56:20 PM12/6/23
to

MitchAlsup

unread,
Dec 6, 2023, 10:31:35 PM12/6/23
to
Under 2.3. Pros and Cons
<snip>
The main disadvantage of the hazard pointers method is that each traversal incurs a store-load
memory order fence, when using the method's basic form (without blocking or using system
support such as sys_membarrier()).

The transition from {sequential to causal*} consistency appears to take place at the
subsequent memory reference . There are 2 cases to
consider::
a) ST.lock remains in the execution window
b) ST.lock has retired

(*) or stronger {MMI/O, config}

The stage by stage rules seem to be::

No-Address:: younger memory references are not allowed to access CACHE;
after ST.lock AGENs, those younger memory references can access the cache.
{{The younger memory references can pass through AGEN, and optimistically
read tag, TLB, data but not change any state or deliver (LD) or accept
(ST) a value.}}

{Address:
Write-Permission:
No-Data-No-Line::
No-Data-Line::
Data-No-Line::} younger cacheable memory references with different line
index then ST.lock are allowed to be seen externally, less than cacheable
are not allowed...

Data-Line:: When this store is sequentially consistent with the rest of
processor memory order; ST.data has been performed. Less than cacheable
younger accesses are allowed.

Notice that ST.lock is the only timing point:: but all participating
STs remain processor consistent so all participating STs are performed
before or during ST.lock.

Chris M. Thomasson

unread,
Dec 7, 2023, 10:15:20 PM12/7/23
to
How about this, C11:
____________________________
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <stdatomic.h>


/* Minimal singly-linked node type used to exercise C11 atomic_exchange on
   an _Atomic(struct ct_node*) in main() -- the link itself is never
   traversed by the test. */
struct ct_node
{
struct ct_node* m_next; /* next link; always NULL in this test */
};


/* Smoke test for C11 atomic pointer exchange (seq_cst by default):
   swap a local node's address into an atomic pointer and back out,
   asserting that each exchange returns the previous value. */
int
main(void)
{
printf("ct_c_atomic_test...\n\n");
fflush(stdout);

{
/* shared atomic pointer, initially empty */
_Atomic(struct ct_node*) shared = NULL;

struct ct_node local = { NULL };

/* publish &local; the prior value must be the initial NULL */
struct ct_node* result_0 = atomic_exchange(&shared, &local);

assert(!result_0);

/* withdraw it again; the prior value must be &local */
struct ct_node* result_1 = atomic_exchange(&shared, NULL);

assert(result_1 == &local);
}

printf("completed!\n\n");

return 0;
}
____________________________

?

MitchAlsup

unread,
Dec 8, 2023, 6:02:07 PM12/8/23
to
Chris M. Thomasson wrote:

> On 12/6/2023 7:01 AM, MitchAlsup wrote:
>>
> How about this, C11:
> ____________________________
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
> #include <stdatomic.h>


> struct ct_node
> {
> struct ct_node* m_next;
> };


> int
> main(void)
> {
> printf("ct_c_atomic_test...nn");
> fflush(stdout);

> {
> _Atomic(struct ct_node*) shared = NULL;

> struct ct_node local = { NULL };

> struct ct_node* result_0 = atomic_exchange(&shared, &local);

> assert(!result_0);

> struct ct_node* result_1 = atomic_exchange(&shared, NULL);

> assert(result_1 == &local);
> }

> printf("completed!nn");

> return 0;
> }
> ____________________________

> ?


It should be approximately::

main:
ENTER R0,R0,#16
LEA R1,#"ct_c_atomic_test...nn" // printf("ct_c_atomic_test...nn");
CALL printf
MOV R1,#1 // fflush(stdout);
CALL fflush

MOV R2,#0 // shared = NULL; // pointer = 0; ?!?
ST R2,[SP,8] // local.m_next = NULL;
// for the life of me I can't see why the
// below code does not just SIGSEGV.
//..............................................// But I ignore that.....

ADD R5,SP,#8 // &local;
LD R3,[R2].lock // atomic_exchange(&shared
ST R5,[R2].lock // atomic_exchange(&shared = &local);
// ST R3,[SP,8] // local = atomic_exchange(); // dead

BEQ0 R3,assert1 // assert(!result_0);

// NULL = atomic_exchange(); is dead
LD R3,[R2].lock // atomic_exchange(&shared
ST #0,[R2].lock // atomic_exchange(&shared = NULL);
// R4 = result_1

// R5 already has &[sp+8]
CMP R4,R3,R5 // assert(result_1 == &local);
// last use R5, R3, R2
BEQ R4,assert2

LEA R1,#"completed!nn" // printf("completed!nn");
CALL printf
MOV R1,#0 // return 0;
EXIT R0,R0,#16

Chris M. Thomasson

unread,
Dec 8, 2023, 9:52:44 PM12/8/23
to
Actually, I am wondering why you "seem" think that it would have any
chance of SIGSEGV? The atomic exchanges are legit, all the memory
references are legit, no problem. Akin to, pseudo-code:
_________________
atomic<word*> shared = nullptr;
word local = 123;
word* x = shared.exchange(&local);
assert(x == nullptr);
word* y = shared.exchange(nullptr);
assert(y == &local);
_________________

Iirc, keep in mind that default membar is seq_cst in C/C++11. Unless I
foobar'ed it, it looks fine to me. :^)


>     ADD        R5,SP,#8        // &local;
>     LD        R3,[R2].lock        // atomic_exchange(&shared
>     ST        R5,[R2].lock        // atomic_exchange(&shared  =  &local);
> //    ST        R3,[SP,8]        // local = atomic_exchange();    // dead
>
>     BEQ0        R3,assert1        // assert(!result_0);
>
>                         // NULL = atomic_exchange(); is dead
>     LD        R3,[R2].lock        // atomic_exchange(&shared
>     ST        #0,[R2].lock        // atomic_exchange(&shared  =  NULL);
>                         // R4 = result_1
>
>                         // R5 already has &[sp+8]
>     CMP        R4,R3,R5        // assert(result_1 == &local);
>                         // last use R5, R3, R2
>     BEQ        R4,assert2
>
>     LEA        R1,#"completed!nn"    // printf("completed!nn");
>     CALL        printf
>     MOV        R1,#0            // return 0;
>     EXIT        R0,R0,#16


Ahhh! I need to examine this. Fwiw, MSVC has C11 atomic, but no threads.
What fun! ;^o

Afaict, PellesC has full C11 atomics, threads and membars.

MitchAlsup

unread,
Dec 8, 2023, 11:26:06 PM12/8/23
to
Why does _Atomic(struct ct_node*) shared = NULL; not set the shared
pointer to zero (NULL) ?? Apparently you are setting something at where
it is pointing to NULL; so how does shared get to be a pointer to something ??

> Iirc, keep in mind that default membar is seq_cst in C/C++11. Unless I
> foobar'ed it, it looks fine to me. :^)


>>     ADD        R5,SP,#8        // &local;
>>     LD        R3,[R2].lock        // atomic_exchange(&shared
>>     ST        R5,[R2].lock        // atomic_exchange(&shared  =  &local);
>> //    ST        R3,[SP,8]        // local = atomic_exchange();    // dead
>>
>>     BEQ0        R3,assert1        // assert(!result_0);
>>
>>                         // NULL = atomic_exchange(); is dead
>>     LD        R3,[R2].lock        // atomic_exchange(&shared
>>     ST        #0,[R2].lock        // atomic_exchange(&shared  =  NULL);
>>                         // R4 = result_1
>>
>>                         // R5 already has &[sp+8]
>>     CMP        R4,R3,R5        // assert(result_1 == &local);
>>                         // last use R5, R3, R2
>>     BEQ        R4,assert2
>>
>>     LEA        R1,#"completed!nn"    // printf("completed!nn");
>>     CALL        printf
>>     MOV        R1,#0            // return 0;
>>     EXIT        R0,R0,#16


> Ahhh! I need to examine this. Fwiw, MSVC has C11 atomic, but no threads.
> What fun! ;^o

You will find My 66000 ATOMICs to be a lot thinner that competing ISAs.

I looked at ARM ASM for this and ARM has converted the LD.lock;ST.lock
into a test-and-test-and-set loop. esm has automagic looping if there
is no interference check (BI), so::

    LD        R3,[R2].lock
// long number of cycles achieving sequential consistency and the value
// to be delivered to a register
ST R5,[R2].lock --- // must fail
| // automagically
LD R3,[r2].lock <----/ // try again
// fewer cycles
ST R5,[R2].lock // greater chance of success

If interference is detected making ST.lock unperformable, then control
reverts to the LD.lock. Now, we are already sequentially consistent
so the LD is performed and made visible externally with intent to write.

Oh, and BTW: this adds no state that persists across context switches--all
context switches cause the event to fail and control reverts to the
control point prior to context switching.

If you really do want test-and-test-and-set functionality::

Label:
    LD        R3,[R2]
    BC some_condition,R3,Label
    LD        R3,[R2].lock
ST R5,[R2].lock

Chris M. Thomasson

unread,
Dec 8, 2023, 11:45:34 PM12/8/23
to
First of all, think of a simple atomic exchange, ala RMW, LOCK XCHG wrt
x86, fine... The atomic shared thing here is the word*:

word* shared = nullptr;

Okay, the shared pointer is equal to nullptr.

Lets create a local word:

word local = 123;

This local word is equal to 123.

Lets LOCK XCHG them:

word* previous = atomic_exchange(&shared, &local);

previous better damn well equal nullptr, and shared better be a pointer
to local!

See?

Chris M. Thomasson

unread,
Dec 9, 2023, 1:54:16 AM12/9/23
to
Humm... Okay, lets start small here... How would you code up
atomic_exchange all by itself, as a standalone function?

say an atomic type would be say, word*:

So, atomic_exchange would be:

word* atomic_exchange(word**, word*)

as a prototype where atomic_exchange is analogous to the RMW in x86 aka
LOCK XCHG. What would that function alone look like in your system?

Chris M. Thomasson

unread,
Dec 9, 2023, 1:59:30 AM12/9/23
to
On 12/8/2023 10:54 PM, Chris M. Thomasson wrote:
> On 12/8/2023 8:25 PM, MitchAlsup wrote:
>> Chris M. Thomasson wrote:
[...]

Code up the function atomic_exchange, then my code works in your system
as is. That is one easy way to start... ;^)

MitchAlsup

unread,
Dec 9, 2023, 5:17:24 PM12/9/23
to
MitchAlsup wrote:

> Chris M. Thomasson wrote:

Having written::

>     LD        R3,[R2].lock
> // long number of cycles achieving sequential consistency and the value
> // to be delivered to a register
> ST R5,[R2[.lock --- // must fail
> | // automagically
> LD R3,[r2].lock <----/ // try again
> // fewer cycles
> ST R5,[R2[.lock // greater chance of success

It occurs to me that the magic control transfer to LD.lock arrives
with the notion the previous ATOMIC event has failed and that this
second execution of LD.lock should simply wait for the cache line
{as if performing a LD without the .lock and without the attempt to
obtain write permission, and when the cache line arrives, chase it
with a Coherent Invalidate:: performing the test-and-test-and-set
paradigm without actually encoding the paradigm in instructions}.

This should eliminate most of the desire for test-and-test-and-set
explicitly in the instruction stream.

MitchAlsup

unread,
Dec 10, 2023, 12:06:11 PM12/10/23
to
Chris M. Thomasson wrote:

>

> How about this, C11:
> ____________________________
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
> #include <stdatomic.h>

Can you provide a reference to stdatomic.h that discusses how the functions are
supposed to work at the HW level rather than at the SW level.

Scott Lurndal

unread,
Dec 10, 2023, 3:08:43 PM12/10/23
to
As I understand it, such a reference doesn't exist. The C++ standard
simply defines the guarantees the application can expect from the
implementation (compiler + OS).

The C11/C++11 Standard language version is here:

https://en.cppreference.com/w/cpp/header/stdatomic.h

GCC's version:

https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

MitchAlsup

unread,
Dec 10, 2023, 6:02:05 PM12/10/23
to
This one is much better.

Chris M. Thomasson

unread,
Dec 10, 2023, 7:16:05 PM12/10/23
to
On 12/10/2023 12:08 PM, Scott Lurndal wrote:
> mitch...@aol.com (MitchAlsup) writes:
>> Chris M. Thomasson wrote:
>>
>>>
>>
>>> How about this, C11:
>>> ____________________________
>>> #include <stdio.h>
>>> #include <stdlib.h>
>>> #include <assert.h>
>>> #include <stdatomic.h>
>>
>> Can you provide a reference to stdatomic.h that discusses how the functions are
>> supposed to work at the HW level rather than at the SW level.
>
> As I understand it, such a reference doesn't exist.

I think so. An atomic exchange can be implemented with an atomic RMW
exchange (LOCK XCHG), CAS (cmpxchg), or even LL/SC.

MitchAlsup

unread,
Dec 10, 2023, 7:31:57 PM12/10/23
to
Chris M. Thomasson wrote:

> On 12/10/2023 12:08 PM, Scott Lurndal wrote:
>> mitch...@aol.com (MitchAlsup) writes:
>>> Chris M. Thomasson wrote:
>>>
>>>>
>>>
>>>> How about this, C11:
>>>> ____________________________
>>>> #include <stdio.h>
>>>> #include <stdlib.h>
>>>> #include <assert.h>
>>>> #include <stdatomic.h>
>>>
>>> Can you provide a reference to stdatomic.h that discusses how the functions are
>>> supposed to work at the HW level rather than at the SW level.
>>
>> As I understand it, such a reference doesn't exist.

> I think so. An atomic exchange can be implemented with an atomic RMW
> exchange (LOCK XCHG), CAS (cmpxchg), or even LL/SC.

More like::

An Atomic Exchange has to reach a point of sequential consistency (which
may entail a MEMBAR on machines with relaxed memory ordering) before
the address of the exchanged container is made visible to the system.
The exchange is performed in such a way that the stored value is visible
to the system prior to any other access to that container is made visible
to the system (this may also require an MEMBAR on systems with relaxed
memory orderings.)

Chris M. Thomasson

unread,
Dec 10, 2023, 7:48:03 PM12/10/23
to
On 12/10/2023 4:15 PM, Chris M. Thomasson wrote:
> On 12/10/2023 12:08 PM, Scott Lurndal wrote:
>> mitch...@aol.com (MitchAlsup) writes:
>>> Chris M. Thomasson wrote:
>>>
>>>>
>>>
>>>> How about this, C11:
>>>> ____________________________
>>>> #include <stdio.h>
>>>> #include <stdlib.h>
>>>> #include <assert.h>
>>>> #include <stdatomic.h>
>>>
>>> Can you provide a reference to stdatomic.h that discusses how the
>>> functions are
>>> supposed to work at the HW level rather than at the SW level.
>>
>> As I understand it, such a reference doesn't exist.
>
> I think so. An atomic exchange can be implemented with an atomic RMW
> exchange (LOCK XCHG), CAS (cmpxchg), or even LL/SC.
[...]

Also, fwiw, atomic exchange can be implemented using various hashed
locking schemes. This would require is_lock_free to return false:

https://cplusplus.com/reference/atomic/atomic/is_lock_free/


Chris M. Thomasson

unread,
Dec 10, 2023, 7:50:51 PM12/10/23
to
On 12/10/2023 4:30 PM, MitchAlsup wrote:
> Chris M. Thomasson wrote:
>
>> On 12/10/2023 12:08 PM, Scott Lurndal wrote:
>>> mitch...@aol.com (MitchAlsup) writes:
>>>> Chris M. Thomasson wrote:
>>>>
>>>>>
>>>>
>>>>> How about this, C11:
>>>>> ____________________________
>>>>> #include <stdio.h>
>>>>> #include <stdlib.h>
>>>>> #include <assert.h>
>>>>> #include <stdatomic.h>
>>>>
>>>> Can you provide a reference to stdatomic.h that discusses how the
>>>> functions are
>>>> supposed to work at the HW level rather than at the SW level.
>>>
>>> As I understand it, such a reference doesn't exist.
>
>> I think so. An atomic exchange can be implemented with an atomic RMW
>> exchange (LOCK XCHG), CAS (cmpxchg), or even LL/SC.
>
> More like::
>
> An Atomic Exchange has to reach a point of sequential consistency (which
> may entail a MEMBAR on machines with relaxed memory ordering) before
> the address of the exchanged container is made visible to the system.

Not really... Atomic exchange can be implemented in relaxed form wrt no
memory barriers in sight. Just as long as it does its job, an atomic
swap. On the SPARC I had to decorate atomic exchange with the correct
membars to get the job done. The weakest I could get away with...

MitchAlsup

unread,
Dec 10, 2023, 8:21:03 PM12/10/23
to
Ok, then write the paragraph I tried to write with sufficient detail that
a CPU designer, who knows nothing of software, can read but cannot misunderstand.
Fit all necessary sequencing, barriers, timing, ... needed such that any CPU
sequence designer will achieve a successful atomic_exchange over all his
(or her !) designs.

Chris M. Thomasson

unread,
Dec 10, 2023, 8:21:40 PM12/10/23
to
I should say I had to use the most relaxed membar scheme I could wrt the
requirements of the lock/wait free algorithm I was working with,
creating, etc... naked atomic exchange is a real thing. Naked in the
sense of no implied membars!

Chris M. Thomasson

unread,
Dec 10, 2023, 8:52:25 PM12/10/23
to
Think of different ways to implement LOCK XCHG. Two come to mind, a
pessimistic lock based scheme, ala intel, or a more opportunistic method
akin to LL/SC?

lock based version, pseudo code typed in the newsreader, hope I did not
make a damn mistake:
______________________________
/*
 * Atomically swaps *origin with xchg and returns the previous value of
 * *origin (the C11 atomic_exchange contract), implemented WITHOUT a
 * hardware RMW instruction: mutual exclusion comes from an external
 * table of mutexes indexed by a hash of the address.  An implementation
 * built this way is not lock-free, so atomic_is_lock_free / the
 * ATOMIC_*_LOCK_FREE macros would have to report "not lock free".
 * NOTE(review): `word`, hash_lock() and hash_unlock() are defined
 * elsewhere; hash_lock(p)/hash_unlock(p) presumably lock/unlock the
 * mutex bucket that p hashes to -- confirm against the poster's linked
 * hashed-locking scheme.
 */
word*
atomic_exchange(
word** origin,
word* xchg
){
hash_lock(origin);   /* acquire the mutex guarding this address */

// RMW
word* original = *origin;   /* Read  */
*origin = xchg;             /* Write (Modify is the identity here) */

hash_unlock(origin);  /* release; swap is now visible to other threads */

return original;      /* previous contents of *origin */
}
______________________________

That is one way to do it using a hashed locking scheme.

David Brown

unread,
Dec 11, 2023, 4:09:25 AM12/11/23
to

MitchAlsup

unread,
Dec 14, 2023, 3:56:04 PM12/14/23
to
Chris M. Thomasson wrote:

> On 12/10/2023 5:20 PM, MitchAlsup wrote:
>>
> ______________________________
> word*
> atomic_exchange(
> word** origin,
> word* xchg
> ){
> hash_lock(origin);

> // RMW
> word* original = *origin;
> *origin = xchg;

> hash_unlock(origin);

> return original;
> }
> ______________________________

As written, and assuming hash_lock() and hash_unlock are function calls
that guarantee the atomicity of the exchange::

atomic_exchange:
ENTER R28,R0,#24
MOV R30,R1
MOV R29,R2

CALL hash_lock

LD R28,[R30]
ST R29,[R30]

MOV R1,R30
CALL hash_unlock

MOV R1,R28
EXIT R28,R0,#24

But I suspect that hash_{[un]lock} is not a function call but a macro
that offsets into the structure and performs a LL while un performs
the SC. Here is an example where we do not even use the value in the
addressed memory container, but simply monitor the cache line for
interference; In which case we get::

atomic_exchange:
PRE #19,[R1+lock].lock

LD R3,[R1]
ST R2,[R1]

PUSH #3,[R1+lock].lock

MOV R1,R3
RET

If interference is detected, control automagically reverts to PRE
#19 specifies Write permission and L1 cache.

The PUSH #3 does not change the value in the location, just releases
the lock and leaves the line in L1 cache. Interference is detected
when some other resource requests write permission {and is at higher
priority}.

Let us see how interference plays out::

atomic_exchange:

PRE #19,[R1+lock].lock // control-point = .
LD R3,[R1]
ST R2,[R1]
{ repeat
PUSH #3,[R1+lock].lock -fails----
| // IP = control-point
PRE #19,[R1+lock].lock -arrives-/

LD R3,[R1]
ST R2,[R1]
} any numbers of times
PUSH #3,[R1+lock].lock -succ----
|
MOV R1,R3 -arrives-/
RET
_____________________________________________________________________

By majority vote, the .lock notation is being retired and the Lock
designation is obtained by concatenating an L on the end of the
memory reference mnemonic::

LDmem Rd,[Rb...].lock
becomes:
LDmemL Rd,[Rb...]
etcetera.

Chris M. Thomasson

unread,
Dec 14, 2023, 9:28:46 PM12/14/23
to
On 12/14/2023 12:55 PM, MitchAlsup wrote:
> Chris M. Thomasson wrote:
>
>> On 12/10/2023 5:20 PM, MitchAlsup wrote:
>>>
>> ______________________________
>> word*
>> atomic_exchange(
>>    word** origin,
>>    word* xchg
>> ){
>>     hash_lock(origin);
>
>>       // RMW
>>       word* original = *origin;
>>       *origin = xchg;
>
>>     hash_unlock(origin);
>
>>     return original;
>> }
>> ______________________________
>
> As written, and assuming hash_lock() and hash_unlock are function calls
> that guarentee the atomicity of the exchange::
[...]

Fwiw, my example of atomic exchange using locking is taking into account
one of my previous experiments that hashes addresses into indexes into a
mutex table. The mutex table is completely separated from the user logic.

https://groups.google.com/g/comp.lang.c++/c/sV4WC_cBb9Q/m/5JRwvhpVCAAJ
(read all)

This is one way to implement C++ atomics using locks! This would require
me to report that the impl is not lock-free via is_lock_free and some
other places.

Now, this is a locked version. The wait free version on x86 would be
LOCK XCHG.

MitchAlsup

unread,
Dec 14, 2023, 11:17:00 PM12/14/23
to
So would you assign the # define ATOMIC_BOOL_LOCK_FREE a 1 (sometimes lock
free) or 2 (always lock free) or 0 (not lock free)

Chris M. Thomasson

unread,
Dec 14, 2023, 11:58:31 PM12/14/23
to
Good question Mitch! :^)

I would simply have to set it to 0 simply because it is using one of my
hash locking schemes wrt the context, it is not lock-free. Sometimes
lock-free can be potentially an "adaptive" lock-free algo, that strives
to be lock-free but can "choose" to block in the kernel during certain
times... The lock-free has its mind set on trying to avoid the kernel.
However, since it can potentially hit a path that goes into a kernel
wait, well, sometimes lock-free can be in order.... This would be
sometimes lock-free. However, if you want to use it in a signal handler,
it better damn well be 100% lock-free, at least! Wait-free fine, etc...

Chris M. Thomasson

unread,
Dec 14, 2023, 11:59:19 PM12/14/23
to
Striving for lock/wait-free instead of blocking in the kernel! Well, I
got my mind set on that!

https://youtu.be/6ZwjdGSqO0k

:^)

Chris M. Thomasson

unread,
Dec 15, 2023, 12:47:09 AM12/15/23
to
[...]

It's that damn Store to Load memory order requirement. There are ways
around it wrt current arch's, dec alpha aside for a moment... I
mentioned one of them in this thread.