This is experimental for now, but the code should work just fine:
/* Word type used for every field of the mutex. */
typedef int ac_i686_intword_t;

/*
 * Shared state for the two-thread mutex implemented by the
 * ac_i686_mutex_lock_N / ac_i686_mutex_unlock_N assembly routines.
 *
 * The three words must be adjacent (hence "packed") because the assembly
 * addresses them as consecutive words at fixed offsets from the start of
 * the structure, and the object is aligned on a 128-byte boundary.
 */
typedef struct __attribute__( (packed, aligned( 128 )) ) ac_i686_mutex_
{
  ac_i686_intword_t m1;  /* first word:  set to 1 by lock_1, cleared by unlock_1 */
  ac_i686_intword_t m2;  /* second word: set to 1 by lock_2, cleared by unlock_2 */
  ac_i686_intword_t t;   /* third word:  lock_1 stores 2 here, lock_2 stores 1 */
} ac_i686_mutex_t;
/*
 * Put a mutex into its initial state: both per-thread words (m1, m2)
 * cleared and the t word set to 1.
 *
 * The body is wrapped in do { ... } while ( 0 ) so the macro expands to a
 * single statement; it can therefore be used safely as the body of an
 * unbraced if/else or loop.  Invoke it with a trailing semicolon.
 *
 * ac_macro_this: pointer to the mutex.  It is evaluated three times, so the
 * argument expression must not have side effects.
 */
#define ac_i686_mutex_init( ac_macro_this ) \
  do { \
    (ac_macro_this)->m1 = 0; \
    (ac_macro_this)->m2 = 0; \
    (ac_macro_this)->t = 1; \
  } while ( 0 )
/*
 * Entry points implemented in assembly.  Each takes a pointer to the mutex
 * as its only argument and returns nothing.  The _1 routines operate on the
 * m1 word and the _2 routines on the m2 word of the mutex, so each of the
 * two participating threads uses its own lock/unlock pair.
 */

/* Thread 1 */
void ac_i686_mutex_lock_1( ac_i686_mutex_t* ac_this );
void ac_i686_mutex_unlock_1( ac_i686_mutex_t* ac_this );

/* Thread 2 */
void ac_i686_mutex_lock_2( ac_i686_mutex_t* ac_this );
void ac_i686_mutex_unlock_2( ac_i686_mutex_t* ac_this );
;-----------------------------------------------------------------------
; void ac_i686_mutex_lock_1( ac_i686_mutex_t* )
;
; Acquire the mutex on behalf of thread 1.
;
; In:    [esp + 4] = pointer to the mutex (three dwords: m1 at +0,
;                    m2 at +4, t at +8)
; Out:   nothing; returns with a plain ret (caller removes the argument)
; Uses:  eax (mutex pointer), flags
;
; Sequence: announce interest (m1 = 1), hand the turn to thread 2 (t = 2),
; then spin until thread 2 is not interested (m2 == 0) or the turn has
; been handed back to thread 1 (t == 1).
;-----------------------------------------------------------------------
align 16
ac_i686_mutex_lock_1 PROC
  mov eax, [esp + 4]                  ; eax = mutex
  mov dword ptr [eax], 1              ; m1 = 1
  mov dword ptr [eax + 8], 2          ; t = 2
  mfence                              ; both stores must be visible before the loads below
ac_i686_mutex_lock_1_spin:
  cmp dword ptr [eax + 4], 0          ; m2 == 0 ?
  je ac_i686_mutex_lock_1_acquired
  pause
  cmp dword ptr [eax + 8], 1          ; t == 1 ?
  je ac_i686_mutex_lock_1_acquired
  pause
  jmp ac_i686_mutex_lock_1_spin
ac_i686_mutex_lock_1_acquired:
  ret
ac_i686_mutex_lock_1 ENDP
;-----------------------------------------------------------------------
; void ac_i686_mutex_unlock_1( ac_i686_mutex_t* )
;
; Release the mutex on behalf of thread 1 by clearing m1 (dword at +0).
;
; In:    [esp + 4] = pointer to the mutex
; Out:   nothing; returns with a plain ret (caller removes the argument)
; Uses:  ecx (mutex pointer)
;-----------------------------------------------------------------------
align 16
ac_i686_mutex_unlock_1 PROC
  mov ecx, [esp + 4]                  ; ecx = mutex
  mov dword ptr [ecx], 0              ; m1 = 0
  ret
ac_i686_mutex_unlock_1 ENDP
;-----------------------------------------------------------------------
; void ac_i686_mutex_lock_2( ac_i686_mutex_t* )
;
; Acquire the mutex on behalf of thread 2.
;
; In:    [esp + 4] = pointer to the mutex (three dwords: m1 at +0,
;                    m2 at +4, t at +8)
; Out:   nothing; returns with a plain ret (caller removes the argument)
; Uses:  eax (mutex pointer), flags
;
; Sequence: announce interest (m2 = 1), hand the turn to thread 1 (t = 1),
; then spin until thread 1 is not interested (m1 == 0) or the turn has
; been handed back to thread 2 (t == 2).
;-----------------------------------------------------------------------
align 16
ac_i686_mutex_lock_2 PROC
  mov eax, [esp + 4]                  ; eax = mutex
  mov dword ptr [eax + 4], 1          ; m2 = 1
  mov dword ptr [eax + 8], 1          ; t = 1
  mfence                              ; both stores must be visible before the loads below
ac_i686_mutex_lock_2_spin:
  cmp dword ptr [eax], 0              ; m1 == 0 ?
  je ac_i686_mutex_lock_2_acquired
  pause
  cmp dword ptr [eax + 8], 2          ; t == 2 ?
  je ac_i686_mutex_lock_2_acquired
  pause
  jmp ac_i686_mutex_lock_2_spin
ac_i686_mutex_lock_2_acquired:
  ret
ac_i686_mutex_lock_2 ENDP
;-----------------------------------------------------------------------
; void ac_i686_mutex_unlock_2( ac_i686_mutex_t* )
;
; Release the mutex on behalf of thread 2 by clearing m2 (dword at +4).
;
; In:    [esp + 4] = pointer to the mutex
; Out:   nothing; returns with a plain ret (caller removes the argument)
; Uses:  eax (mutex pointer)
;-----------------------------------------------------------------------
align 16
ac_i686_mutex_unlock_2 PROC
  mov eax, [esp + 4]                  ; eax = mutex
  mov dword ptr [eax + 4], 0          ; m2 = 0
  ret
ac_i686_mutex_unlock_2 ENDP
-- 
http://appcore.home.comcast.net/
(portable lock-free data-structures) 
this is redundant, you can remove it.