
[PATCH 2/4] MCS Lock: optimizations and extra comments


Tim Chen

Nov 4, 2013, 6:40:02 PM

Remove the unnecessary node->locked = 1 store on the uncontended
mcs_spin_lock() path, and mark the cmpxchg(lock, node, NULL) == node
check in mcs_spin_unlock() as likely(), since most of the time no race
with a new waiter occurs.

Also add more comments describing how the local node is used in MCS locks.

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index b5de3b0..96f14299 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -18,6 +18,12 @@ struct mcs_spinlock {
};

/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
* We don't inline mcs_spin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
@@ -33,7 +39,6 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
@@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mutex_cpu_relax();
}

+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
@@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/*
* Release the lock by setting it to NULL
*/
- if (cmpxchg(lock, node, NULL) == node)
+ if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
--
1.7.4.4




Tim Chen

Nov 4, 2013, 6:40:02 PM

We will need the MCS lock code for doing optimistic spinning for rwsem.
Extracting the MCS code from mutex.c and putting it into its own file allows us
to reuse this code easily for rwsem.

Reviewed-by: Ingo Molnar <mi...@elte.hu>
Reviewed-by: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davi...@hp.com>
---
include/linux/mcs_spinlock.h | 64 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 ++-
kernel/mutex.c | 60 ++++----------------------------------
3 files changed, 74 insertions(+), 55 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
new file mode 100644
index 0000000..b5de3b0
--- /dev/null
+++ b/include/linux/mcs_spinlock.h
@@ -0,0 +1,64 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+ smp_wmb();
+}
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index ccd4260..e6eaeea 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,6 +46,7 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
+struct mcs_spinlock;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -55,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- void *spin_mlock; /* Spinner MCS lock */
+ struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
@@ -179,4 +180,4 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
#define arch_mutex_cpu_relax() cpu_relax()
#endif

-#endif
+#endif /* __LINUX_MUTEX_H */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 6d647ae..4640731 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include <linux/mcs_spinlock.h>

/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->spin_mlock = NULL;
+ lock->mcs_lock = NULL;
#endif

debug_mutex_init(lock, name, key);
@@ -111,54 +112,7 @@ EXPORT_SYMBOL(mutex_lock);
* more or less simultaneously, the spinners need to acquire a MCS lock
* first before spinning on the owner field.
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}

/*
* Mutex spinning code migrated from kernel/sched/core.c
@@ -448,7 +402,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,

for (;;) {
struct task_struct *owner;
- struct mspin_node node;
+ struct mcs_spinlock node;

if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
struct ww_mutex *ww;
@@ -470,10 +424,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- mspin_lock(MLOCK(lock), &node);
+ mcs_spin_lock(&lock->mcs_lock, &node);
owner = ACCESS_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner)) {
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);
goto slowpath;
}

@@ -488,11 +442,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}

mutex_set_owner(lock);
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);
preempt_enable();
return 0;
}
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);

/*
* When there's no owner, we might have preempted between the

Tim Chen

Nov 4, 2013, 6:40:03 PM

In this patch series, we separated out the MCS lock code which was previously embedded in
mutex.c. This allows for easier reuse of the MCS lock in other places like rwsem and qrwlock.
We also did some micro optimizations and barrier cleanup.

These patches were previously part of the rwsem optimization patch series, but now we separate
them out.

Tim Chen

Jason Low (2):
MCS Lock: optimizations and extra comments
MCS Lock: Barrier corrections

Tim Chen (1):
MCS Lock: Restructure the MCS lock defines and locking code into its
own file

Waiman Long (1):
MCS Lock: Make mcs_spinlock.h includable in other files

include/linux/mcs_spinlock.h | 100 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 +-
kernel/Makefile | 6 +-
kernel/mcs_spinlock.c | 37 +++++++++++++++
kernel/mutex.c | 60 +++----------------------
5 files changed, 150 insertions(+), 58 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h
create mode 100644 kernel/mcs_spinlock.c

Tim Chen

Nov 4, 2013, 6:40:03 PM

This patch corrects the way memory barriers are used in the MCS lock
and removes ones that are not needed. Also add comments on all barriers.

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 96f14299..93d445d 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -36,16 +36,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;

+ /* xchg() provides a memory barrier */
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
+
+ /* Make sure subsequent operations happen after the lock is acquired */
+ smp_rmb();
}

/*
@@ -58,6 +61,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod

if (likely(!next)) {
/*
+ * cmpxchg() provides a memory barrier.
* Release the lock by setting it to NULL
*/
if (likely(cmpxchg(lock, node, NULL) == node))
@@ -65,9 +69,14 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
+ } else {
+ /*
+ * Make sure all operations within the critical section
+ * happen before the lock is released.
+ */
+ smp_wmb();
}
ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
}

#endif /* __LINUX_MCS_SPINLOCK_H */

Tim Chen

Nov 4, 2013, 6:40:03 PM

The following changes are made to enable the mcs_spinlock.h file to be
widely included in other files without causing problems:

1) Include a number of prerequisite header files and define
arch_mutex_cpu_relax(), if not previously defined.
2) Separate out mcs_spin_lock() into a mcs_spinlock.c file.
3) Make mcs_spin_unlock() an inlined function.

Signed-off-by: Waiman Long <Waima...@hp.com>
---
include/linux/mcs_spinlock.h | 28 +++++++++++++++++++++++-----
kernel/Makefile | 6 +++---
kernel/mcs_spinlock.c | 37 +++++++++++++++++++++++++++++++++++++
3 files changed, 63 insertions(+), 8 deletions(-)
create mode 100644 kernel/mcs_spinlock.c

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 93d445d..0b36927 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -12,11 +12,27 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
};

+extern
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
+
/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
@@ -24,11 +40,11 @@ struct mcs_spinlock {
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * The _raw_mcs_spin_lock() function should not be called directly. Instead,
+ * users should call mcs_spin_lock().
*/
-static noinline
-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;

@@ -55,7 +71,9 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);

diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce4755..2ad8454 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,9 +50,9 @@ obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/mcs_spinlock.c b/kernel/mcs_spinlock.c
new file mode 100644
index 0000000..6b20324
--- /dev/null
+++ b/kernel/mcs_spinlock.c
@@ -0,0 +1,37 @@
+/*
+ * MCS lock
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#include <linux/mcs_spinlock.h>
+#include <linux/export.h>
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+EXPORT_SYMBOL(mcs_spin_lock);

Peter Zijlstra

Nov 5, 2013, 6:20:01 AM

On Mon, Nov 04, 2013 at 03:37:13PM -0800, Tim Chen wrote:
> +EXPORT_SYMBOL(mcs_spin_lock);

If that can be a GPL export, please make it so.
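
That is, switch to the GPL-only export form, which the v2 repost later in
this thread adopts; for example:

-EXPORT_SYMBOL(mcs_spin_lock);
+EXPORT_SYMBOL_GPL(mcs_spin_lock);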

Tim Chen

Nov 5, 2013, 12:50:02 PM

In this patch series, we separated out the MCS lock code which was
previously embedded in mutex.c. This allows for easier reuse of
the MCS lock in other places like rwsem and qrwlock. We also did some micro
optimizations and barrier cleanup.

These patches were previously part of the rwsem optimization patch series,
but now we separate them out.

Tim Chen

v2:
1. change the mcs_spin_lock export to a GPL export symbol
2. corrected mcs_spin_lock to references


Jason Low (2):
MCS Lock: optimizations and extra comments
MCS Lock: Barrier corrections

Tim Chen (1):
MCS Lock: Restructure the MCS lock defines and locking code into its
own file

Waiman Long (1):
MCS Lock: Make mcs_spinlock.h includable in other files

include/linux/mcs_spinlock.h | 99 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 +-
kernel/Makefile | 6 +-
kernel/mcs_spinlock.c | 21 +++++++++
kernel/mutex.c | 60 +++----------------------
5 files changed, 133 insertions(+), 58 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h
create mode 100644 kernel/mcs_spinlock.c

--
1.7.4.4


Tim Chen

Nov 5, 2013, 12:50:03 PM

This patch corrects the way memory barriers are used in the MCS lock
and removes ones that are not needed. Also add comments on all barriers.

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 96f14299..93d445d 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -36,16 +36,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;

+ /* xchg() provides a memory barrier */
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
/* Wait until the lock holder passes the lock down */

Tim Chen

Nov 5, 2013, 12:50:03 PM

We will need the MCS lock code for doing optimistic spinning for rwsem.
Extracting the MCS code from mutex.c and putting it into its own file allows us
to reuse this code easily for rwsem.

Reviewed-by: Ingo Molnar <mi...@elte.hu>
Reviewed-by: Peter Zijlstra <pet...@infradead.org>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davi...@hp.com>
---
include/linux/mcs_spinlock.h | 64 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 ++-
kernel/mutex.c | 60 ++++----------------------------------
3 files changed, 74 insertions(+), 55 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
new file mode 100644
index 0000000..b5de3b0
--- /dev/null
+++ b/include/linux/mcs_spinlock.h
@@ -0,0 +1,64 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();

Tim Chen

Nov 5, 2013, 12:50:03 PM

The following changes are made to enable the mcs_spinlock.h file to be
widely included in other files without causing problems:

1) Include a number of prerequisite header files and define
arch_mutex_cpu_relax(), if not previously defined.
2) Make mcs_spin_unlock() an inlined function and
rename mcs_spin_lock() to _raw_mcs_spin_lock() which is also an
inlined function.
3) Create a new mcs_spinlock.c file to contain the non-inlined
mcs_spin_lock() function.

Signed-off-by: Waiman Long <Waima...@hp.com>
---
include/linux/mcs_spinlock.h | 27 ++++++++++++++++++++++-----
kernel/Makefile | 6 +++---
kernel/mcs_spinlock.c | 21 +++++++++++++++++++++
3 files changed, 46 insertions(+), 8 deletions(-)
create mode 100644 kernel/mcs_spinlock.c

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 93d445d..f2c71e8 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -12,11 +12,27 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
};

+extern
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
+
/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
@@ -24,11 +40,11 @@ struct mcs_spinlock {
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * The _raw_mcs_spin_lock() function should not be called directly. Instead,
+ * users should call mcs_spin_lock().
*/
-static noinline
-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;

@@ -55,7 +71,8 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);

diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce4755..2ad8454 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,9 +50,9 @@ obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/mcs_spinlock.c b/kernel/mcs_spinlock.c
new file mode 100644
index 0000000..3c55626
--- /dev/null
+++ b/kernel/mcs_spinlock.c
@@ -0,0 +1,21 @@
+/*
+ * MCS lock
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#include <linux/mcs_spinlock.h>
+#include <linux/export.h>
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ _raw_mcs_spin_lock(lock, node);
+}
+EXPORT_SYMBOL_GPL(mcs_spin_lock);

Tim Chen

Nov 5, 2013, 12:50:03 PM

Remove the unnecessary node->locked = 1 store on the uncontended
mcs_spin_lock() path, and mark the cmpxchg(lock, node, NULL) == node
check in mcs_spin_unlock() as likely(), since most of the time no race
with a new waiter occurs.

Also add more comments describing how the local node is used in MCS locks.

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index b5de3b0..96f14299 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -18,6 +18,12 @@ struct mcs_spinlock {
};

/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
* We don't inline mcs_spin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
@@ -33,7 +39,6 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
@@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mutex_cpu_relax();
}

+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
@@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/*
* Release the lock by setting it to NULL
*/
- if (cmpxchg(lock, node, NULL) == node)
+ if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))

Peter Zijlstra

Nov 5, 2013, 2:00:01 PM

On Tue, Nov 05, 2013 at 09:42:39AM -0800, Tim Chen wrote:
> + * The _raw_mcs_spin_lock() function should not be called directly. Instead,
> + * users should call mcs_spin_lock().
> */
> -static noinline
> -void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> +static inline
> +void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> {
> struct mcs_spinlock *prev;
>

So why keep it in the header at all?

Tim Chen

Nov 5, 2013, 2:40:02 PM

On Tue, 2013-11-05 at 19:57 +0100, Peter Zijlstra wrote:
> On Tue, Nov 05, 2013 at 09:42:39AM -0800, Tim Chen wrote:
> > + * The _raw_mcs_spin_lock() function should not be called directly. Instead,
> > + * users should call mcs_spin_lock().
> > */
> > -static noinline
> > -void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> > +static inline
> > +void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> > {
> > struct mcs_spinlock *prev;
> >
>
> So why keep it in the header at all?

I also made the suggestion originally of keeping both lock and unlock in
mcs_spinlock.c. Wonder if Waiman decides to keep them in header
because in-lining the unlock function makes execution a bit faster?

Tim

Michel Lespinasse

Nov 5, 2013, 4:20:02 PM

What base kernel does this apply over ?

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

Tim Chen

Nov 5, 2013, 4:30:01 PM

Should be applicable on latest v3.12-rc7.

Tim

Paul E. McKenney

Nov 6, 2013, 10:10:02 AM

On Tue, Nov 05, 2013 at 10:18:03PM +0100, Peter Zijlstra wrote:
> On Tue, Nov 05, 2013 at 11:21:57AM -0800, Tim Chen wrote:
> > On Tue, 2013-11-05 at 18:37 +0000, Will Deacon wrote:
> > > On Tue, Nov 05, 2013 at 05:42:36PM +0000, Tim Chen wrote:
> > > > This patch corrects the way memory barriers are used in the MCS lock
> > > > and removes ones that are not needed. Also add comments on all barriers.
> > >
> > > Hmm, I see that you're fixing up the barriers, but I still don't completely
> > > understand how what you have is correct. Hopefully you can help me out :)
> > > Ok, so this is an smp_rmb() because we assume that stores aren't speculated,
> > > right? (i.e. the control dependency above is enough for stores to be ordered
> > > with respect to taking the lock)...
>
> PaulMck completely confused me a few days ago with control dependencies
> etc.. Pretty much saying that C/C++ doesn't do those.

I remember that there was a subtlety here, but don't remember what it was...

And while I do remember reviewing this code, I don't find any evidence
that I gave my "Reviewed-by". Tim/Jason, if I fat-fingered this, please
forward that email back to me.

Thanx, Paul

Waiman Long

Nov 6, 2013, 10:40:01 AM

On 11/05/2013 02:30 PM, Tim Chen wrote:
> On Tue, 2013-11-05 at 19:57 +0100, Peter Zijlstra wrote:
>> On Tue, Nov 05, 2013 at 09:42:39AM -0800, Tim Chen wrote:
>>> + * The _raw_mcs_spin_lock() function should not be called directly. Instead,
>>> + * users should call mcs_spin_lock().
>>> */
>>> -static noinline
>>> -void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
>>> +static inline
>>> +void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
>>> {
>>> struct mcs_spinlock *prev;
>>>
>> So why keep it in the header at all?
> I also made the suggestion originally of keeping both lock and unlock in
> mcs_spinlock.c. Wonder if Waiman decides to keep them in header
> because in-lining the unlock function makes execution a bit faster?
>
> Tim
>

I was following the example of the spinlock code where the lock function
is not inlined, but the unlock function is. I have no objection to making
them both non-inlined functions, if you think that is the right move.

Regards,
Longman

Peter Zijlstra

Nov 6, 2013, 11:10:01 AM

On Wed, Nov 06, 2013 at 10:31:47AM -0500, Waiman Long wrote:
> On 11/05/2013 02:30 PM, Tim Chen wrote:
> >On Tue, 2013-11-05 at 19:57 +0100, Peter Zijlstra wrote:
> >>On Tue, Nov 05, 2013 at 09:42:39AM -0800, Tim Chen wrote:
> >>>+ * The _raw_mcs_spin_lock() function should not be called directly. Instead,
> >>>+ * users should call mcs_spin_lock().
> >>> */
> >>>-static noinline
> >>>-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> >>>+static inline
> >>>+void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> >>> {
> >>> struct mcs_spinlock *prev;
> >>>
> >>So why keep it in the header at all?
> >I also made the suggestion originally of keeping both lock and unlock in
> >mcs_spinlock.c. Wonder if Waiman decides to keep them in header
> >because in-lining the unlock function makes execution a bit faster?
> >
> >Tim
> >
>
> I was following the example of the spinlock code where the lock function is
> not inlined, but the unlock function is. I have no objection to make them
> both as non-inlined functions, if you think that is the right move.

I don't care, what I do find odd is the existence of
_raw_mcs_spin_lock(). If you want to out-of-line it, just move the
entire thing into a .c file already.

Tim Chen

Nov 6, 2013, 1:30:01 PM

Yes Paul, you didn't explicitly give the Reviewed-by.
I put it in there because you gave valuable
comments on the potential critical section bleeding when
reviewing the initial version of the code.

I'll take it out now until you have explicitly given it.
I'd appreciate it if you can provide your feedback on the current
version of the code.

Thanks.

Tim

Waiman Long

Nov 6, 2013, 2:30:02 PM

Tim,

I have just sent out a patch as an addendum to your patch series.
Hopefully that will address the memory barrier issue.

-Longman

Tim Chen

Nov 6, 2013, 4:40:01 PM

We will need the MCS lock code for doing optimistic spinning for rwsem.
Extracting the MCS code from mutex.c and putting it into its own file allows us
to reuse this code easily for rwsem.

Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davi...@hp.com>
---
include/linux/mcs_spinlock.h | 64 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 ++-
kernel/mutex.c | 60 ++++----------------------------------
3 files changed, 74 insertions(+), 55 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
new file mode 100644
index 0000000..b5de3b0
--- /dev/null
+++ b/include/linux/mcs_spinlock.h
@@ -0,0 +1,64 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
index bab49da..32a32e6 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,6 +46,7 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
+struct mcs_spinlock;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -55,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- void *spin_mlock; /* Spinner MCS lock */
+ struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
@@ -179,4 +180,4 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
# define arch_mutex_cpu_relax() cpu_relax()
#endif

-#endif
+#endif /* __LINUX_MUTEX_H */
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d24105b..e08b183 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include <linux/mcs_spinlock.h>

/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->spin_mlock = NULL;
+ lock->mcs_lock = NULL;
#endif

debug_mutex_init(lock, name, key);
@@ -111,54 +112,7 @@ EXPORT_SYMBOL(mutex_lock);
* more or less simultaneously, the spinners need to acquire a MCS lock
* first before spinning on the owner field.
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}

/*
* Mutex spinning code migrated from kernel/sched/core.c
@@ -448,7 +402,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,

for (;;) {
struct task_struct *owner;
- struct mspin_node node;
+ struct mcs_spinlock node;

if (use_ww_ctx && ww_ctx->acquired > 0) {
struct ww_mutex *ww;
@@ -470,10 +424,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- mspin_lock(MLOCK(lock), &node);
+ mcs_spin_lock(&lock->mcs_lock, &node);
owner = ACCESS_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner)) {
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);
goto slowpath;
}

@@ -488,11 +442,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}

mutex_set_owner(lock);
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);
preempt_enable();
return 0;
}
- mspin_unlock(MLOCK(lock), &node);
+ mcs_spin_unlock(&lock->mcs_lock, &node);

/*
* When there's no owner, we might have preempted between the
--
1.7.4.4



Tim Chen

Nov 6, 2013, 4:40:02 PM

The following changes are made to enable the mcs_spinlock.h file to be
widely included in other files without causing problems:

1) Include a number of prerequisite header files and define
arch_mutex_cpu_relax(), if not previously defined.
2) Make mcs_spin_unlock() an inlined function and
rename mcs_spin_lock() to _raw_mcs_spin_lock() which is also an
inlined function.
3) Create a new mcs_spinlock.c file to contain the non-inlined
mcs_spin_lock() function.

Signed-off-by: Waiman Long <Waima...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
include/linux/mcs_spinlock.h | 27 ++++++++++++++++++++++-----
kernel/Makefile | 6 +++---
kernel/mcs_spinlock.c | 21 +++++++++++++++++++++
3 files changed, 46 insertions(+), 8 deletions(-)
create mode 100644 kernel/mcs_spinlock.c

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 93d445d..f2c71e8 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -12,11 +12,27 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
};

+extern
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
+
/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
@@ -24,11 +40,11 @@ struct mcs_spinlock {
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * The _raw_mcs_spin_lock() function should not be called directly. Instead,
+ * users should call mcs_spin_lock().
*/
-static noinline
-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;

@@ -55,7 +71,8 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);

diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce4755..2ad8454 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -50,9 +50,9 @@ obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
diff --git a/kernel/mcs_spinlock.c b/kernel/mcs_spinlock.c
new file mode 100644
index 0000000..3c55626
--- /dev/null
+++ b/kernel/mcs_spinlock.c
@@ -0,0 +1,21 @@
+/*
+ * MCS lock
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#include <linux/mcs_spinlock.h>
+#include <linux/export.h>
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ _raw_mcs_spin_lock(lock, node);
+}
+EXPORT_SYMBOL_GPL(mcs_spin_lock);

Tim Chen

Nov 6, 2013, 4:40:02 PM

This patch moves the decision about which memory barriers are
used in the MCS lock and unlock functions to the architecture-specific
layer. It also moves the actual lock/unlock code to the mcs_spinlock.c
file.

A full memory barrier will be used if the following macros are not
defined:
1) smp_mb__before_critical_section()
2) smp_mb__after_critical_section()

For the x86 architecture, only a compiler barrier will be needed.

Signed-off-by: Waiman Long <Waima...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
arch/x86/include/asm/barrier.h | 6 +++
include/linux/mcs_spinlock.h | 78 +-------------------------------------
kernel/mcs_spinlock.c | 81 ++++++++++++++++++++++++++++++++++++++-
3 files changed, 86 insertions(+), 79 deletions(-)

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index c6cd358..6d0172c 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -92,6 +92,12 @@
#endif
#define smp_read_barrier_depends() read_barrier_depends()
#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+
+#if !defined(CONFIG_X86_PPRO_FENCE) && !defined(CONFIG_X86_OOSTORE)
+# define smp_mb__before_critical_section() barrier()
+# define smp_mb__after_critical_section() barrier()
+#endif
+
#else
#define smp_mb() barrier()
#define smp_rmb() barrier()
diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index f2c71e8..d54bb23 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -12,19 +12,6 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

-/*
- * asm/processor.h may define arch_mutex_cpu_relax().
- * If it is not defined, cpu_relax() will be used.
- */
-#include <asm/barrier.h>
-#include <asm/cmpxchg.h>
-#include <asm/processor.h>
-#include <linux/compiler.h>
-
-#ifndef arch_mutex_cpu_relax
-# define arch_mutex_cpu_relax() cpu_relax()
-#endif
-
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
@@ -32,68 +19,7 @@ struct mcs_spinlock {

extern
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
-
-/*
- * In order to acquire the lock, the caller should declare a local node and
- * pass a reference of the node to this function in addition to the lock.
- * If the lock has already been acquired, then this will proceed to spin
- * on this node->locked until the previous lock holder sets the node->locked
- * in mcs_spin_unlock().
- *
- * The _raw_mcs_spin_lock() function should not be called directly. Instead,
- * users should call mcs_spin_lock().
- */
-static inline
-void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- /* xchg() provides a memory barrier */
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-
- /* Make sure subsequent operations happen after the lock is acquired */
- smp_rmb();
-}
-
-/*
- * Releases the lock. The caller should pass in the corresponding node that
- * was used to acquire the lock.
- */
-static inline
-void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * cmpxchg() provides a memory barrier.
- * Release the lock by setting it to NULL
- */
- if (likely(cmpxchg(lock, node, NULL) == node))
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- } else {
- /*
- * Make sure all operations within the critical section
- * happen before the lock is released.
- */
- smp_wmb();
- }
- ACCESS_ONCE(next->locked) = 1;
-}
+extern
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/mcs_spinlock.c b/kernel/mcs_spinlock.c
index 3c55626..2dfd207 100644
--- a/kernel/mcs_spinlock.c
+++ b/kernel/mcs_spinlock.c
@@ -7,15 +7,90 @@
* It avoids expensive cache bouncings that common test-and-set spin-lock
* implementations incur.
*/
+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
#include <linux/mcs_spinlock.h>
#include <linux/export.h>

+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
/*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * Fall back to use full memory barrier if those macros are not defined
+ * in an architecture specific header file.
+ */
+#ifndef smp_mb__before_critical_section
+#define smp_mb__before_critical_section() smp_mb()
+#endif
+
+#ifndef smp_mb__after_critical_section
+#define smp_mb__after_critical_section() smp_mb()
+#endif
+
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
*/
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- _raw_mcs_spin_lock(lock, node);
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ /* xchg() provides a memory barrier */
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+
+ /* Make sure subsequent operations happen after the lock is acquired */
+ smp_mb__before_critical_section();
}
EXPORT_SYMBOL_GPL(mcs_spin_lock);
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * cmpxchg() provides a memory barrier.
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ } else {
+ /*
+ * Make sure all operations within the critical section
+ * happen before the lock is released.
+ */
+ smp_mb__after_critical_section();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+}
+EXPORT_SYMBOL_GPL(mcs_spin_unlock);

Tim Chen

Nov 6, 2013, 4:40:02 PM

This patch corrects the way memory barriers are used in the MCS lock
and removes ones that are not needed. Also add comments on all barriers.

Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 96f14299..93d445d 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -36,16 +36,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;

+ /* xchg() provides a memory barrier */
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
+
+ /* Make sure subsequent operations happen after the lock is acquired */
+ smp_rmb();
}

/*
@@ -58,6 +61,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod

if (likely(!next)) {
/*
+ * cmpxchg() provides a memory barrier.
* Release the lock by setting it to NULL
*/
if (likely(cmpxchg(lock, node, NULL) == node))
@@ -65,9 +69,14 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
+ } else {
+ /*
+ * Make sure all operations within the critical section
+ * happen before the lock is released.
+ */
+ smp_wmb();
}
ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
}

#endif /* __LINUX_MCS_SPINLOCK_H */

Tim Chen

Nov 6, 2013, 4:40:02 PM

In this patch series, we separated out the MCS lock code which was
previously embedded in mutex.c. This allows for easier reuse of
the MCS lock in other places like rwsem and qrwlock. We also did some micro
optimizations and barrier cleanup.

These patches were previously part of the rwsem optimization patch series,
but now we separate them out.

Tim Chen

v3:
1. modified memory barriers to support non-x86 architectures that have
weak memory ordering.

v2:
1. change the mcs_spin_lock export to a GPL export symbol
2. corrected mcs_spin_lock to references


Jason Low (2):
MCS Lock: optimizations and extra comments
MCS Lock: Barrier corrections

Tim Chen (1):
MCS Lock: Restructure the MCS lock defines and locking code into its
own file

Waiman Long (2):
MCS Lock: Make mcs_spinlock.h includable in other files
MCS Lock: Allow architecture specific memory barrier in lock/unlock

arch/x86/include/asm/barrier.h | 6 +++
include/linux/mcs_spinlock.h | 25 ++++++++++
include/linux/mutex.h | 5 +-
kernel/Makefile | 6 +-
kernel/mcs_spinlock.c | 96 ++++++++++++++++++++++++++++++++++++++++
kernel/mutex.c | 60 +++----------------------
6 files changed, 140 insertions(+), 58 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h
create mode 100644 kernel/mcs_spinlock.c

Tim Chen

Nov 6, 2013, 4:40:02 PM

Remove the unnecessary node->locked = 1 store on the uncontended
mcs_spin_lock() path, and mark the cmpxchg(lock, node, NULL) == node
check in mcs_spin_unlock() as likely(), since most of the time no race
with a new waiter occurs.

Also add more comments describing how the local node is used in MCS locks.

Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index b5de3b0..96f14299 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -18,6 +18,12 @@ struct mcs_spinlock {
};

/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
* We don't inline mcs_spin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
@@ -33,7 +39,6 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
@@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mutex_cpu_relax();
}

+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
@@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/*
* Release the lock by setting it to NULL
*/
- if (cmpxchg(lock, node, NULL) == node)
+ if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))

Tim Chen

Nov 6, 2013, 4:50:02 PM

On Wed, 2013-11-06 at 13:37 -0800, Tim Chen wrote:
> The following changes are made to enable mcs_spinlock.h file to be
> widely included in other files without causing problem:
>
> 1) Include a number of prerequisite header files and define
> arch_mutex_cpu_relax(), if not previously defined.
> 2) Make mcs_spin_unlock() an inlined function and
> rename mcs_spin_lock() to _raw_mcs_spin_lock() which is also an
> inlined function.
> 3) Create a new mcs_spinlock.c file to contain the non-inlined
> mcs_spin_lock() function.
>
> Signed-off-by: Waiman Long <Waima...@hp.com>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>

Should be Acked-by: Tim Chen <tim.c...@linux.intel.com>

Tim Chen

Nov 6, 2013, 4:50:02 PM
On Wed, 2013-11-06 at 13:37 -0800, Tim Chen wrote:
> Remove unnecessary operation and make the cmpxchg(lock, node, NULL) == node
> check in mcs_spin_unlock() likely() as it is likely that a race did not occur
> most of the time.
>
> Also add in more comments describing how the local node is used in MCS locks.
>
> Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
> Signed-off-by: Jason Low <jason...@hp.com>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>

Should be Acked-by: Tim Chen <tim.c...@linux.intel.com>.
My fat fingers accidentally added my Signed-off-by to all the patches.

Tim

H. Peter Anvin

Nov 6, 2013, 4:50:02 PM
On 11/06/2013 01:36 PM, Tim Chen wrote:
> In this patch series, we separated out the MCS lock code which was
> previously embedded in the mutex.c. This allows for easier reuse of
> MCS lock in other places like rwsem and qrwlock. We also did some micro
> optimizations and barrier cleanup.
>
> These patches were previously part of the rwsem optimization patch series
> but are now separated out.
>
> Tim Chen

Perhaps I'm missing something here, but what is an MCS lock and what is
its value?

-hpa

Tim Chen

Nov 6, 2013, 4:50:02 PM
On Wed, 2013-11-06 at 13:37 -0800, Tim Chen wrote:
> This patch moves the decision of what kind of memory barriers to be
> used in the MCS lock and unlock functions to the architecture specific
> layer. It also moves the actual lock/unlock code to mcs_spinlock.c
> file.
>
> A full memory barrier will be used if the following macros are not
> defined:
> 1) smp_mb__before_critical_section()
> 2) smp_mb__after_critical_section()
>
> For the x86 architecture, only compiler barrier will be needed.
>
> Signed-off-by: Waiman Long <Waima...@hp.com>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>

Should be Acked-by: Tim Chen <tim.c...@linux.intel.com>

Michel Lespinasse

Nov 6, 2013, 5:00:02 PM
On Wed, Nov 6, 2013 at 1:42 PM, H. Peter Anvin <h...@zytor.com> wrote:
> Perhaps I'm missing something here, but what is MCS lock and what is the
> value?

It's a kind of queued lock where each waiter spins on a separate
memory word, instead of having them all spin on the lock's memory
word. This helps with scalability when many waiters queue on the same
lock.
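
For illustration, a minimal usage sketch of the API from the patches in
this thread: each contending CPU supplies its own (typically on-stack)
node, so the spinning happens on a word local to that CPU rather than on
the shared lock word. The lock variable and the critical-section body
below are placeholders; the function names are the ones the series adds.

#include <linux/mcs_spinlock.h>

static struct mcs_spinlock *demo_lock;		/* shared tail pointer, NULL when unlocked */

static void demo_critical_section(void)
{
	struct mcs_spinlock node;		/* local node: this CPU spins on node.locked */

	mcs_spin_lock(&demo_lock, &node);	/* queue up; spin locally if contended */
	/* ... critical section ... */
	mcs_spin_unlock(&demo_lock, &node);	/* hand the lock to the next queued waiter */
}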

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

Tim Chen

Nov 6, 2013, 8:30:02 PM
We will need the MCS lock code for doing optimistic spinning for rwsem
and queue rwlock. Extracting the MCS code from mutex.c and put into
its own file allow us to reuse this code easily.

Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davi...@hp.com>
---
include/linux/mcs_spinlock.h | 64 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 ++-
kernel/locking/mutex.c | 60 ++++----------------------------------
3 files changed, 74 insertions(+), 55 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
new file mode 100644
index 0000000..b5de3b0
--- /dev/null
+++ b/include/linux/mcs_spinlock.h
@@ -0,0 +1,64 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d24105b..e08b183 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include <linux/mcs_spinlock.h>

/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->spin_mlock = NULL;
+ lock->mcs_lock = NULL;
#endif

debug_mutex_init(lock, name, key);
@@ -111,54 +112,7 @@ EXPORT_SYMBOL(mutex_lock);
* more or less simultaneously, the spinners need to acquire a MCS lock
* first before spinning on the owner field.
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;

Tim Chen

Nov 6, 2013, 8:30:02 PM
This patch moves the decision of what kind of memory barriers to use
in the MCS lock and unlock functions to the architecture-specific
layer. It also moves the actual lock/unlock code to the mcs_spinlock.c
file.

A full memory barrier will be used if the following macros are not
defined:
1) smp_mb__before_critical_section()
2) smp_mb__after_critical_section()

For the x86 architecture, only a compiler barrier will be needed.

Acked-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Waiman Long <Waima...@hp.com>
---
arch/x86/include/asm/barrier.h | 6 +++
include/linux/mcs_spinlock.h | 78 +-------------------------------------
kernel/locking/mcs_spinlock.c | 81 ++++++++++++++++++++++++++++++++++++++-
int locked; /* 1 if lock acquired */
@@ -32,68 +19,7 @@ struct mcs_spinlock {

extern
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
-
-/*
- * In order to acquire the lock, the caller should declare a local node and
- * pass a reference of the node to this function in addition to the lock.
- * If the lock has already been acquired, then this will proceed to spin
- * on this node->locked until the previous lock holder sets the node->locked
- * in mcs_spin_unlock().
- *
- * The _raw_mcs_spin_lock() function should not be called directly. Instead,
- * users should call mcs_spin_lock().
- */
-static inline
-void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- /* xchg() provides a memory barrier */
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-
- /* Make sure subsequent operations happen after the lock is acquired */
- smp_rmb();
-}
-
-/*
- * Releases the lock. The caller should pass in the corresponding node that
- * was used to acquire the lock.
- */
-static inline
-void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * cmpxchg() provides a memory barrier.
- * Release the lock by setting it to NULL
- */
- if (likely(cmpxchg(lock, node, NULL) == node))
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- } else {
- /*
- * Make sure all operations within the critical section
- * happen before the lock is released.
- */
- smp_wmb();
- }
- ACCESS_ONCE(next->locked) = 1;
-}
+extern
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 3c55626..2dfd207 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -7,15 +7,90 @@
* It avoids expensive cache bouncings that common test-and-set spin-lock
* implementations incur.
*/
+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
#include <linux/mcs_spinlock.h>
#include <linux/export.h>

+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
/*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * Fall back to use full memory barrier if those macros are not defined
+ * in a architecture specific header file.
+ */
+#ifndef smp_mb__before_critical_section
+#define smp_mb__before_critical_section() smp_mb()
+#endif
+
+#ifndef smp_mb__after_critical_section
+#define smp_mb__after_critical_section() smp_mb()
+#endif
+
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
*/
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
- _raw_mcs_spin_lock(lock, node);
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ /* xchg() provides a memory barrier */
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+
+ /* Make sure subsequent operations happen after the lock is acquired */
+ smp_mb__before_critical_section();
}
EXPORT_SYMBOL_GPL(mcs_spin_lock);
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * cmpxchg() provides a memory barrier.
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ } else {
+ /*
+ * Make sure all operations within the critical section
+ * happen before the lock is released.
+ */
+ smp_mb__after_critical_section();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+}
+EXPORT_SYMBOL_GPL(mcs_spin_unlock);

Tim Chen

Nov 6, 2013, 8:30:02 PM
Remove unnecessary operation and make the cmpxchg(lock, node, NULL) == node
check in mcs_spin_unlock() likely() as it is likely that a race did not occur
most of the time.

Also add in more comments describing how the local node is used in MCS locks.

Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index b5de3b0..96f14299 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -18,6 +18,12 @@ struct mcs_spinlock {
};

/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
* We don't inline mcs_spin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
@@ -33,7 +39,6 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
@@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mutex_cpu_relax();
}

+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
@@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/*
* Release the lock by setting it to NULL
*/
- if (cmpxchg(lock, node, NULL) == node)
+ if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))

Tim Chen

Nov 6, 2013, 8:30:02 PM
The following changes are made to enable the mcs_spinlock.h file to be
widely included in other files without causing problems:

1) Include a number of prerequisite header files and define
arch_mutex_cpu_relax(), if not previously defined.
2) Make mcs_spin_unlock() an inlined function and
rename mcs_spin_lock() to _raw_mcs_spin_lock() which is also an
inlined function.
3) Create a new mcs_spinlock.c file to contain the non-inlined
mcs_spin_lock() function.

Acked-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Waiman Long <Waima...@hp.com>
---
include/linux/mcs_spinlock.h | 27 ++++++++++++++++++++++-----
kernel/locking/Makefile | 6 +++---
kernel/locking/mcs_spinlock.c | 21 +++++++++++++++++++++
3 files changed, 46 insertions(+), 8 deletions(-)
create mode 100644 kernel/locking/mcs_spinlock.c

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 93d445d..f2c71e8 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -12,11 +12,27 @@
#ifndef __LINUX_MCS_SPINLOCK_H
#define __LINUX_MCS_SPINLOCK_H

+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
};

+extern
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
+
/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
@@ -24,11 +40,11 @@ struct mcs_spinlock {
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
*
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
+ * The _raw_mcs_spin_lock() function should not be called directly. Instead,
+ * users should call mcs_spin_lock().
*/
-static noinline
-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void _raw_mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;

@@ -55,7 +71,8 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);

diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5..20d9d5c 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -13,12 +13,12 @@ obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 0000000..3c55626
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,21 @@
+/*
+ * MCS lock
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#include <linux/mcs_spinlock.h>
+#include <linux/export.h>
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ _raw_mcs_spin_lock(lock, node);
+}
+EXPORT_SYMBOL_GPL(mcs_spin_lock);

Tim Chen

Nov 6, 2013, 8:30:02 PM
In this patch series, we separated out the MCS lock code which was
previously embedded in mutex.c. This allows for easier reuse of the
MCS lock in other places like rwsem and qrwlock. We also did some micro
optimizations and barrier cleanup.

These patches were previously part of the rwsem optimization patch series
but are now separated out.

Tim Chen

v4:
1. Move patch series to the latest tip after v3.12

v3:
1. modified memory barriers to support non x86 architectures that have
weak memory ordering.

v2:
1. change export mcs_spin_lock as a GPL export symbol
2. corrected mcs_spin_lock to references


Jason Low (2):
MCS Lock: optimizations and extra comments
MCS Lock: Barrier corrections

Tim Chen (1):
MCS Lock: Restructure the MCS lock defines and locking code into its
own file

Waiman Long (2):
MCS Lock: Make mcs_spinlock.h includable in other files
MCS Lock: Allow architecture specific memory barrier in lock/unlock

arch/x86/include/asm/barrier.h | 6 +++
include/linux/mcs_spinlock.h | 25 ++++++++++
include/linux/mutex.h | 5 +-
kernel/locking/Makefile | 6 +-
kernel/locking/mcs_spinlock.c | 96 ++++++++++++++++++++++++++++++++++++++++
kernel/locking/mutex.c | 60 +++----------------------
6 files changed, 140 insertions(+), 58 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h
create mode 100644 kernel/locking/mcs_spinlock.c

Tim Chen

Nov 6, 2013, 8:30:03 PM
This patch corrects the way memory barriers are used in the MCS lock
and removes ones that are not needed. Also add comments on all barriers.

Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 96f14299..93d445d 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -36,16 +36,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;

+ /* xchg() provides a memory barrier */
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
/* Wait until the lock holder passes the lock down */
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
+
+ /* Make sure subsequent operations happen after the lock is acquired */
+ smp_rmb();
}

/*
@@ -58,6 +61,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod

if (likely(!next)) {
/*
+ * cmpxchg() provides a memory barrier.
* Release the lock by setting it to NULL
*/
if (likely(cmpxchg(lock, node, NULL) == node))
@@ -65,9 +69,14 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
+ } else {
+ /*
+ * Make sure all operations within the critical section
+ * happen before the lock is released.
+ */
+ smp_wmb();
}
ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
}

#endif /* __LINUX_MCS_SPINLOCK_H */

Michel Lespinasse

Nov 7, 2013, 5:00:02 AM
On Wed, Nov 6, 2013 at 5:39 PM, Linus Torvalds
<torv...@linux-foundation.org> wrote:
> Sorry about the HTML crap, the internet connection is too slow for my normal
> email habits, so I'm using my phone.
>
> I think the barriers are still totally wrong for the locking functions.
>
> Adding an smp_rmb after waiting for the lock is pure BS. Writes in the
> locked region could percolate out of the locked region.
>
> The thing is, you cannot do the memory ordering for locks in any sane
> generic way. Not using our current barrier system. On x86 (and many others)
> the smp_rmb will work fine, because writes are never moved earlier. But on
> other architectures you really need an acquire to get a lock efficiently. No
> separate barriers. An acquire needs to be on the instruction that does the
> lock.
>
> Same goes for unlock. On x86 any store is a fine unlock, but on other
> architectures you need a store with a release marker.
>
> So no amount of barriers will ever do this correctly. Sure, you can add full
> memory barriers and it will be "correct" but it will be unbearably slow, and
> add totally unnecessary serialization. So *correct* locking will require
> architecture support.

Rather than writing arch-specific locking code, would you agree to
introduce acquire and release memory operations ?

The semantics of an acquire memory operation would be: the specified
memory operation occurs, and any reads or writes after that operation
are guaranteed not to be reordered before it (useful to implement lock
acquisitions).
The semantics of a release memory operation would be: the specified
memory operation occurs, and any reads or writes before that operation
are guaranteed not to be reordered after it (useful to implement lock
releases).

Now each arch would still need to define several acquire and release
operations, but this is a quite useful model to build generic code on.
For example, the fast path for the x86 spinlock implementation could
be expressed generically as an acquire fetch-and-add (for
__ticket_spin_lock) and a release add (for __ticket_spin_unlock).
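
As a rough sketch of that ticket-lock example, written here with the GCC
__atomic builtins standing in for the proposed kernel primitives (the
builtins and the struct layout are assumptions for illustration only,
not the interface being proposed):

struct ticket_lock {
	unsigned short head;	/* ticket currently being served */
	unsigned short tail;	/* next ticket to hand out */
};

static void ticket_lock(struct ticket_lock *lock)
{
	/* acquire fetch-and-add: grab a ticket */
	unsigned short me = __atomic_fetch_add(&lock->tail, 1, __ATOMIC_ACQUIRE);

	/* wait for our turn; each poll is an acquire load */
	while (__atomic_load_n(&lock->head, __ATOMIC_ACQUIRE) != me)
		;	/* cpu_relax() in real code */
}

static void ticket_unlock(struct ticket_lock *lock)
{
	/* release add: critical section made visible before the next ticket is served */
	__atomic_fetch_add(&lock->head, 1, __ATOMIC_RELEASE);
}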

Would you think this is a useful direction to move to?

Thanks,

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

Michel Lespinasse

Nov 7, 2013, 8:00:02 AM
On Thu, Nov 7, 2013 at 4:06 AM, Linus Torvalds
<torv...@linux-foundation.org> wrote:
>
> On Nov 7, 2013 6:55 PM, "Michel Lespinasse" <wal...@google.com> wrote:
>>
>> Rather than writing arch-specific locking code, would you agree to
>> introduce acquire and release memory operations ?
>
> Yes, that's probably the right thing to do. What ops do we need? Store with
> release, cmpxchg and load with acquire? Anything else?

Depends on what lock types we want to implement on top; for MCS we would need:
- xchg acquire (common case) and load acquire (for spinning on our
locker's wait word)
- cmpxchg release (when there is no next locker) and store release
(when writing to the next locker's wait word)

One downside of the proposal is that using a load acquire for spinning
puts the memory barrier within the spin loop. So this model is very
intuitive and does not add unnecessary barriers on x86, but it may
place the barriers in a suboptimal place for architectures that need
them.
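
Put together, the MCS paths built from those four operations would look
roughly like the sketch below. This is only an illustration:
xchg_acquire()/cmpxchg_release() and the smp_load_acquire()/
smp_store_release() helpers are the primitives being discussed, not
something that exists in mainline at this point, and the
node-initialization ordering question raised later in the thread is
ignored here. Note how the acquire lands on the load inside the spin
loop, which is the downside mentioned above.

void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
	struct mcs_spinlock *prev;

	node->locked = 0;
	node->next = NULL;

	prev = xchg_acquire(lock, node);	/* common case: acquire on the xchg */
	if (likely(prev == NULL))
		return;				/* lock acquired uncontended */

	ACCESS_ONCE(prev->next) = node;
	/* spin on our own wait word; the acquire is inside the loop */
	while (!smp_load_acquire(&node->locked))
		arch_mutex_cpu_relax();
}

void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
	struct mcs_spinlock *next = ACCESS_ONCE(node->next);

	if (likely(!next)) {
		/* no successor visible: release on the cmpxchg */
		if (likely(cmpxchg_release(lock, node, NULL) == node))
			return;
		/* a successor is queueing; wait for it to appear */
		while (!(next = ACCESS_ONCE(node->next)))
			arch_mutex_cpu_relax();
	}
	/* hand-off: release on the store to the next waiter's wait word */
	smp_store_release(&next->locked, 1);
}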

Paul E. McKenney

Nov 7, 2013, 9:40:01 AM
On Thu, Nov 07, 2013 at 04:50:23AM -0800, Michel Lespinasse wrote:
> On Thu, Nov 7, 2013 at 4:06 AM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > On Nov 7, 2013 6:55 PM, "Michel Lespinasse" <wal...@google.com> wrote:
> >>
> >> Rather than writing arch-specific locking code, would you agree to
> >> introduce acquire and release memory operations ?
> >
> > Yes, that's probably the right thing to do. What ops do we need? Store with
> > release, cmpxchg and load with acquire? Anything else?
>
> Depends on what lock types we want to implement on top; for MCS we would need:
> - xchg acquire (common case) and load acquire (for spinning on our
> locker's wait word)
> - cmpxchg release (when there is no next locker) and store release
> (when writing to the next locker's wait word)
>
> One downside of the proposal is that using a load acquire for spinning
> puts the memory barrier within the spin loop. So this model is very
> intuitive and does not add unnecessary barriers on x86, but it may
> place the barriers in a suboptimal place for architectures that need
> them.

OK, I will bite... Why is a barrier in the spinloop suboptimal?

Can't say that I have tried measuring it, but the barrier should not
normally result in interconnect traffic. Given that the barrier is
required anyway, it should not affect lock-acquisition latency.

So what am I missing here?

Thanx, Paul

Michel Lespinasse

Nov 7, 2013, 3:10:02 PM
On Thu, Nov 7, 2013 at 6:31 AM, Paul E. McKenney
<pau...@linux.vnet.ibm.com> wrote:
> On Thu, Nov 07, 2013 at 04:50:23AM -0800, Michel Lespinasse wrote:
>> On Thu, Nov 7, 2013 at 4:06 AM, Linus Torvalds
>> <torv...@linux-foundation.org> wrote:
>> >
>> > On Nov 7, 2013 6:55 PM, "Michel Lespinasse" <wal...@google.com> wrote:
>> >>
>> >> Rather than writing arch-specific locking code, would you agree to
>> >> introduce acquire and release memory operations ?
>> >
>> > Yes, that's probably the right thing to do. What ops do we need? Store with
>> > release, cmpxchg and load with acquire? Anything else?
>>
>> Depends on what lock types we want to implement on top; for MCS we would need:
>> - xchg acquire (common case) and load acquire (for spinning on our
>> locker's wait word)
>> - cmpxchg release (when there is no next locker) and store release
>> (when writing to the next locker's wait word)
>>
>> One downside of the proposal is that using a load acquire for spinning
>> puts the memory barrier within the spin loop. So this model is very
>> intuitive and does not add unnecessary barriers on x86, but it may
>> place the barriers in a suboptimal place for architectures that need
>> them.
>
> OK, I will bite... Why is a barrier in the spinloop suboptimal?

It's probably not a big deal - all I meant to say is that if you were
manually placing barriers, you would probably put one after the loop
instead. I don't deal much with architectures where such barriers are
needed, so I don't know for sure if the difference means much.

> Can't say that I have tried measuring it, but the barrier should not
> normally result in interconnect traffic. Given that the barrier is
> required anyway, it should not affect lock-acquisition latency.

Agree

> So what am I missing here?

I think you read my second email as me trying to shoot down a proposal
- I wasn't, as I really like the acquire/release model and find it
easy to program with, which is why I'm proposing it in the first
place. I just wanted to be upfront about all potential downsides, so
we can consider them and see if they are significant - I don't think
they are, but I'm not the best person to judge that as I mostly just
deal with x86 stuff.

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

Tim Chen

Nov 7, 2013, 4:20:02 PM
We could do a load acquire at the end of the
spin loop in the lock function, and not in the spin loop itself, if the
cost of a barrier within the spin loop is a concern.
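
Concretely, the pattern being suggested looks something like this (the
wrapper function is purely illustrative; the same loop is spelled out
further down the thread):

/* Illustrative only: spin on a plain load, then do a single acquire load. */
static inline void mcs_wait_for_lock(struct mcs_spinlock *node)
{
	while (!ACCESS_ONCE(node->locked))
		arch_mutex_cpu_relax();

	/* one acquire load after the loop: later accesses cannot move above it */
	smp_load_acquire(&node->locked);
}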

Michel, are you planning to do an implementation of
load-acquire/store-release functions of various architectures?

Or is the approach of arch specific memory barrier for MCS
an acceptable one before load-acquire and store-release
are available? Are there any technical issues remaining with
the patchset after including Waiman's arch specific barrier?

Tim

>
> > Can't say that I have tried measuring it, but the barrier should not
> > normally result in interconnect traffic. Given that the barrier is
> > required anyway, it should not affect lock-acquisition latency.
>
> Agree
>
> > So what am I missing here?
>
> I think you read my second email as me trying to shoot down a proposal
> - I wasn't, as I really like the acquire/release model and find it
> easy to program with, which is why I'm proposing it in the first
> place. I just wanted to be upfront about all potential downsides, so
> we can consider them and see if they are significant - I don't think
> they are, but I'm not the best person to judge that as I mostly just
> deal with x86 stuff.
>


--

Peter Zijlstra

Nov 7, 2013, 5:30:02 PM
On Thu, Nov 07, 2013 at 01:15:51PM -0800, Tim Chen wrote:
> Michel, are you planning to do an implementation of
> load-acquire/store-release functions of various architectures?

A little something like this:
http://marc.info/?l=linux-arch&m=138386254111507

It so happens we were working on that the past week or so due to another
issue ;-)

Michel Lespinasse

Nov 7, 2013, 5:50:01 PM
On Thu, Nov 7, 2013 at 2:21 PM, Peter Zijlstra <pet...@infradead.org> wrote:
> On Thu, Nov 07, 2013 at 01:15:51PM -0800, Tim Chen wrote:
>> Michel, are you planning to do an implementation of
>> load-acquire/store-release functions of various architectures?
>
> A little something like this:
> http://marc.info/?l=linux-arch&m=138386254111507
>
> It so happens we were working on that the past week or so due to another
> issue ;-)

Haha, awesome, I wasn't aware of this effort.

Tim: my approach would be to provide the acquire/release operations in
arch-specific include files, and have a default implementation using
barriers for arches who don't provide these new ops. That way you make
it work on all arches at once (using the default implementation) and
make it fast on any arch that cares.

>> Or is the approach of arch specific memory barrier for MCS
>> an acceptable one before load-acquire and store-release
>> are available? Are there any technical issues remaining with
>> the patchset after including Waiman's arch specific barrier?

I don't want to stand in the way of Waiman's change, and I had
actually taken the same approach with arch-specific barriers when
proposing some queue spinlocks in the past; however I do feel that
this comes back regularly enough that having acquire/release
primitives available would help, hence my proposal.

That said, earlier in the thread Linus said we should probably get all
our ducks in a row before going forward with this, so...

--
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.

Tim Chen

Nov 7, 2013, 8:20:01 PM
With the load_acquire and store_release implemented, it should be
pretty straightforward to implement MCS with them. I'll respin
the patch series with these primitives.

Thanks.

Tim

Tim Chen

Nov 8, 2013, 3:00:02 PM
In this patch series, we separated out the MCS lock code which was
previously embedded in mutex.c. This allows for easier reuse of the
MCS lock in other places like rwsem and qrwlock. We also did some micro
optimizations and barrier cleanup.

The original code can let memory operations leak out of the critical
section, which was not a problem when MCS was embedded within the mutex,
but this needs to be corrected when allowing the MCS lock to be used by
itself for other locking purposes.

Proper barriers are now embedded with the usage of smp_load_acquire() in
mcs_spin_lock() and smp_store_release() in mcs_spin_unlock. See
http://marc.info/?l=linux-arch&m=138386254111507 for info on the
new smp_load_acquire() and smp_store_release() functions.

One thing to note is the use of smp_load_acquire in a spin loop
to check for lock acquisition. If there are concerns about
a potential barrier being in the spin loop for some architectures, please let
us know.

These patches were previously part of the rwsem optimization patch series
but are now separated out.

Tim Chen

Jason Low (1):
MCS Lock: optimizations and extra comments

Tim Chen (1):
MCS Lock: Restructure the MCS lock defines and locking code into its
own file

Waiman Long (2):
MCS Lock: Move mcs_lock/unlock function into its own file
MCS Lock: Barrier corrections

include/linux/mcs_spinlock.h | 25 +++++++++
include/linux/mutex.h | 5 +-
kernel/locking/Makefile | 6 +-
kernel/locking/mcs_spinlock.c | 108 +++++++++++++++++++++++++++++++++++++++++
kernel/locking/mutex.c | 60 +++--------------------
5 files changed, 146 insertions(+), 58 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h
create mode 100644 kernel/locking/mcs_spinlock.c

--
1.7.4.4


Tim Chen

Nov 8, 2013, 3:00:02 PM
From: Waiman Long <Waima...@hp.com>

This patch corrects the way memory barriers are used in the MCS lock
with the smp_load_acquire and smp_store_release functions.
It removes barriers that are not needed.

It uses architecture specific load-acquire and store-release
primitives for synchronization, if available. Generic implementations
are provided in case they are not defined even though they may not
be optimal. These generic implementations could be removed later on
once changes are made in all the relevant header files.

Suggested-by: Michel Lespinasse <wal...@google.com>
Signed-off-by: Waiman Long <Waima...@hp.com>
Signed-off-by: Jason Low <jason...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
kernel/locking/mcs_spinlock.c | 48 +++++++++++++++++++++++++++++++++++------
1 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index b6f27f8..df5c167 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -23,6 +23,31 @@
#endif

/*
+ * Fall back to use the regular atomic operations and memory barrier if
+ * the acquire/release versions are not defined.
+ */
+#ifndef xchg_acquire
+# define xchg_acquire(p, v) xchg(p, v)
+#endif
+
+#ifndef smp_load_acquire
+# define smp_load_acquire(p) \
+ ({ \
+ typeof(*p) __v = ACCESS_ONCE(*(p)); \
+ smp_mb(); \
+ __v; \
+ })
+#endif
+
+#ifndef smp_store_release
+# define smp_store_release(p, v) \
+ do { \
+ smp_mb(); \
+ ACCESS_ONCE(*(p)) = v; \
+ } while (0)
+#endif
+
+/*
* In order to acquire the lock, the caller should declare a local node and
* pass a reference of the node to this function in addition to the lock.
* If the lock has already been acquired, then this will proceed to spin
@@ -37,15 +62,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;

- prev = xchg(lock, node);
+ /* xchg() provides a memory barrier */
+ prev = xchg_acquire(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
return;
}
ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
+ /*
+ * Wait until the lock holder passes the lock down.
+ * Using smp_load_acquire() provides a memory barrier that
+ * ensures subsequent operations happen after the lock is acquired.
+ */
+ while (!(smp_load_acquire(&node->locked)))
arch_mutex_cpu_relax();
}
EXPORT_SYMBOL_GPL(mcs_spin_lock);
@@ -54,7 +83,7 @@ EXPORT_SYMBOL_GPL(mcs_spin_lock);
* Releases the lock. The caller should pass in the corresponding node that
* was used to acquire the lock.
*/
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);

@@ -68,7 +97,12 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
while (!(next = ACCESS_ONCE(node->next)))
arch_mutex_cpu_relax();
}
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
+ /*
+ * Pass lock to next waiter.
+ * smp_store_release() provides a memory barrier to ensure
+ * all operations in the critical section has been completed
+ * before unlocking.
+ */
+ smp_store_release(&next->locked , 1);
}
EXPORT_SYMBOL_GPL(mcs_spin_unlock);

Tim Chen

Nov 8, 2013, 3:00:02 PM
From: Waiman Long <Waima...@hp.com>

The following changes are made:

1) Create a new mcs_spinlock.c file to contain the
mcs_spin_lock() and mcs_spin_unlock() functions.
2) Include a number of prerequisite header files and define
arch_mutex_cpu_relax(), if not previously defined, so the
mcs functions can be compiled for multiple architectures without
causing problems.

Signed-off-by: Waiman Long <Waima...@hp.com>
Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
---
include/linux/mcs_spinlock.h | 56 ++------------------
kernel/locking/Makefile | 6 +-
.../locking/mcs_spinlock.c | 31 ++++++-----
3 files changed, 23 insertions(+), 70 deletions(-)
copy include/linux/mcs_spinlock.h => kernel/locking/mcs_spinlock.c (78%)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index 96f14299..d54bb23 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -17,57 +17,9 @@ struct mcs_spinlock {
int locked; /* 1 if lock acquired */
};

-/*
- * In order to acquire the lock, the caller should declare a local node and
- * pass a reference of the node to this function in addition to the lock.
- * If the lock has already been acquired, then this will proceed to spin
- * on this node->locked until the previous lock holder sets the node->locked
- * in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
- */
-static noinline
-void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-/*
- * Releases the lock. The caller should pass in the corresponding node that
- * was used to acquire the lock.
- */
-static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
-{
- struct mcs_spinlock *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (likely(cmpxchg(lock, node, NULL) == node))
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}
+extern
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node);
+extern
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5..20d9d5c 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -13,12 +13,12 @@ obj-$(CONFIG_LOCKDEP) += lockdep.o
ifeq ($(CONFIG_PROC_FS),y)
obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
-obj-$(CONFIG_SMP) += spinlock.o
-obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
+obj-$(CONFIG_SMP) += spinlock.o mcs_spinlock.o
+obj-$(CONFIG_PROVE_LOCKING) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
-obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o mcs_spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
diff --git a/include/linux/mcs_spinlock.h b/kernel/locking/mcs_spinlock.c
similarity index 78%
copy from include/linux/mcs_spinlock.h
copy to kernel/locking/mcs_spinlock.c
index 96f14299..b6f27f8 100644
--- a/include/linux/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.c
@@ -1,7 +1,5 @@
/*
- * MCS lock defines
- *
- * This file contains the main data structure and API definitions of MCS lock.
+ * MCS lock
*
* The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
* with the desirable properties of being fair, and with each cpu trying
@@ -9,13 +7,20 @@
* It avoids expensive cache bouncings that common test-and-set spin-lock
* implementations incur.
*/
-#ifndef __LINUX_MCS_SPINLOCK_H
-#define __LINUX_MCS_SPINLOCK_H
+/*
+ * asm/processor.h may define arch_mutex_cpu_relax().
+ * If it is not defined, cpu_relax() will be used.
+ */
+#include <asm/barrier.h>
+#include <asm/cmpxchg.h>
+#include <asm/processor.h>
+#include <linux/compiler.h>
+#include <linux/mcs_spinlock.h>
+#include <linux/export.h>

-struct mcs_spinlock {
- struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
-};
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif

/*
* In order to acquire the lock, the caller should declare a local node and
@@ -23,11 +28,7 @@ struct mcs_spinlock {
* If the lock has already been acquired, then this will proceed to spin
* on this node->locked until the previous lock holder sets the node->locked
* in mcs_spin_unlock().
- *
- * We don't inline mcs_spin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-static noinline
void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *prev;
@@ -47,6 +48,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
while (!ACCESS_ONCE(node->locked))
arch_mutex_cpu_relax();
}
+EXPORT_SYMBOL_GPL(mcs_spin_lock);

/*
* Releases the lock. The caller should pass in the corresponding node that
@@ -69,5 +71,4 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
ACCESS_ONCE(next->locked) = 1;
smp_wmb();
}
-
-#endif /* __LINUX_MCS_SPINLOCK_H */
+EXPORT_SYMBOL_GPL(mcs_spin_unlock);

Tim Chen

Nov 8, 2013, 3:00:02 PM
We will need the MCS lock code for doing optimistic spinning for rwsem
and queue rwlock. Extracting the MCS code from mutex.c and put into
its own file allow us to reuse this code easily.

Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Davidlohr Bueso <davi...@hp.com>
---
include/linux/mcs_spinlock.h | 64 ++++++++++++++++++++++++++++++++++++++++++
include/linux/mutex.h | 5 ++-
kernel/locking/mutex.c | 60 ++++----------------------------------
3 files changed, 74 insertions(+), 55 deletions(-)
create mode 100644 include/linux/mcs_spinlock.h

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
new file mode 100644
index 0000000..b5de3b0
--- /dev/null
+++ b/include/linux/mcs_spinlock.h
@@ -0,0 +1,64 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+/*
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static noinline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}

Tim Chen

Nov 8, 2013, 3:00:02 PM
From: Jason Low <jason...@hp.com>

Remove unnecessary operation and make the cmpxchg(lock, node, NULL) == node
check in mcs_spin_unlock() likely() as it is likely that a race did not occur
most of the time.

Also add in more comments describing how the local node is used in MCS locks.

Reviewed-by: Tim Chen <tim.c...@linux.intel.com>
Signed-off-by: Jason Low <jason...@hp.com>
---
include/linux/mcs_spinlock.h | 13 +++++++++++--
1 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h
index b5de3b0..96f14299 100644
--- a/include/linux/mcs_spinlock.h
+++ b/include/linux/mcs_spinlock.h
@@ -18,6 +18,12 @@ struct mcs_spinlock {
};

/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
* We don't inline mcs_spin_lock() so that perf can correctly account for the
* time spent in this lock function.
*/
@@ -33,7 +39,6 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/* Lock acquired */
- node->locked = 1;
return;
}
ACCESS_ONCE(prev->next) = node;
@@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
arch_mutex_cpu_relax();
}

+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
{
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
@@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
/*
* Release the lock by setting it to NULL
*/
- if (cmpxchg(lock, node, NULL) == node)
+ if (likely(cmpxchg(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))

Waiman Long

Nov 12, 2013, 10:00:01 AM
On 11/11/2013 04:17 PM, Tim Chen wrote:
>> You could then augment that with [cmp]xchg_{acquire,release} as
>> appropriate.
>>
>>> +/*
>>> * In order to acquire the lock, the caller should declare a local node and
>>> * pass a reference of the node to this function in addition to the lock.
>>> * If the lock has already been acquired, then this will proceed to spin
>>> @@ -37,15 +62,19 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
>>> node->locked = 0;
>>> node->next = NULL;
>>>
>>> - prev = xchg(lock, node);
>>> + /* xchg() provides a memory barrier */
>>> + prev = xchg_acquire(lock, node);
>>> if (likely(prev == NULL)) {
>>> /* Lock acquired */
>>> return;
>>> }
>>> ACCESS_ONCE(prev->next) = node;
>>> - smp_wmb();
>>> - /* Wait until the lock holder passes the lock down */
>>> - while (!ACCESS_ONCE(node->locked))
>>> + /*
>>> + * Wait until the lock holder passes the lock down.
>>> + * Using smp_load_acquire() provides a memory barrier that
>>> + * ensures subsequent operations happen after the lock is acquired.
>>> + */
>>> + while (!(smp_load_acquire(&node->locked)))
>>> arch_mutex_cpu_relax();
> An alternate implementation is
> while (!ACCESS_ONCE(node->locked))
> arch_mutex_cpu_relax();
> smp_load_acquire(&node->locked);
>
> Leaving the smp_load_acquire at the end to provide appropriate barrier.
> Will that be acceptable?
>
> Tim

I second Tim's opinion. It would be helpful to have a smp_mb_load_acquire()
function that provides a memory barrier with load-acquire semantics. I
don't think we need one for store-release as that will not be in a loop.

Peter, what do you think about adding that to your patch?

-Longman

George Spelvin

Nov 12, 2013, 12:20:03 PM
> On Mon, Nov 11, 2013 at 09:17:52PM +0000, Tim Chen wrote:
>> An alternate implementation is
>> while (!ACCESS_ONCE(node->locked))
>> arch_mutex_cpu_relax();
>> smp_load_acquire(&node->locked);
>>
>> Leaving the smp_load_acquire at the end to provide appropriate barrier.
>> Will that be acceptable?

Will Deacon <will....@arm.com> wrote:
> It still doesn't solve my problem though: I want a way to avoid that busy
> loop by some architecture-specific manner. The arch_mutex_cpu_relax() hook
> is a start, but there is no corresponding hook on the unlock side to issue a
> wakeup. Given a sensible relax implementation, I don't have an issue with
> putting a load-acquire in a loop, since it shouldn't be aggressively spinning
> anymore.

So you want something like this?

/*
 * This is a spin-wait with acquire semantics. That is, accesses after
 * this are not allowed to be reordered before the load that meets
 * the specified condition. This requires that it end with either a
 * load-acquire or a full smp_mb(). The optimal way to do this is likely
 * to be architecture-dependent. E.g. x86 MONITOR/MWAIT instructions.
 */
#ifndef smp_load_acquire_until
#define smp_load_acquire_until(addr, cond) \
	while (!(smp_load_acquire(addr) cond)) { \
		do { \
			arch_mutex_cpu_relax(); \
		} while (!(ACCESS_ONCE(*(addr)) cond)); \
	}
#endif

smp_load_acquire_until(&node->locked, != 0);

Alternative implementations:

#define smp_load_acquire_until(addr, cond) { \
	while (!(ACCESS_ONCE(*(addr)) cond)) \
		arch_mutex_cpu_relax(); \
	smp_mb(); }

#define smp_load_acquire_until(addr, cond) \
	if (!(smp_load_acquire(addr) cond)) { \
		do { \
			arch_mutex_cpu_relax(); \
		} while (!(ACCESS_ONCE(*(addr)) cond)); \
		smp_mb(); \
	}

Paul E. McKenney

Nov 19, 2013, 2:20:03 PM
On Fri, Nov 08, 2013 at 11:52:15AM -0800, Tim Chen wrote:
> From: Waiman Long <Waima...@hp.com>
>
> The following changes are made:
>
> 1) Create a new mcs_spinlock.c file to contain the
> mcs_spin_lock() and mcs_spin_unlock() function.
> 2) Include a number of prerequisite header files and define
> arch_mutex_cpu_relax(), if not previously defined so the
> mcs functions can be compiled for multiple architecture without
> causing problems.
>
> Signed-off-by: Waiman Long <Waima...@hp.com>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>

Assuming issues called out in patch #1 are resolved and testing called
out for patch #2 has happened:

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Paul E. McKenney

Nov 19, 2013, 2:20:04 PM
Agreed, no one looks at this field in this case, so no need to initialize
it, unless for debug purposes.

> return;
> }
> ACCESS_ONCE(prev->next) = node;
> @@ -43,6 +48,10 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> arch_mutex_cpu_relax();
> }
>
> +/*
> + * Releases the lock. The caller should pass in the corresponding node that
> + * was used to acquire the lock.
> + */
> static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> {
> struct mcs_spinlock *next = ACCESS_ONCE(node->next);
> @@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
> /*
> * Release the lock by setting it to NULL
> */
> - if (cmpxchg(lock, node, NULL) == node)
> + if (likely(cmpxchg(lock, node, NULL) == node))

Agreed here as well. Takes a narrow race to hit this.

So, did your testing exercise this path? If the answer is "yes", and
if the issues that I called out in patch #1 are resolved:

Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Paul E. McKenney

Nov 19, 2013, 2:20:04 PM
On Fri, Nov 08, 2013 at 11:51:52AM -0800, Tim Chen wrote:
> We will need the MCS lock code for doing optimistic spinning for rwsem
> and queue rwlock. Extracting the MCS code from mutex.c and put into
> its own file allow us to reuse this code easily.
>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
> Signed-off-by: Davidlohr Bueso <davi...@hp.com>

Please see comments below.

Thanx, Paul
OK, the full memory barriers implied by xchg() ensure that *node will be
initialized before the "ACCESS_ONCE(prev->next) = node" below puts the
node into the list. This rules out the misordering scenario that Tim
Chen called out in message-id <1380322005.3467.186.camel@schen9-DESK>
on September 27th.

Assuming of course a corresponding barrier on the lock handoff side.

> + if (likely(prev == NULL)) {
> + /* Lock acquired */
> + node->locked = 1;
> + return;
> + }
> + ACCESS_ONCE(prev->next) = node;
> + smp_wmb();

I don't see what the above memory barrier does. Here are some things
that it cannot be doing:

o Ordering the insertion into the list above with the polling
below. First, smp_wmb() does not order prior writes against
later reads, and second misordering is harmless. If we start
polling before the insertion is complete, all that happens
is that the first few polls have no chance of seeing a lock
grant.

o Ordering the polling against the initialization -- the above
xchg() is already doing that for us.

So what is its purpose?

> + /* Wait until the lock holder passes the lock down */
> + while (!ACCESS_ONCE(node->locked))
> + arch_mutex_cpu_relax();

On the other hand, I don't see how we get away without a barrier here.
As written, what prevents the caller's load from ->owner from being
reordered with the above load from ->locked? (Perhaps you can argue
that such reordering is only a performance problem, but if so we need
that argument recorded in comments.)

Of course, if anyone ever tries to use mcs_spin_lock() as a full lock,
they will need a memory barrier here to prevent the critical section
from leaking out.
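
For concreteness, a sketch of the kind of barrier being asked for here
(this is essentially what the smp_load_acquire() conversion later in this
series provides):

	/* Wait until the lock holder passes the lock down. */
	while (!smp_load_acquire(&node->locked))
		arch_mutex_cpu_relax();
	/*
	 * Acquire semantics: nothing in the critical section can be
	 * reordered before the load that observed ->locked == 1.
	 */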

> +}
> +
> +static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> +{
> + struct mcs_spinlock *next = ACCESS_ONCE(node->next);
> +
> + if (likely(!next)) {
> + /*
> + * Release the lock by setting it to NULL
> + */
> + if (cmpxchg(lock, node, NULL) == node)
> + return;
> + /* Wait until the next pointer is set */
> + while (!(next = ACCESS_ONCE(node->next)))
> + arch_mutex_cpu_relax();
> + }

We need a memory barrier somewhere before here in this function,
otherwise the critical section can leak out. I do not believe that
we can rely on the prohibition against speculative stores that Peter
Zijlstra and I have been discussing because that does not provide the
transitivity required by locking primitives. I believe that we -could-
make the access below be an smp_store_release(), though.

Placing the barrier here (or at least not preceding the initial
fetch from node->next) has the advantage of allowing it to pair with
the xchg() in mcs_spin_lock(), though given the dependency only an
smp_read_barrier_depends() is required for that purpose.
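
In code form, the suggestion amounts to something like the following
(sketch only):

	/*
	 * Pass the lock to the next waiter.  Release semantics ensure
	 * that everything done in the critical section is visible to
	 * the next lock holder before it sees ->locked become 1.
	 */
	smp_store_release(&next->locked, 1);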

> + ACCESS_ONCE(next->locked) = 1;
> + smp_wmb();

I don't see what this barrier does for us. It is ordering the unlock
store with what, exactly?

If it really is doing something, we need a big fat comment stating what
that is, and checkpatch.pl will be happy to inform you. ;-)

Paul E. McKenney
Nov 19, 2013, 2:30:02 PM
On Fri, Nov 08, 2013 at 11:52:38AM -0800, Tim Chen wrote:
> From: Waiman Long <Waima...@hp.com>
>
> This patch corrects the way memory barriers are used in the MCS lock
> with the smp_load_acquire() and smp_store_release() functions.
> It removes the ones that are not needed.
>
> It uses architecture specific load-acquire and store-release
> primitives for synchronization, if available. Generic implementations
> are provided in case they are not defined even though they may not
> be optimal. These generic implementations could be removed later on
> once changes are made in all the relevant header files.
>
> Suggested-by: Michel Lespinasse <wal...@google.com>
> Signed-off-by: Waiman Long <Waima...@hp.com>
> Signed-off-by: Jason Low <jason...@hp.com>
> Signed-off-by: Tim Chen <tim.c...@linux.intel.com>

Please see comments below.

Thanx, Paul

But if this is xchg_acquire() with only acquire semantics, it need not
ensure that the initializations of node->locked and node->next above
will happen before the "ACCESS_ONCE(prev->next) = node" below. This
therefore needs to remain xchg(). Or you need an smp_store_release()
below instead of an ACCESS_ONCE() assignment.

As currently written, the poor CPU doing the unlock can be fatally
disappointed by seeing pre-initialized values of ->locked and ->next.
This could, among other things, result in a hang where the handoff
happens before the initialization.
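
Put differently, the two workable variants would look roughly like this
(illustrative sketch; xchg_acquire() here is the hypothetical acquire-only
exchange discussed above, not an API the patch uses):

	node->locked = 0;
	node->next = NULL;

	/* Variant 1: keep the full-barrier xchg(); the initialization
	 * above cannot be reordered past the linking store below. */
	prev = xchg(lock, node);
	if (prev)
		ACCESS_ONCE(prev->next) = node;

	/* Variant 2: with only acquire ordering on the exchange, the
	 * release ordering would have to move to the linking store. */
	prev = xchg_acquire(lock, node);	/* hypothetical */
	if (prev)
		smp_store_release(&prev->next, node);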

> if (likely(prev == NULL)) {
> /* Lock acquired */
> return;
> }
> ACCESS_ONCE(prev->next) = node;
> - smp_wmb();
> - /* Wait until the lock holder passes the lock down */
> - while (!ACCESS_ONCE(node->locked))
> + /*
> + * Wait until the lock holder passes the lock down.
> + * Using smp_load_acquire() provides a memory barrier that
> + * ensures subsequent operations happen after the lock is acquired.
> + */
> + while (!(smp_load_acquire(&node->locked)))
> arch_mutex_cpu_relax();

OK, this smp_load_acquire() makes sense!

> }
> EXPORT_SYMBOL_GPL(mcs_spin_lock);
> @@ -54,7 +83,7 @@ EXPORT_SYMBOL_GPL(mcs_spin_lock);
> * Releases the lock. The caller should pass in the corresponding node that
> * was used to acquire the lock.
> */
> -static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> {
> struct mcs_spinlock *next = ACCESS_ONCE(node->next);
>
> @@ -68,7 +97,12 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
> while (!(next = ACCESS_ONCE(node->next)))
> arch_mutex_cpu_relax();
> }
> - ACCESS_ONCE(next->locked) = 1;
> - smp_wmb();
> + /*
> + * Pass lock to next waiter.
> + * smp_store_release() provides a memory barrier to ensure
> + * all operations in the critical section have been completed
> + * before unlocking.
> + */
> + smp_store_release(&next->locked , 1);

This smp_store_release() makes sense as well!

Could you please get rid of the extraneous space before the comma?

Tim Chen
Nov 19, 2013, 2:50:01 PM
I haven't instrumented the code to check the hit rate of this path, but
the slow path will probably only get hit in some cases under heavy
contention.

Tim Chen
Nov 19, 2013, 2:50:02 PM
On Tue, 2013-11-19 at 11:10 -0800, Paul E. McKenney wrote:
> On Fri, Nov 08, 2013 at 11:51:52AM -0800, Tim Chen wrote:
> > We will need the MCS lock code for doing optimistic spinning for rwsem
> > and queue rwlock. Extracting the MCS code from mutex.c and putting it
> > into its own file allows us to reuse this code easily.
> >
> > Signed-off-by: Tim Chen <tim.c...@linux.intel.com>
> > Signed-off-by: Davidlohr Bueso <davi...@hp.com>
>
> Please see comments below.
>

Thanks for reviewing the code.
Agreed that the smp_wmb() is not needed. It is in the existing MCS code
residing in mutex.c; this patch only refactors that code and does not
correct the memory barriers.

The particular smp_wmb() is removed in Patch 4/4, which corrects the
memory barriers.

>
> > + /* Wait until the lock holder passes the lock down */
> > + while (!ACCESS_ONCE(node->locked))
> > + arch_mutex_cpu_relax();
>
> On the other hand, I don't see how we get away without a barrier here.
> As written, what prevents the caller's load from ->owner from being
> reordered with the above load from ->locked? (Perhaps you can argue
> that such reordering is only a performance problem, but if so we need
> that argument recorded in comments.)
>
> Of course, if anyone ever tries to use mcs_spin_lock() as a full lock,
> they will need a memory barrier here to prevent the critical section
> from leaking out.

Agreed. The appropriate memory barrier is added in Patch 4/4.

Tim Chen

unread,
Nov 19, 2013, 2:50:02 PM11/19/13
to
Good point. Will keep it as xchg.
Will do.

Paul E. McKenney
Nov 19, 2013, 3:00:01 PM
Ah, so I should have been more aggressive about reviewing some time back,
then... ;-)

Thanx, Paul

Tim Chen
Nov 19, 2013, 6:10:01 PM
On Tue, 2013-11-19 at 11:13 -0800, Paul E. McKenney wrote:
> On Fri, Nov 08, 2013 at 11:52:05AM -0800, Tim Chen wrote:
> >
> > +/*
> > + * Releases the lock. The caller should pass in the corresponding node that
> > + * was used to acquire the lock.
> > + */
> > static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> > {
> > struct mcs_spinlock *next = ACCESS_ONCE(node->next);
> > @@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
> > /*
> > * Release the lock by setting it to NULL
> > */
> > - if (cmpxchg(lock, node, NULL) == node)
> > + if (likely(cmpxchg(lock, node, NULL) == node))
>
> Agreed here as well. Takes a narrow race to hit this.
>
> So, did your testing exercise this path? If the answer is "yes",


Paul,

I did some instrumentation and confirmed that the path in question has
been exercised. So this patch should be okay.
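
(For the curious, the kind of throwaway instrumentation that answers this
sort of question can be as simple as a debug counter on the slow path; the
sketch below is purely illustrative, not the exact code that was run:

	static atomic_t mcs_unlock_race_count = ATOMIC_INIT(0);	/* debug only */

	if (likely(cmpxchg(lock, node, NULL) == node))
		return;
	atomic_inc(&mcs_unlock_race_count);	/* count slow-path hits */
	/* Wait until the next pointer is set */
	while (!(next = ACCESS_ONCE(node->next)))
		arch_mutex_cpu_relax();

and then reading the counter back after the test run.)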

Tim

> and if the issues that I called out in patch #1 are resolved:
>
> Reviewed-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
>
> > return;
> > /* Wait until the next pointer is set */
> > while (!(next = ACCESS_ONCE(node->next)))
> >

--

Paul E. McKenney
Nov 19, 2013, 6:10:02 PM
On Tue, Nov 19, 2013 at 02:57:41PM -0800, Tim Chen wrote:
> On Tue, 2013-11-19 at 11:13 -0800, Paul E. McKenney wrote:
> > On Fri, Nov 08, 2013 at 11:52:05AM -0800, Tim Chen wrote:
> > >
> > > +/*
> > > + * Releases the lock. The caller should pass in the corresponding node that
> > > + * was used to acquire the lock.
> > > + */
> > > static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> > > {
> > > struct mcs_spinlock *next = ACCESS_ONCE(node->next);
> > > @@ -51,7 +60,7 @@ static void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *nod
> > > /*
> > > * Release the lock by setting it to NULL
> > > */
> > > - if (cmpxchg(lock, node, NULL) == node)
> > > + if (likely(cmpxchg(lock, node, NULL) == node))
> >
> > Agreed here as well. Takes a narrow race to hit this.
> >
> > So, did your testing exercise this path? If the answer is "yes",
>
>
> Paul,
>
> I did some instrumentation and confirmed that the path in question has
> been exercised. So this patch should be okay.

Very good!

Thanx, Paul