[PATCH tip/core/rcu 16/16] rcu: Remove "extern" from function declarations in kernel/rcu/rcu.h

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:02 AM11/16/13

to

From: Teodora Baluta <teob...@gmail.com>

Function prototypes don't need to have the "extern" keyword since this
is the default behavior. Its explicit use is redundant. This commit
therefore removes it.

Signed-off-by: Teodora Baluta <teob...@gmail.com>
Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---
kernel/rcu/rcu.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index a8f981a2d110..79c3877e9c5b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -96,7 +96,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

-extern void kfree(const void *);
+void kfree(const void *);

static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
--
1.8.1.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:02 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

The current task-level idle entry/exit code forces an entry/exit on
each call, regardless of the nesting level. This commit therefore
properly accounts for nesting.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Reviewed-by: Frederic Weisbecker <fwei...@gmail.com>
---
kernel/rcu/tree.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c01213b19dee..acbfead2eb82 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -418,11 +418,12 @@ static void rcu_eqs_enter(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
- if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+ if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
- else
+ rcu_eqs_enter_common(rdtp, oldval, user);
+ } else {
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
- rcu_eqs_enter_common(rdtp, oldval, user);
+ }
}

/**
@@ -540,11 +541,12 @@ static void rcu_eqs_exit(bool user)
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(oldval < 0);
- if (oldval & DYNTICK_TASK_NEST_MASK)
+ if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- else
+ } else {
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(rdtp, oldval, user);
+ rcu_eqs_exit_common(rdtp, oldval, user);
+ }
}

/**

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:02 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

Some RCU bugs have been specific to the layout of the rcu_node tree,
but RCU will silently adjust the tree at boot time if appropriate.
This obscures valuable debugging information, so print a message when
this happens.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

---
kernel/rcu/tree.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index acbfead2eb82..875f2a0f9d8d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3378,6 +3378,8 @@ static void __init rcu_init_geometry(void)
if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
nr_cpu_ids == NR_CPUS)
return;
+ pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n",
+ rcu_fanout_leaf, nr_cpu_ids);

/*
* Compute number of nodes that can be handled an rcu_node tree

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:02 AM11/16/13

to

From: Fengguang Wu <fenggu...@intel.com>

This commit fixes the following coccinelle warning:

kernel/rcu/tree.c:712:9-10: WARNING: return of 0/1 in function
'rcu_lockdep_current_cpu_online' with return type bool

Return statements in functions returning bool should use
true/false instead of 1/0.
Generated by: coccinelle/misc/boolreturn.cocci

Signed-off-by: Fengguang Wu <fenggu...@intel.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/tree.c | 2 +-

1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 875f2a0f9d8d..e0a58eca0092 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -725,7 +725,7 @@ bool rcu_lockdep_current_cpu_online(void)
bool ret;

if (in_nmi())
- return 1;
+ return true;
preempt_disable();
rdp = this_cpu_ptr(&rcu_sched_data);
rnp = rdp->mynode;

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:02 AM11/16/13

to

From: Chen Gang <gang...@asianux.com>

If the rcutorture SRCU output exceeds 4096 bytes, for example, if you
have more than about 75 CPUs, it will overflow the current statically
allocated buffer. This commit therefore replaces this static buffer
with a dynamically allocated buffer whose size is based on the number of CPUs.

Benefits:

- Avoids both buffer overflow and output truncation.
- Handles an arbitrarily large number of CPUs.
- Straightforward implementation.

Shortcomings:

- Some memory is wasted:

1 CPU now consumes 50 - 60 bytes, and this patch provides 200 bytes.
Therefore, for 1K CPUs, roughly 100KB of memory will be wasted.
However, the memory is freed immediately after printing, so this
wastage should not be a problem in practice.

Testing (Fedora16 2 CPUs, 2GB RAM x86_64):

- as module, with/without "torture_type=srcu".
- built-in, not runnable at boot, with/without "torture_type=srcu".
- built-in, runnable at boot, with/without "torture_type=srcu".

Signed-off-by: Chen Gang <gang...@asianux.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/torture.c | 67 ++++++++++++++++++++++++++--------------------------
1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c
index 69a4ec80a788..732f8ae3086a 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/torture.c
@@ -139,8 +139,6 @@ MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
#define VERBOSE_PRINTK_ERRSTRING(s) \
do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)

-static char printk_buf[4096];
-
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
@@ -376,7 +374,7 @@ struct rcu_torture_ops {
void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
void (*cb_barrier)(void);
void (*fqs)(void);
- int (*stats)(char *page);
+ void (*stats)(char *page);
int irq_capable;
int can_boost;
const char *name;
@@ -578,21 +576,19 @@ static void srcu_torture_barrier(void)
srcu_barrier(&srcu_ctl);
}

-static int srcu_torture_stats(char *page)
+static void srcu_torture_stats(char *page)
{
- int cnt = 0;
int cpu;
int idx = srcu_ctl.completed & 0x1;

- cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):",
+ page += sprintf(page, "%s%s per-CPU(idx=%d):",
torture_type, TORTURE_FLAG, idx);
for_each_possible_cpu(cpu) {
- cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu,
+ page += sprintf(page, " %d(%lu,%lu)", cpu,
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx],
per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]);
}
- cnt += sprintf(&page[cnt], "\n");
- return cnt;
+ sprintf(page, "\n");
}

static void srcu_torture_synchronize_expedited(void)
@@ -1052,10 +1048,9 @@ rcu_torture_reader(void *arg)
/*
* Create an RCU-torture statistics message in the specified buffer.
*/
-static int
+static void
rcu_torture_printk(char *page)
{
- int cnt = 0;
int cpu;
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
@@ -1071,8 +1066,8 @@ rcu_torture_printk(char *page)
if (pipesummary[i] != 0)
break;
}
- cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt],
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
"rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
rcu_torture_current,
rcu_torture_current_version,
@@ -1080,53 +1075,52 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ",
+ page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ",
atomic_read(&n_rcu_torture_mberror),
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror);
- cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ",
+ page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
- cnt += sprintf(&page[cnt],
+ page += sprintf(page,
"onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
n_online_successes, n_online_attempts,
n_offline_successes, n_offline_attempts,
min_online, max_online,
min_offline, max_offline,
sum_online, sum_offline, HZ);
- cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
+ page += sprintf(page, "barrier: %ld/%ld:%ld",
n_barrier_successes,
n_barrier_attempts,
n_rcu_torture_barrier_error);
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_barrier_error != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
n_rcu_torture_boost_failure != 0 ||
i > 1) {
- cnt += sprintf(&page[cnt], "!!! ");
+ page += sprintf(page, "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(1);
}
- cnt += sprintf(&page[cnt], "Reader Pipe: ");
+ page += sprintf(page, "Reader Pipe: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt], "Reader Batch: ");
+ page += sprintf(page, " %ld", pipesummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Reader Batch: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
- cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
- cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
- cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
+ page += sprintf(page, " %ld", batchsummary[i]);
+ page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page, "Free-Block Circulation: ");
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- cnt += sprintf(&page[cnt], " %d",
+ page += sprintf(page, " %d",
atomic_read(&rcu_torture_wcount[i]));
}
- cnt += sprintf(&page[cnt], "\n");
+ page += sprintf(page, "\n");
if (cur_ops->stats)
- cnt += cur_ops->stats(&page[cnt]);
- return cnt;
+ cur_ops->stats(page);
}

/*
@@ -1140,10 +1134,17 @@ rcu_torture_printk(char *page)
static void
rcu_torture_stats_print(void)
{
- int cnt;
+ int size = nr_cpu_ids * 200 + 8192;
+ char *buf;

- cnt = rcu_torture_printk(printk_buf);
- pr_alert("%s", printk_buf);
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("rcu-torture: Out of memory, need: %d", size);
+ return;
+ }
+ rcu_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
}

/*

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

Whenever a CPU receives a scheduling-clock interrupt, RCU checks to see
if the RCU core needs anything from this CPU. If so, RCU raises
RCU_SOFTIRQ to carry out any needed processing.

This approach has worked well historically, but it is undesirable on
NO_HZ_FULL CPUs. Such CPUs are expected to spend almost all of their time
in userspace, so that scheduling-clock interrupts can be disabled while
there is only one runnable task on the CPU in question. Unfortunately,
raising any softirq has the potential to wake up ksoftirqd, which would
provide the second runnable task on that CPU, preventing disabling of
scheduling-clock interrupts.

What is needed instead is for RCU to leave NO_HZ_FULL CPUs alone,
relying on the grace-period kthreads' quiescent-state forcing to
do any needed RCU work on behalf of those CPUs.

This commit therefore refrains from raising RCU_SOFTIRQ on any
NO_HZ_FULL CPUs during any grace periods that have been in effect
for less than one second. The one-second limit handles the case
where an inappropriate workload is running on a NO_HZ_FULL CPU
that features lots of scheduling-clock interrupts, but no idle
or userspace time.

Reported-by: Mike Galbraith <bitb...@online.de>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Tested-by: Mike Galbraith <bitb...@online.de>
Toasted-by: Frederic Weisbecker <fwei...@gmail.com>
---
kernel/rcu/tree.c | 4 ++++
kernel/rcu/tree.h | 1 +
kernel/rcu/tree_plugin.h | 20 ++++++++++++++++++++
3 files changed, 25 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 13d1a1a0d60a..7be5efd62fe5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2783,6 +2783,10 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rsp, rdp);

+ /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
+ if (rcu_nohz_full_cpu(rsp))
+ return 0;
+
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
rdp->qs_pending && !rdp->passed_quiesce) {
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index a87adfc2916b..8c19873f1ac9 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -571,6 +571,7 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
unsigned long maxj);
static void rcu_bind_gp_kthread(void);
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp);

#endif /* #ifndef RCU_TREE_NONCORE */

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 29335faf96e7..1aa33a59fadc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2872,3 +2872,23 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
}

#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+
+/*
+ * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
+ * grace-period kthread will do force_quiescent_state() processing?
+ * The idea is to avoid waking up RCU core processing on such a
+ * CPU unless the grace period has extended for too long.
+ *
+ * This code relies on the fact that all NO_HZ_FULL CPUs are also
+ * CONFIG_RCU_NOCB_CPUs.
+ */
+static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(smp_processor_id()) &&
+ (!rcu_gp_in_progress(rsp) ||
+ ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
+ return 1;
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
+ return 0;
+}

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

Hello!

This series contains miscellaneous fixes for RCU:

1. Kick CPUs when they get halfway to the stall-warning timeout.

2. Because wait_event() and wake_up() do not necessarily imply
full memory barriers, supply them as needed.

3. Fix a deadlock between RCU, scheduler, and perf by deferring
wake-ups until locks are dropped.

4. Allow task-level idle entry/exit nesting.

5. Fix srcu_barrier()'s docbook header.

6. Announce rcu_node geometry adjustment at boot time.

7. Fix some bool-return bugs detected by coccinelle,
courtesy of Fengguang Wu.

8. Make CONFIG_RCU_FANOUT_EXACT work correctly for oddball
CONFIG_RCU_FANOUT and CONFIG_RCU_FANOUT_LEAF values.
(They were so oddball that even I wasn't testing them,
but they still need to be fixed.)

9. Provide function-header documentation of memory-barrier guarantees
provided by synchronize_srcu() and call_srcu().

10. Improve diagnostics for blocking in RCU callback functions.

11. Warn on allegedly impossible rcu_read_unlock_special() from irq.
And fix the first allegedly impossible warning that triggered. ;-)
Courtesy of Lai Jiangshan.

12. Add smp_mb__after_srcu_read_unlock() to force full barrier
in conjunction with srcu_read_unlock(), courtesy of Michael
S. Tsirkin.

13. Refuse to activate RCU core on NO_HZ_FULL CPUs, at least until
the grace period gets too long in the tooth.

14. Dynamically allocate rcutorture buffer for SRCU output to avoid
potential buffer overflows, courtesy of Chen Gang.

15-16: Remove "extern" declarations, courtesy of Teodora Baluta.

Thanx, Paul

b/Documentation/RCU/trace.txt | 20 +++++----
b/include/linux/rculist.h | 4 -
b/include/linux/rcupdate.h | 81 +++++++++++++++++++-------------------
b/include/linux/rcutiny.h | 2
b/include/linux/rcutree.h | 36 ++++++++--------
b/include/linux/srcu.h | 14 ++++++
b/kernel/rcu/rcu.h | 5 +-
b/kernel/rcu/srcu.c | 57 +++++++++++++++++++++++---
b/kernel/rcu/torture.c | 75 ++++++++++++++++++-----------------
b/kernel/rcu/tree.c | 79 ++++++++++++++++++++++++++++++-------
b/kernel/rcu/tree.h | 12 ++++-
b/kernel/rcu/tree_plugin.h | 89 +++++++++++++++++++++++++++++++++++-------
b/kernel/rcu/tree_trace.c | 3 -
b/kernel/rcu/update.c | 5 ++
14 files changed, 339 insertions(+), 143 deletions(-)

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

Currently blocking in an RCU callback function will result in
"scheduling while atomic", which could be triggered for any number
of reasons. To aid debugging, this patch introduces an rcu_callback_map
that is used to tie the inappropriate voluntary context switch back
to the fact that the function is being invoked from within a callback.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

---
include/linux/rcupdate.h | 1 +
kernel/rcu/rcu.h | 3 +++
kernel/rcu/update.c | 5 +++++
3 files changed, 9 insertions(+)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 39cbb889e20d..a94a5805d378 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -325,6 +325,7 @@ static inline void rcu_lock_release(struct lockdep_map *map)
extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
+extern struct lockdep_map rcu_callback_map;
extern int debug_lockdep_rcu_enabled(void);

/**
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7859a0a3951e..a8f981a2d110 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -102,13 +102,16 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;

+ rcu_lock_acquire(&rcu_callback_map);
if (__is_kfree_rcu_offset(offset)) {
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
+ rcu_lock_release(&rcu_callback_map);
return 1;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
+ rcu_lock_release(&rcu_callback_map);
return 0;
}
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 6cb3dff89e2b..802365ccd591 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -128,6 +128,11 @@ struct lockdep_map rcu_sched_lock_map =
STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);

+static struct lock_class_key rcu_callback_key;
+struct lockdep_map rcu_callback_map =
+ STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
+EXPORT_SYMBOL_GPL(rcu_callback_map);
+
int notrace debug_lockdep_rcu_enabled(void)
{
return rcu_scheduler_active && debug_locks &&
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

It is all too easy to forget that wait_event() does not necessarily
imply a full memory barrier. The case where it does not is where the
condition transitions to true just as wait_event() starts execution.
This is actually a feature: The standard use of wait_event() involves
locking, in which case the locks provide the needed ordering (you hold a
lock across the wake_up() and acquire that same lock after wait_event()
returns).

Given that I did forget that wait_event() does not necessarily imply a
full memory barrier in one case, this commit fixes that case. This commit
also adds comments calling out the placement of existing memory barriers
relied on by wait_event() calls.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/torture.c | 8 +++++---
kernel/rcu/tree.c | 3 +++
kernel/rcu/tree_plugin.h | 6 +++++-
3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c
index 3929cd451511..69a4ec80a788 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/torture.c
@@ -1578,6 +1578,7 @@ static int rcu_torture_barrier_cbs(void *arg)
{
long myid = (long)arg;
bool lastphase = 0;
+ bool newphase;
struct rcu_head rcu;

init_rcu_head_on_stack(&rcu);
@@ -1585,10 +1586,11 @@ static int rcu_torture_barrier_cbs(void *arg)
set_user_nice(current, 19);
do {
wait_event(barrier_cbs_wq[myid],
- barrier_phase != lastphase ||
+ (newphase =
+ ACCESS_ONCE(barrier_phase)) != lastphase ||
kthread_should_stop() ||
fullstop != FULLSTOP_DONTSTOP);
- lastphase = barrier_phase;
+ lastphase = newphase;
smp_mb(); /* ensure barrier_phase load before ->call(). */
if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
break;
@@ -1625,7 +1627,7 @@ static int rcu_torture_barrier(void *arg)
if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
break;
n_barrier_attempts++;
- cur_ops->cb_barrier();
+ cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) {
n_rcu_torture_barrier_error++;
WARN_ON_ONCE(1);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e00946e432ae..fa496ea405a1 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1520,6 +1520,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
rdp = this_cpu_ptr(rsp->rda);
if (rnp == rdp->mynode)
__note_gp_changes(rsp, rnp, rdp);
+ /* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched();
@@ -1564,6 +1565,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
wait_event_interruptible(rsp->gp_wq,
ACCESS_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ /* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
cond_resched();
@@ -1593,6 +1595,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
(!ACCESS_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp)),
j);
+ /* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!ACCESS_ONCE(rnp->qsmask) &&
!rcu_preempt_blocked_readers_cgp(rnp))
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3822ac0c4b27..c2aab07411dc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -779,8 +779,10 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (wake)
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
wake_up(&sync_rcu_preempt_exp_wq);
+ }
break;
}
mask = rnp->grpmask;
@@ -1852,6 +1854,7 @@ static int rcu_oom_notify(struct notifier_block *self,

/* Wait for callbacks from earlier instance to complete. */
wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+ smp_mb(); /* Ensure callback reuse happens after callback invocation. */

/*
* Prevent premature wakeup: ensure that all increments happen
@@ -2250,6 +2253,7 @@ static int rcu_nocb_kthread(void *arg)
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("Sleep"));
wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
+ /* Memory barrier provide by xchg() below. */
} else if (firsttime) {
firsttime = 0;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

The srcu_barrier() docbook header left out the "sp" argument, so this
commit adds that argument's docbook text.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/srcu.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..0f0c63111f20 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -491,6 +491,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);

/**
* srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
+ * @sp: srcu_struct on which to wait for in-flight callbacks.
*/
void srcu_barrier(struct srcu_struct *sp)
{
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

This commit documents the memory-barrier guarantees provided by
synchronize_srcu() and call_srcu().

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/srcu.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 0f0c63111f20..3318d8284384 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -363,6 +363,29 @@ static void srcu_flip(struct srcu_struct *sp)
/*
* Enqueue an SRCU callback on the specified srcu_struct structure,
* initiating grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical sections. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_srcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_srcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_srcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
*/
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
void (*func)(struct rcu_head *head))
@@ -459,7 +482,30 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
* Note that it is illegal to call synchronize_srcu() from the corresponding
* SRCU read-side critical section; doing so will result in deadlock.
* However, it is perfectly legal to call synchronize_srcu() on one
- * srcu_struct from some other srcu_struct's read-side critical section.
+ * srcu_struct from some other srcu_struct's read-side critical section,
+ * as long as the resulting graph of srcu_structs is acyclic.
+ *
+ * There are memory-ordering constraints implied by synchronize_srcu().
+ * On systems with more than one CPU, when synchronize_srcu() returns,
+ * each CPU is guaranteed to have executed a full memory barrier since
+ * the end of its last corresponding SRCU-sched read-side critical section
+ * whose beginning preceded the call to synchronize_srcu(). In addition,
+ * each CPU having an SRCU read-side critical section that extends beyond
+ * the return from synchronize_srcu() is guaranteed to have executed a
+ * full memory barrier after the beginning of synchronize_srcu() and before
+ * the beginning of that SRCU read-side critical section. Note that these
+ * guarantees include CPUs that are offline, idle, or executing in user mode,
+ * as well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked synchronize_srcu(), which returned
+ * to its caller on CPU B, then both CPU A and CPU B are guaranteed
+ * to have executed a full memory barrier during the execution of
+ * synchronize_srcu(). This guarantee applies even if CPU A and CPU B
+ * are the same CPU, but again only if the system has more than one CPU.
+ *
+ * Of course, these memory-ordering guarantees apply only when
+ * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
+ * passed the same srcu_struct structure.
*/
void synchronize_srcu(struct srcu_struct *sp)
{
@@ -476,12 +522,8 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
* Wait for an SRCU grace period to elapse, but be more aggressive about
* spinning rather than blocking when waiting.
*
- * Note that it is also illegal to call synchronize_srcu_expedited()
- * from the corresponding SRCU read-side critical section;
- * doing so will result in deadlock. However, it is perfectly legal
- * to call synchronize_srcu_expedited() on one srcu_struct from some
- * other srcu_struct's read-side critical section, as long as
- * the resulting graph of srcu_structs is acyclic.
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Michael S. Tsirkin" <m...@redhat.com>

The srcu_read_unlock() implementation currently contains a full memory
barrier, but that has not always been the case and it someday again might
not be the case. However, given that the memory barrier is there, it is
useful to be able to rely on it. This commit therefore adds a new
smp_mb__after_srcu_read_unlock() API that, together with a preceding
srcu_read_unlock(), is guaranteed to provide a full barrier.

Signed-off-by: Michael S. Tsirkin <m...@redhat.com>
Acked-by: "Lai Jiangshan" <la...@cn.fujitsu.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

include/linux/srcu.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index c114614ed172..9b058eecd403 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -237,4 +237,18 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
__srcu_read_unlock(sp, idx);
}

+/**
+ * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
+ *
+ * Converts the preceding srcu_read_unlock into a two-way memory barrier.
+ *
+ * Call this after srcu_read_unlock, to guarantee that all memory operations
+ * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
+ * the preceding srcu_read_unlock.
+ */
+static inline void smp_mb__after_srcu_read_unlock(void)
+{
+ /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
+}
+
#endif
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:04 AM11/16/13

to

From: Teodora Baluta <teob...@gmail.com>

Function prototypes don't need to have the "extern" keyword since this
is the default behavior. Its explicit use is redundant. This commit
therefore removes it.

Signed-off-by: Teodora Baluta <teob...@gmail.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

include/linux/rculist.h | 4 +--
include/linux/rcupdate.h | 80 ++++++++++++++++++++++++------------------------
include/linux/rcutiny.h | 2 +-
include/linux/rcutree.h | 36 +++++++++++-----------
4 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 45a0a9e81478..dbaf99084112 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -55,8 +55,8 @@ static inline void __list_add_rcu(struct list_head *new,
next->prev = new;
}
#else
-extern void __list_add_rcu(struct list_head *new,
- struct list_head *prev, struct list_head *next);
+void __list_add_rcu(struct list_head *new,
+ struct list_head *prev, struct list_head *next);
#endif

/**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a94a5805d378..52c1b13c4d76 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -50,13 +50,13 @@ extern int rcutorture_runnable; /* for sysctl */
#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
-extern void rcutorture_record_test_transition(void);
-extern void rcutorture_record_progress(unsigned long vernum);
-extern void do_trace_rcu_torture_read(const char *rcutorturename,
- struct rcu_head *rhp,
- unsigned long secs,
- unsigned long c_old,
- unsigned long c);
+void rcutorture_record_test_transition(void);
+void rcutorture_record_progress(unsigned long vernum);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
#else
static inline void rcutorture_record_test_transition(void)
{
@@ -65,11 +65,11 @@ static inline void rcutorture_record_progress(unsigned long vernum)
{
}
#ifdef CONFIG_RCU_TRACE
-extern void do_trace_rcu_torture_read(const char *rcutorturename,
- struct rcu_head *rhp,
- unsigned long secs,
- unsigned long c_old,
- unsigned long c);
+void do_trace_rcu_torture_read(const char *rcutorturename,
+ struct rcu_head *rhp,
+ unsigned long secs,
+ unsigned long c_old,
+ unsigned long c);
#else
#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
do { } while (0)
@@ -118,8 +118,8 @@ extern void do_trace_rcu_torture_read(const char *rcutorturename,
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*/
-extern void call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
+void call_rcu(struct rcu_head *head,
+ void (*func)(struct rcu_head *head));

#else /* #ifdef CONFIG_PREEMPT_RCU */

@@ -149,8 +149,8 @@ extern void call_rcu(struct rcu_head *head,
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
*/
-extern void call_rcu_bh(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
+void call_rcu_bh(struct rcu_head *head,
+ void (*func)(struct rcu_head *head));

/**
* call_rcu_sched() - Queue an RCU for invocation after sched grace period.
@@ -171,16 +171,16 @@ extern void call_rcu_bh(struct rcu_head *head,
* See the description of call_rcu() for more detailed information on
* memory ordering guarantees.
*/
-extern void call_rcu_sched(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu));
+void call_rcu_sched(struct rcu_head *head,
+ void (*func)(struct rcu_head *rcu));

-extern void synchronize_sched(void);
+void synchronize_sched(void);

#ifdef CONFIG_PREEMPT_RCU

-extern void __rcu_read_lock(void);
-extern void __rcu_read_unlock(void);
-extern void rcu_read_unlock_special(struct task_struct *t);
+void __rcu_read_lock(void);
+void __rcu_read_unlock(void);
+void rcu_read_unlock_special(struct task_struct *t);
void synchronize_rcu(void);

/*
@@ -216,19 +216,19 @@ static inline int rcu_preempt_depth(void)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/* Internal to kernel */
-extern void rcu_init(void);
-extern void rcu_sched_qs(int cpu);
-extern void rcu_bh_qs(int cpu);
-extern void rcu_check_callbacks(int cpu, int user);
+void rcu_init(void);
+void rcu_sched_qs(int cpu);
+void rcu_bh_qs(int cpu);
+void rcu_check_callbacks(int cpu, int user);
struct notifier_block;
-extern void rcu_idle_enter(void);
-extern void rcu_idle_exit(void);
-extern void rcu_irq_enter(void);
-extern void rcu_irq_exit(void);
+void rcu_idle_enter(void);
+void rcu_idle_exit(void);
+void rcu_irq_enter(void);
+void rcu_irq_exit(void);

#ifdef CONFIG_RCU_USER_QS
-extern void rcu_user_enter(void);
-extern void rcu_user_exit(void);
+void rcu_user_enter(void);
+void rcu_user_exit(void);
#else
static inline void rcu_user_enter(void) { }
static inline void rcu_user_exit(void) { }
@@ -262,7 +262,7 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
} while (0)

#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
-extern bool __rcu_is_watching(void);
+bool __rcu_is_watching(void);
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */

/*
@@ -289,8 +289,8 @@ void wait_rcu_gp(call_rcu_func_t crf);
* initialization.
*/
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-extern void init_rcu_head_on_stack(struct rcu_head *head);
-extern void destroy_rcu_head_on_stack(struct rcu_head *head);
+void init_rcu_head_on_stack(struct rcu_head *head);
+void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
static inline void init_rcu_head_on_stack(struct rcu_head *head)
{
@@ -363,7 +363,7 @@ static inline int rcu_read_lock_held(void)
* rcu_read_lock_bh_held() is defined out of line to avoid #include-file
* hell.
*/
-extern int rcu_read_lock_bh_held(void);
+int rcu_read_lock_bh_held(void);

/**
* rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
@@ -449,7 +449,7 @@ static inline int rcu_read_lock_sched_held(void)

#ifdef CONFIG_PROVE_RCU

-extern int rcu_my_thread_group_empty(void);
+int rcu_my_thread_group_empty(void);

/**
* rcu_lockdep_assert - emit lockdep splat if specified condition not met
@@ -1006,7 +1006,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))

#ifdef CONFIG_RCU_NOCB_CPU
-extern bool rcu_is_nocb_cpu(int cpu);
+bool rcu_is_nocb_cpu(int cpu);
#else
static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
@@ -1014,8 +1014,8 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }

/* Only for use by adaptive-ticks code. */
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-extern bool rcu_sys_is_idle(void);
-extern void rcu_sysidle_force_exit(void);
+bool rcu_sys_is_idle(void);
+void rcu_sysidle_force_exit(void);
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */

static inline bool rcu_sys_is_idle(void)
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 09ebcbe9fd78..6f01771b571c 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -125,7 +125,7 @@ static inline void exit_rcu(void)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern int rcu_scheduler_active __read_mostly;
-extern void rcu_scheduler_starting(void);
+void rcu_scheduler_starting(void);
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
static inline void rcu_scheduler_starting(void)
{
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 4b9c81548742..72137ee8c603 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,9 +30,9 @@
#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H

-extern void rcu_note_context_switch(int cpu);
-extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies);
-extern void rcu_cpu_stall_reset(void);
+void rcu_note_context_switch(int cpu);
+int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies);
+void rcu_cpu_stall_reset(void);

/*
* Note a virtualization-based context switch. This is simply a
@@ -44,9 +44,9 @@ static inline void rcu_virt_note_context_switch(int cpu)
rcu_note_context_switch(cpu);
}

-extern void synchronize_rcu_bh(void);
-extern void synchronize_sched_expedited(void);
-extern void synchronize_rcu_expedited(void);
+void synchronize_rcu_bh(void);
+void synchronize_sched_expedited(void);
+void synchronize_rcu_expedited(void);

void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));

@@ -71,25 +71,25 @@ static inline void synchronize_rcu_bh_expedited(void)
synchronize_sched_expedited();
}

-extern void rcu_barrier(void);
-extern void rcu_barrier_bh(void);
-extern void rcu_barrier_sched(void);
+void rcu_barrier(void);
+void rcu_barrier_bh(void);
+void rcu_barrier_sched(void);

extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
-extern long rcu_batches_completed(void);
-extern long rcu_batches_completed_bh(void);
-extern long rcu_batches_completed_sched(void);
+long rcu_batches_completed(void);
+long rcu_batches_completed_bh(void);
+long rcu_batches_completed_sched(void);

-extern void rcu_force_quiescent_state(void);
-extern void rcu_bh_force_quiescent_state(void);
-extern void rcu_sched_force_quiescent_state(void);
+void rcu_force_quiescent_state(void);
+void rcu_bh_force_quiescent_state(void);
+void rcu_sched_force_quiescent_state(void);

-extern void exit_rcu(void);
+void exit_rcu(void);

-extern void rcu_scheduler_starting(void);
+void rcu_scheduler_starting(void);
extern int rcu_scheduler_active __read_mostly;

-extern bool rcu_is_watching(void);
+bool rcu_is_watching(void);

#endif /* __LINUX_RCUTREE_H */
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

When an RCU CPU stall warning occurs, the CPU invokes resched_cpu() on
itself. This can help move the grace period forward in some situations,
but it would be even better to do this -before- the RCU CPU stall warning.
This commit therefore causes resched_cpu() to be called every five jiffies
once the system is halfway to an RCU CPU stall warning.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/tree.c | 26 +++++++++++++++++++++++++-
kernel/rcu/tree.h | 2 ++
2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8a2c81e86dda..e00946e432ae 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -755,6 +755,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
}

/*
+ * This function really isn't for public consumption, but RCU is special in
+ * that context switches can allow the state machine to make progress.
+ */
+extern void resched_cpu(int cpu);
+
+/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -812,16 +818,34 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
*/
rcu_kick_nohz_cpu(rdp->cpu);

+ /*
+ * Alternatively, the CPU might be running in the kernel
+ * for an extended period of time without a quiescent state.
+ * Attempt to force the CPU through the scheduler to gain the
+ * needed quiescent state, but only if the grace period has gone
+ * on for an uncommonly long time. If there are many stuck CPUs,
+ * we will beat on the first one until it gets unstuck, then move
+ * to the next. Only do this for the primary flavor of RCU.
+ */
+ if (rdp->rsp == rcu_state &&
+ ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
+ rdp->rsp->jiffies_resched += 5;
+ resched_cpu(rdp->cpu);
+ }
+
return 0;
}

static void record_gp_stall_check_time(struct rcu_state *rsp)
{
unsigned long j = ACCESS_ONCE(jiffies);
+ unsigned long j1;

rsp->gp_start = j;
smp_wmb(); /* Record start time before stall time. */
- rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
+ j1 = rcu_jiffies_till_stall_check();
+ rsp->jiffies_stall = j + j1;
+ rsp->jiffies_resched = j + j1 / 2;
}

/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 52be957c9fe2..8e34d8674a4e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -453,6 +453,8 @@ struct rcu_state {
/* but in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
+ unsigned long jiffies_resched; /* Time at which to resched */
+ /* a reluctant CPU. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
const char *name; /* Name of structure. */
--
1.8.1.5

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:03 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

Each element of the rcu_state structure's ->levelspread[] array
is intended to contain the per-level fanout, where the zero-th
element corresponds to the root of the rcu_node tree, and the last
element corresponds to the leaves. In the CONFIG_RCU_FANOUT_EXACT
case, this means that the last element should be filled in
from CONFIG_RCU_FANOUT_LEAF (or from the rcu_fanout_leaf boot
parameter, if provided) and that the remaining elements should
be filled in from CONFIG_RCU_FANOUT. Unfortunately, the current
code in rcu_init_levelspread() takes the opposite approach, placing
CONFIG_RCU_FANOUT_LEAF in the zero-th element and CONFIG_RCU_FANOUT in
the remaining elements.

For typical power-of-two values, this generates odd but functional
rcu_node trees. However, other values, for example CONFIG_RCU_FANOUT=3
and CONFIG_RCU_FANOUT_LEAF=2, generate trees that can leave some CPUs
out of the grace-period computation, resulting in too-short grace periods
and therefore a broken RCU implementation.

This commit therefore fixes rcu_init_levelspread() to set the last
->levelspread[] array element from CONFIG_RCU_FANOUT_LEAF and the
remaining elements from CONFIG_RCU_FANOUT, thus generating the
intended rcu_node trees.

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/tree.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e0a58eca0092..13d1a1a0d60a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3246,9 +3246,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
{
int i;

- for (i = rcu_num_lvls - 1; i > 0; i--)
+ rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ for (i = rcu_num_lvls - 2; i >= 0; i--)
rsp->levelspread[i] = CONFIG_RCU_FANOUT;
- rsp->levelspread[0] = rcu_fanout_leaf;
}
#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
static void __init rcu_init_levelspread(struct rcu_state *rsp)

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:04 AM11/16/13

to

From: Lai Jiangshan <la...@cn.fujitsu.com>

After commit #10f39bb1b2c1 (rcu: protect __rcu_read_unlock() against
scheduler-using irq handlers), it is no longer possible to enter
the main body of rcu_read_unlock_special() from an NMI, interrupt, or
softirq handler. In theory, this implies that the check for "in_irq()
|| in_serving_softirq()" must always fail, so that the check could be
removed entirely.

In practice, this commit wraps this condition with a WARN_ON_ONCE().
If this warning never triggers, then the condition will be removed
entirely.

[ paulmck: And one way of triggering the WARN_ON() is if a scheduling
clock interrupt occurs in an RCU read-side critical section, setting
RCU_READ_UNLOCK_NEED_QS, which is handled by rcu_read_unlock_special().
Updated this commit to return if only that bit was set. ]

Signed-off-by: Lai Jiangshan <la...@cn.fujitsu.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Merge with 14b8cc88f674 (rcu: Warn on allegedly impossible rcu_read_unlock_special() from irq)

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>
---

kernel/rcu/tree_plugin.h | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 59a3a17a0c57..29335faf96e7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -361,10 +361,14 @@ void rcu_read_unlock_special(struct task_struct *t)
special = t->rcu_read_unlock_special;
if (special & RCU_READ_UNLOCK_NEED_QS) {
rcu_preempt_qs(smp_processor_id());
+ if (!t->rcu_read_unlock_special) {
+ local_irq_restore(flags);
+ return;
+ }
}

- /* Hardware IRQ handlers cannot block. */
- if (in_irq() || in_serving_softirq()) {
+ /* Hardware IRQ handlers cannot block, complain if they get here. */
+ if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
local_irq_restore(flags);
return;

Paul E. McKenney

unread,

Nov 16, 2013, 12:30:04 AM11/16/13

to

From: "Paul E. McKenney" <pau...@linux.vnet.ibm.com>

Dave Jones got the following lockdep splat:

> ======================================================
> [ INFO: possible circular locking dependency detected ]
> 3.12.0-rc3+ #92 Not tainted
> -------------------------------------------------------
> trinity-child2/15191 is trying to acquire lock:
> (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50
>
> but task is already holding lock:
> (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> which lock already depends on the new lock.
>
>
> the existing dependency chain (in reverse order) is:
>
> -> #3 (&ctx->lock){-.-...}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0
> [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0
> [<ffffffff81732052>] __schedule+0x1d2/0xa20
> [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0
> [<ffffffff817352b6>] retint_kernel+0x26/0x30
> [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50
> [<ffffffff813f0504>] pty_write+0x54/0x60
> [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0
> [<ffffffff813e5838>] tty_write+0x158/0x2d0
> [<ffffffff811c4850>] vfs_write+0xc0/0x1f0
> [<ffffffff811c52cc>] SyS_write+0x4c/0xa0
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> -> #2 (&rq->lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80
> [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0
> [<ffffffff81054336>] do_fork+0x126/0x460
> [<ffffffff81054696>] kernel_thread+0x26/0x30
> [<ffffffff8171ff93>] rest_init+0x23/0x140
> [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403
> [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4
>
> -> #1 (&p->pi_lock){-.-.-.}:
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff810979d1>] try_to_wake_up+0x31/0x350
> [<ffffffff81097d62>] default_wake_function+0x12/0x20
> [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40
> [<ffffffff8108ea38>] __wake_up_common+0x58/0x90
> [<ffffffff8108ff59>] __wake_up+0x39/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111b8d>] call_rcu+0x1d/0x20
> [<ffffffff81093697>] cpu_attach_domain+0x287/0x360
> [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0
> [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a
> [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202
> [<ffffffff817200be>] kernel_init+0xe/0x190
> [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0
>
> -> #0 (&rdp->nocb_wq){......}:
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2
>
> other info that might help us debug this:
>
> Chain exists of:
> &rdp->nocb_wq --> &rq->lock --> &ctx->lock
>
> Possible unsafe locking scenario:
>
> CPU0 CPU1
> ---- ----
> lock(&ctx->lock);
> lock(&rq->lock);
> lock(&ctx->lock);
> lock(&rdp->nocb_wq);
>
> *** DEADLOCK ***
>
> 1 lock held by trinity-child2/15191:
> #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230
>
> stack backtrace:
> CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92
> ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40
> ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0
> ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0
> Call Trace:
> [<ffffffff8172a363>] dump_stack+0x4e/0x82
> [<ffffffff81726741>] print_circular_bug+0x200/0x20f
> [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0
> [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60
> [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80
> [<ffffffff810cc243>] lock_acquire+0x93/0x200
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90
> [<ffffffff8108ff43>] ? __wake_up+0x23/0x50
> [<ffffffff8108ff43>] __wake_up+0x23/0x50
> [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0
> [<ffffffff81111450>] __call_rcu+0x140/0x820
> [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50
> [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30
> [<ffffffff81149abf>] put_ctx+0x4f/0x70
> [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230
> [<ffffffff81056b8d>] do_exit+0x30d/0xcc0
> [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0
> [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10
> [<ffffffff8105893c>] do_group_exit+0x4c/0xc0
> [<ffffffff810589c4>] SyS_exit_group+0x14/0x20
> [<ffffffff8173d4e4>] tracesys+0xdd/0xe2

The underlying problem is that perf is invoking call_rcu() with the
scheduler locks held, but in NOCB mode, call_rcu() will with high
probability invoke the scheduler -- which just might want to use its
locks. The reason that call_rcu() needs to invoke the scheduler is
to wake up the corresponding rcuo callback-offload kthread, which
does the job of starting up a grace period and invoking the callbacks
afterwards.

One solution (championed on a related problem by Lai Jiangshan) is to
simply defer the wakeup to some point where scheduler locks are no longer
held. Since we don't want to unnecessarily incur the cost of such
deferral, the task before us is threefold:

1. Determine when it is likely that a relevant scheduler lock is held.

2. Defer the wakeup in such cases.

3. Ensure that all deferred wakeups eventually happen, preferably
sooner rather than later.

We use irqs_disabled_flags() as a proxy for relevant scheduler locks
being held. This works because the relevant locks are always acquired
with interrupts disabled. We may defer more often than needed, but that
is at least safe.

The wakeup deferral is tracked via a new field in the per-CPU and
per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup.

This flag is checked by the RCU core processing. The __rcu_pending()
function now checks this flag, which causes rcu_check_callbacks()
to initiate RCU core processing at each scheduling-clock interrupt
where this flag is set. Of course this is not sufficient because
scheduling-clock interrupts are often turned off (the things we used to
be able to count on!). So the flags are also checked on entry to any
state that RCU considers to be idle, which includes both NO_HZ_IDLE idle
state and NO_HZ_FULL user-mode-execution state.

This approach should allow call_rcu() to be invoked regardless of what
locks you might be holding, the key word being "should".

Reported-by: Dave Jones <da...@redhat.com>

Signed-off-by: Paul E. McKenney <pau...@linux.vnet.ibm.com>

Cc: Peter Zijlstra <pet...@infradead.org>
---
Documentation/RCU/trace.txt | 20 ++++++++++-------
kernel/rcu/tree.c | 24 ++++++++++++++++----
kernel/rcu/tree.h | 9 ++++++--
kernel/rcu/tree_plugin.h | 55 ++++++++++++++++++++++++++++++++++++---------
kernel/rcu/tree_trace.c | 3 ++-
5 files changed, 86 insertions(+), 25 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index f3778f8952da..b8c3c813ea57 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -396,14 +396,14 @@ o Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node

The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:

- 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903
- 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113
- 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889
- 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469
- 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042
- 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422
- 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699
- 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147
+ 0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
+ 1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
+ 2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
+ 3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
+ 4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
+ 5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
+ 6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
+ 7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0

The fields are as follows:

@@ -432,6 +432,10 @@ o "gpc" is the number of times that an old grace period had
o "gps" is the number of times that a new grace period had started,
but this CPU was not yet aware of it.

+o "ndw" is the number of times that a wakeup of an rcuo
+ callback-offload kthread had to be deferred in order to avoid
+ deadlock.
+
o "nn" is the number of times that this CPU needed nothing.

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index fa496ea405a1..c01213b19dee 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
bool user)
{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
}
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ do_nocb_deferred_wakeup(rdp);
+ }
rcu_prepare_for_idle(smp_processor_id());
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic_inc(); /* See above. */
@@ -1915,13 +1922,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
* Adopt the RCU callbacks from the specified rcu_state structure's
* orphanage. The caller must hold the ->orphan_lock.
*/
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
{
int i;
struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);

/* No-CBs CPUs are handled specially. */
- if (rcu_nocb_adopt_orphan_cbs(rsp, rdp))
+ if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
return;

/* Do the accounting first. */
@@ -2000,7 +2007,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)

/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp);
+ rcu_adopt_orphan_cbs(rsp, flags);

/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
mask = rdp->grpmask; /* rnp->grplo is constant. */
@@ -2317,6 +2324,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
/* If there are callbacks ready, invoke them. */
if (cpu_has_callbacks_ready_to_invoke(rdp))
invoke_rcu_callbacks(rsp, rdp);
+
+ /* Do any needed deferred wakeups of rcuo kthreads. */
+ do_nocb_deferred_wakeup(rdp);
}

/*
@@ -2451,7 +2461,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),

if (cpu != -1)
rdp = per_cpu_ptr(rsp->rda, cpu);
- offline = !__call_rcu_nocb(rdp, head, lazy);
+ offline = !__call_rcu_nocb(rdp, head, lazy, flags);
WARN_ON_ONCE(offline);
/* _call_rcu() is illegal on offline CPU; leak the callback. */
local_irq_restore(flags);
@@ -2804,6 +2814,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 1;
}

+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp)) {
+ rdp->n_rp_nocb_defer_wakeup++;
+ return 1;
+ }
+
/* nothing to do */
rdp->n_rp_need_nothing++;
return 0;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e34d8674a4e..a87adfc2916b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -317,6 +317,7 @@ struct rcu_data {
unsigned long n_rp_cpu_needs_gp;
unsigned long n_rp_gp_completed;
unsigned long n_rp_gp_started;
+ unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;

/* 6) _rcu_barrier() and OOM callbacks. */
@@ -335,6 +336,7 @@ struct rcu_data {
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

/* 8) RCU CPU stall data. */
@@ -550,9 +552,12 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy);
+ bool lazy, unsigned long flags);
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp);
+ struct rcu_data *rdp,
+ unsigned long flags);
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
static void rcu_kick_nohz_cpu(int cpu);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c2aab07411dc..59a3a17a0c57 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2104,7 +2104,8 @@ bool rcu_is_nocb_cpu(int cpu)
static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
struct rcu_head *rhp,
struct rcu_head **rhtp,
- int rhcount, int rhcount_lazy)
+ int rhcount, int rhcount_lazy,
+ unsigned long flags)
{
int len;
struct rcu_head **old_rhpp;
@@ -2125,9 +2126,16 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
}
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
- wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
+ if (!irqs_disabled_flags(flags)) {
+ wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rdp->nocb_defer_wakeup = true;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ TPS("WakeEmptyIsDeferred"));
+ }
rdp->qlen_last_fqs_check = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
wake_up_process(t); /* ... or if many callbacks queued. */
rdp->qlen_last_fqs_check = LONG_MAX / 2;
@@ -2148,12 +2156,12 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
* "rcuo" kthread can find it.
*/
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy)
+ bool lazy, unsigned long flags)
{

if (!rcu_is_nocb_cpu(rdp->cpu))
return 0;
- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+ __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
if (__is_kfree_rcu_offset((unsigned long)rhp->func))
trace_rcu_kfree_callback(rdp->rsp->name, rhp,
(unsigned long)rhp->func,
@@ -2171,7 +2179,8 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* not a no-CBs CPU.
*/
static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp)
+ struct rcu_data *rdp,
+ unsigned long flags)
{
long ql = rsp->qlen;
long qll = rsp->qlen_lazy;
@@ -2185,14 +2194,14 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
/* First, enqueue the donelist, if any. This preserves CB ordering. */
if (rsp->orphan_donelist != NULL) {
__call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
- rsp->orphan_donetail, ql, qll);
+ rsp->orphan_donetail, ql, qll, flags);
ql = qll = 0;
rsp->orphan_donelist = NULL;
rsp->orphan_donetail = &rsp->orphan_donelist;
}
if (rsp->orphan_nxtlist != NULL) {
__call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
- rsp->orphan_nxttail, ql, qll);
+ rsp->orphan_nxttail, ql, qll, flags);
ql = qll = 0;
rsp->orphan_nxtlist = NULL;
rsp->orphan_nxttail = &rsp->orphan_nxtlist;
@@ -2314,6 +2323,22 @@ static int rcu_nocb_kthread(void *arg)
return 0;
}

+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return ACCESS_ONCE(rdp->nocb_defer_wakeup);
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (!rcu_nocb_need_deferred_wakeup(rdp))
+ return;
+ ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
+ wake_up(&rdp->nocb_wq);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
+}
+
/* Initialize per-rcu_data variables for no-CBs CPUs. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
@@ -2369,13 +2394,14 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
}

static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy)
+ bool lazy, unsigned long flags)
{
return 0;
}

static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
- struct rcu_data *rdp)
+ struct rcu_data *rdp,
+ unsigned long flags)
{
return 0;
}
@@ -2384,6 +2410,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}

+static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+}
+
static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
}
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3596797b7e46..4def475336d4 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -364,9 +364,10 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
+ seq_printf(m, "gpc=%ld gps=%ld nn=%ld ndw%ld\n",
rdp->n_rp_gp_completed,
rdp->n_rp_gp_started,
+ rdp->n_rp_nocb_defer_wakeup,
rdp->n_rp_need_nothing);