1) get_online_cpus() must be allowed to be called recursively, so I added
get_online_cpus_nest for every task for new code.
This patch just allows get_online_cpus() to be called recursively,
but when it is not nested, get_online_cpus() will wait until
cpuhotplug finished, so the potential starvation is avoided.
And, the livelock of cpu_hotplug_begin() is avoided, so the comment
is removed.
2) This new code use PER_CPU counters, and this counters protected
by RCU. These counters acts like the reference counters of a modules.
(Actually, all these code is stolen from module.c: try_refcount_get()
is stolen from try_module_get(), put_online_cpus() from module_put()...)
After this patch applied, get_online_cpus() is very light and scale when
cpuhotplug is not running. It just disables preemption and increase
the cpu counter and then enables preemption.
3) Since we have try_refcount_get(), I add a new API try_get_online_cpus().
Signed-off-by: Lai Jiangshan <la...@cn.fujitsu.com>
---
include/linux/cpu.h | 2
include/linux/sched.h | 3 +
kernel/cpu.c | 131 ++++++++++++++++++++++++++++++++------------------
kernel/fork.c | 3 +
4 files changed, 94 insertions(+), 45 deletions(-)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863..a32809c 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -112,6 +112,7 @@ extern struct sysdev_class cpu_sysdev_class;
extern void get_online_cpus(void);
extern void put_online_cpus(void);
+extern int try_get_online_cpus(void);
#define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri)
#define register_hotcpu_notifier(nb) register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb)
@@ -134,6 +135,7 @@ static inline void cpu_hotplug_driver_unlock(void)
#define get_online_cpus() do { } while (0)
#define put_online_cpus() do { } while (0)
+#define try_get_online_cpus() (1)
#define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0)
/* These aren't inline functions due to a GCC bug. */
#define register_hotcpu_notifier(nb) ({ (void)(nb); 0; })
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46b6e5..0422ea3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1501,6 +1501,9 @@ struct task_struct {
unsigned long memsw_bytes; /* uncharged mem+swap usage */
} memcg_batch;
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ int get_online_cpus_nest;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bc1e3d5..ede02c6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,6 +14,7 @@
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
+#include <linux/percpu.h>
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -42,41 +43,82 @@ static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
-static struct {
- struct task_struct *active_writer;
- struct mutex lock; /* Synchronizes accesses to refcount, */
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- int refcount;
-} cpu_hotplug = {
- .active_writer = NULL,
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
- .refcount = 0,
-};
+DEFINE_MUTEX(cpu_hotplug_lock);
+struct task_struct *cpu_hotplug_task;
+DEFINE_PER_CPU(int, refcount);
+
+static int try_refcount_get(void)
+{
+ preempt_disable();
+
+ if (likely(!cpu_hotplug_task)) {
+ __get_cpu_var(refcount)++;
+ preempt_enable();
+ return 1;
+ }
+
+ preempt_enable();
+ return 0;
+}
+
+int try_get_online_cpus(void)
+{
+ if (cpu_hotplug_task == current)
+ return 1;
+
+ if (current->get_online_cpus_nest || try_refcount_get()) {
+ current->get_online_cpus_nest++;
+ return 1;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(try_get_online_cpus);
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (current->get_online_cpus_nest++)
+ return;
+
+ if (likely(try_refcount_get()))
return;
- mutex_lock(&cpu_hotplug.lock);
- cpu_hotplug.refcount++;
- mutex_unlock(&cpu_hotplug.lock);
+ mutex_lock(&cpu_hotplug_lock);
+ percpu_add(refcount, 1);
+ mutex_unlock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
+static unsigned int refcount_sum(void)
+{
+ unsigned int total = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ total += per_cpu(refcount, cpu);
+
+ return total;
+}
+
void put_online_cpus(void)
{
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug_task == current)
+ return;
+
+ if (WARN_ON(!current->get_online_cpus_nest))
return;
- mutex_lock(&cpu_hotplug.lock);
- if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
- wake_up_process(cpu_hotplug.active_writer);
- mutex_unlock(&cpu_hotplug.lock);
+ if (!--current->get_online_cpus_nest) {
+ preempt_disable();
+ __get_cpu_var(refcount)--;
+ if (cpu_hotplug_task)
+ wake_up_process(cpu_hotplug_task);
+ preempt_enable();
+ }
}
EXPORT_SYMBOL_GPL(put_online_cpus);
@@ -85,41 +127,40 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
* refcount goes to zero.
*
* Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
+ * will be blocked by the cpu_hotplug_lock
*/
static void cpu_hotplug_begin(void)
{
- cpu_hotplug.active_writer = current;
+ mutex_lock(&cpu_hotplug_lock);
+
+ /*
+ * Set cpu_hotplug_task. Wait until all running try_refcount_get()
+ * finished and all these try_refcount_get() behavior are seen.
+ */
+ cpu_hotplug_task = current;
+ synchronize_sched();
+ /* Wait for zero refcount */
for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- if (likely(!cpu_hotplug.refcount))
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!refcount_sum())
break;
- __set_current_state(TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cpu_hotplug.lock);
schedule();
}
+
+ __set_current_state(TASK_RUNNING);
}
static void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
+ /*
+ * Ensure try_refcount_get() sees the front befavior
+ * after it sees cpu_hotplug_task == NULL.
+ */
+ smp_mb();
+
+ cpu_hotplug_task = NULL;
+ mutex_unlock(&cpu_hotplug_lock);
}
#else /* #if CONFIG_HOTPLUG_CPU */
diff --git a/kernel/fork.c b/kernel/fork.c
index d67f1db..b162014 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1109,6 +1109,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.memcg = NULL;
#endif
p->stack_start = stack_start;
+#ifdef CONFIG_HOTPLUG_CPU
+ p->get_online_cpus_nest = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Well, iirc one of the goals of
cpu-hotplug: replace lock_cpu_hotplug() with get_online_cpus()
86ef5c9a8edd78e6bf92879f32329d89b2d55b5a
was avoiding the new members in task_struct. I leave this up to you
and Gautham.
Lai, I didn't read this patch carefully yet (and I can't apply it to
Linus's tree). But at first glance,
> void put_online_cpus(void)
> {
> ...
> + if (!--current->get_online_cpus_nest) {
> + preempt_disable();
> + __get_cpu_var(refcount)--;
> + if (cpu_hotplug_task)
> + wake_up_process(cpu_hotplug_task);
This looks unsafe. In theory nothing protects cpu_hotplug_task from
exiting if refcount_sum() becomes zero, this means wake_up_process()
can hit the freed/reused/unmapped task_struct. Probably cpu_hotplug_done()
needs another synhronize_sched() before return.
OTOH, I do not understand why the result of __get_cpu_var(refcount)
must be visible to refcount_sum() if we race with cpu_hotplug_begin(),
so it seems to me cpu_hotplug_begin() also needs synchronize_sched()
before refcount_sum().
Oleg.
because I tried to apply it without 1/2 ;)
> > void put_online_cpus(void)
> > {
> > ...
> > + if (!--current->get_online_cpus_nest) {
> > + preempt_disable();
> > + __get_cpu_var(refcount)--;
> > + if (cpu_hotplug_task)
> > + wake_up_process(cpu_hotplug_task);
>
> This looks unsafe. In theory nothing protects cpu_hotplug_task from
> exiting if refcount_sum() becomes zero, this means wake_up_process()
> can hit the freed/reused/unmapped task_struct. Probably cpu_hotplug_done()
> needs another synhronize_sched() before return.
Yes, I think this is true, at least in theory.
> OTOH, I do not understand why the result of __get_cpu_var(refcount)
> must be visible to refcount_sum() if we race with cpu_hotplug_begin(),
> so it seems to me cpu_hotplug_begin() also needs synchronize_sched()
> before refcount_sum().
No, I misread the unapplied patch, sorry for noise.
Old get_online_cpus() is read-preference, I think the goal of this ability
is allow get_online_cpus()/put_online_cpus() to be called nested.
But read-preference RWL may cause write side starvation, so I abandon this ability,
and use per-task counter for allowing get_online_cpus()/put_online_cpus()
to be called nested, I think this deal is absolutely worth.
>>
>>
>> Lai, I didn't read this patch carefully yet (and I can't apply it to
>> Linus's tree). But at first glance,
>
> because I tried to apply it without 1/2 ;)
>
>>> void put_online_cpus(void)
>>> {
>>> ...
>>> + if (!--current->get_online_cpus_nest) {
>>> + preempt_disable();
>>> + __get_cpu_var(refcount)--;
>>> + if (cpu_hotplug_task)
>>> + wake_up_process(cpu_hotplug_task);
>> This looks unsafe. In theory nothing protects cpu_hotplug_task from
>> exiting if refcount_sum() becomes zero, this means wake_up_process()
>> can hit the freed/reused/unmapped task_struct. Probably cpu_hotplug_done()
>> needs another synhronize_sched() before return.
>
> Yes, I think this is true, at least in theory.
preempt_disable() prevent cpu_hotplug_task from exiting.
Thanks, Lai
Sure, I understand why you added task_struct->get_online_cpus_nest.
> and use per-task counter for allowing get_online_cpus()/put_online_cpus()
> to be called nested, I think this deal is absolutely worth.
As I said, I am not going to argue. I can't justify this tradeoff.
> >>> void put_online_cpus(void)
> >>> {
> >>> ...
> >>> + if (!--current->get_online_cpus_nest) {
> >>> + preempt_disable();
> >>> + __get_cpu_var(refcount)--;
> >>> + if (cpu_hotplug_task)
> >>> + wake_up_process(cpu_hotplug_task);
> >> This looks unsafe. In theory nothing protects cpu_hotplug_task from
> >> exiting if refcount_sum() becomes zero, this means wake_up_process()
> >> can hit the freed/reused/unmapped task_struct. Probably cpu_hotplug_done()
> >> needs another synhronize_sched() before return.
> >
> > Yes, I think this is true, at least in theory.
>
> preempt_disable() prevent cpu_hotplug_task from exiting.
If the cpu_down() is the caller of cpu_hotplug_begin/done, then yes.
But unless I missed something, nothing protects from cpu_up() which
takes this lock too.
Just in case... I am not saying this is really possible in practice.
Oleg.
But, I must admit, I'd like to avoid adding the new member to task_struct.
What do you think about the code below?
I didn't even try to compile it, just to explain what I mean.
In short: we have the per-cpu fast counters, plus the slow counter
which is only used when cpu_hotplug_begin() is in progress.
Oleg.
static DEFINE_PER_CPU(long, cpuhp_fast_ctr);
static struct task_struct *cpuhp_writer;
static DEFINE_MUTEX(cpuhp_slow_lock)
static long cpuhp_slow_ctr;
static bool update_fast_ctr(int inc)
{
bool success = true;
preempt_disable();
if (likely(!cpuhp_writer))
__get_cpu_var(cpuhp_fast_ctr) += inc;
else if (cpuhp_writer != current)
success = false;
preempt_enable();
return success;
}
void get_online_cpus(void)
{
if (likely(update_fast_ctr(+1));
return;
mutex_lock(&cpuhp_slow_lock);
cpuhp_slow_ctr++;
mutex_unlock(&cpuhp_slow_lock);
}
void put_online_cpus(void)
{
if (likely(update_fast_ctr(-1));
return;
mutex_lock(&cpuhp_slow_lock);
if (!--cpuhp_slow_ctr && cpuhp_writer)
wake_up_process(cpuhp_writer);
mutex_unlock(&cpuhp_slow_lock);
}
static void clear_fast_ctr(void)
{
long total = 0;
int cpu;
for_each_possible_cpu(cpu) {
total += per_cpu(cpuhp_fast_ctr, cpu);
per_cpu(cpuhp_fast_ctr, cpu) = 0;
}
return total;
}
static void cpu_hotplug_begin(void)
{
cpuhp_writer = current;
synchronize_sched();
/* Nobody except us can use can use cpuhp_fast_ctr */
mutex_lock(&cpuhp_slow_lock);
cpuhp_slow_ctr += clear_fast_ctr();
while (cpuhp_slow_ctr) {
__set_current_state(TASK_UNINTERRUPTIBLE);
mutex_unlock(&&cpuhp_slow_lock);
schedule();
mutex_lock(&cpuhp_slow_lock);
}
}
static void cpu_hotplug_done(void)
{
cpuhp_writer = NULL;
mutex_unlock(&cpuhp_slow_lock);
get_online_cpus() in your code is still read-preference.
I wish we quit this ability of get_online_cpus().
Lai.
Why?
Because read-preference RWL will cause write site starvation.
A user run the following code will cause cpuhotplug starvation.
(100 processes run sched_setaffinity().)
Lai
#define _GNU_SOURCE
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdlib.h>
#define NCPU 4
#define NPROCESS 100
cpu_set_t set;
pid_t target;
void stress_test(void)
{
int cpu;
srand((int)target);
for (;;) {
cpu = rand() % NCPU;
CPU_SET(cpu, &set);
sched_setaffinity(target, sizeof(set), &set);
CPU_CLR(cpu, &set);
}
}
int main(int argc, char *argv[])
{
pid_t ret;
int i;
target = getpid();
for (i = 1; i < NPROCESS; i++) {
ret = fork();
if (ret < 0)
break;
else if (ret)
target = ret;
else
stress_test();
}
stress_test();
Sure, but why is that a problem? Hotplug should be a rare event, who
cares if it takes a while to come through.
Yes,
> I wish we quit this ability of get_online_cpus().
Heh. Since I never read the changelogs, I didn't even notice this was
one of the goals of your patch. I thought this is just the side-effect.
Yes, if we want to block the new readers, I don't see how it is possible
to avoid the counter in task_struct.
I can't judge whether this new member worth the trouble. Once again, I am
not arguing, just I don't know. And I think your patch is correct (apart
from pure theoretical race with cpu_hotplug_done afaics).
Oleg.
> who cares if it takes a while to come through.
It is totally starvation, hotplug can not complete.
I think it is a kind of DOS attack.