
[PATCH 05/12] sched: Use TASK_WAKING for fork wakups


Peter Zijlstra
Dec 16, 2009, 12:10:04 PM
attachment: foo11.patch

Peter Zijlstra
Dec 16, 2009, 12:10:04 PM
attachment: foo7.patch

Peter Zijlstra
Dec 16, 2009, 12:10:03 PM
attachment: foo1.patch

Peter Zijlstra
Dec 16, 2009, 12:10:04 PM
attachment: foo4.patch

Peter Zijlstra
Dec 16, 2009, 12:10:03 PM
attachment: foo10.patch

Peter Zijlstra
Dec 16, 2009, 12:10:03 PM
attachment: foo13.patch

Peter Zijlstra
Dec 16, 2009, 12:10:04 PM
attachment: foo8.patch

Peter Zijlstra
Dec 16, 2009, 12:10:04 PM
attachment: foo12.patch

Peter Zijlstra
Dec 16, 2009, 1:00:01 PM

On Wed, 2009-12-16 at 18:04 +0100, Peter Zijlstra wrote:
> plain text document attachment (foo13.patch)

still need to fix quilt...

From: Xiaotian Feng <df...@redhat.com>

> Sachin found cpu hotplug test failures on powerpc, which made the
> kernel hang on his POWER box.
>
> The problem is that we fail to re-activate a cpu when a hot-unplug
> fails. Fix this by moving the de-activation into _cpu_down after doing
> the initial checks.
>
> Remove the synchronize_sched() calls and rely on those implied by
> rebuilding the sched domains using the new mask.
>
> Reported-by: Sachin Sant <sac...@in.ibm.com>
> Signed-off-by: Xiaotian Feng <df...@redhat.com>
> Tested-by: Sachin Sant <sac...@in.ibm.com>
> Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>


tip-bot for Peter Zijlstra
Dec 16, 2009, 1:40:03 PM

Commit-ID: 5da9a0fb673a0ea0a093862f95f6b89b3390c31e
Gitweb: http://git.kernel.org/tip/5da9a0fb673a0ea0a093862f95f6b89b3390c31e
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:38 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:57 +0100

sched: Fix select_task_rq() vs hotplug issues

Since select_task_rq() is now responsible for guaranteeing that the
chosen cpu is in ->cpus_allowed and cpu_active_mask, we need to
verify this.

select_task_rq_rt() can blindly return
smp_processor_id()/task_cpu() without checking those masks, and
select_task_rq_fair() can do the same in the rare case that all
SD_flags are disabled.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---
kernel/sched.c | 75 +++++++++++++++++++++++++++++--------------------------
1 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 63e55ac..cc40bda 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2317,6 +2317,43 @@ void task_oncpu_function_call(struct task_struct *p,
}

#ifdef CONFIG_SMP
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+ int dest_cpu;
+ const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ return dest_cpu;
+
+ /* Any allowed, online CPU? */
+ dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+ if (dest_cpu < nr_cpu_ids)
+ return dest_cpu;
+
+ /* No more Mr. Nice Guy. */
+ if (dest_cpu >= nr_cpu_ids) {
+ rcu_read_lock();
+ cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+ rcu_read_unlock();
+ dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
+ }
+
+ return dest_cpu;
+}
+
/*
* Called from:
*
@@ -2343,14 +2380,8 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
* not worry about this generic constraint ]
*/
if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
- !cpu_active(cpu))) {
-
- cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
- /*
- * XXX: race against hot-plug modifying cpu_active_mask
- */
- BUG_ON(cpu >= nr_cpu_ids);
- }
+ !cpu_active(cpu)))
+ cpu = select_fallback_rq(task_cpu(p), p);

return cpu;
}
@@ -7319,36 +7350,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
int dest_cpu;
- const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));

again:
- /* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
- goto move;
-
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- goto move;
-
- /* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
- dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
-
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- pr_info("process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, dead_cpu);
- }
- }
+ dest_cpu = select_fallback_rq(dead_cpu, p);

-move:
/* It can have affinity changed while we were choosing. */
if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
goto again;
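
The fallback logic that select_fallback_rq() consolidates above is a three-step
ladder: prefer an allowed, active CPU in the task's node, then any allowed
active CPU, then give up on affinity. Below is a small standalone C sketch of
that ladder for readers skimming the diff; it uses plain bool arrays instead of
cpumasks, a made-up node_of() helper, and simply drops affinity in the last
step rather than consulting cpusets, so it is an illustration only, not kernel
code.

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

/* Toy topology: CPUs 0-3 are node 0, CPUs 4-7 are node 1. */
static int node_of(int cpu)
{
	return cpu / 4;
}

static int fallback_cpu(int cpu, const bool *allowed, const bool *active)
{
	int dest;

	/* 1) Look for an allowed, online CPU in the same node. */
	for (dest = 0; dest < NR_CPUS; dest++)
		if (node_of(dest) == node_of(cpu) && allowed[dest] && active[dest])
			return dest;

	/* 2) Any allowed, online CPU? */
	for (dest = 0; dest < NR_CPUS; dest++)
		if (allowed[dest] && active[dest])
			return dest;

	/*
	 * 3) No more Mr. Nice Guy: ignore affinity (the kernel instead
	 *    relaxes it via cpuset_cpus_allowed_locked()).
	 */
	for (dest = 0; dest < NR_CPUS; dest++)
		if (active[dest])
			return dest;

	return -1;	/* nothing active at all */
}

int main(void)
{
	bool active[NR_CPUS]  = { 1, 0, 1, 1, 1, 1, 1, 1 };	/* CPU 1 going down */
	bool allowed[NR_CPUS] = { 0, 1, 0, 0, 0, 0, 0, 0 };	/* task bound to CPU 1 */

	printf("fallback CPU: %d\n", fallback_cpu(1, allowed, active));
	return 0;
}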

tip-bot for Xiaotian Feng
Dec 16, 2009, 1:40:02 PM

Commit-ID: 9ee349ad6d326df3633d43f54202427295999c47
Gitweb: http://git.kernel.org/tip/9ee349ad6d326df3633d43f54202427295999c47
Author: Xiaotian Feng <df...@redhat.com>
AuthorDate: Wed, 16 Dec 2009 18:04:32 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:53 +0100

sched: Fix set_cpu_active() in cpu_down()

Sachin found cpu hotplug test failures on powerpc, which made
the kernel hang on his POWER box.

The problem is that we fail to re-activate a cpu when a
hot-unplug fails. Fix this by moving the de-activation into
_cpu_down after doing the initial checks.

Remove the synchronize_sched() calls and rely on those implied
by rebuilding the sched domains using the new mask.

Reported-by: Sachin Sant <sac...@in.ibm.com>
Signed-off-by: Xiaotian Feng <df...@redhat.com>
Tested-by: Sachin Sant <sac...@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/cpu.c | 24 +++---------------------
1 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 291ac58..1c8ddd6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -ENOMEM;

cpu_hotplug_begin();
+ set_cpu_active(cpu, false);
err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
hcpu, -1, &nr_calls);
if (err == NOTIFY_BAD) {
@@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu)
goto out;
}

- set_cpu_active(cpu, false);
-
- /*
- * Make sure the all cpus did the reschedule and are not
- * using stale version of the cpu_active_mask.
- * This is not strictly necessary becuase stop_machine()
- * that we run down the line already provides the required
- * synchronization. But it's really a side effect and we do not
- * want to depend on the innards of the stop_machine here.
- */
- synchronize_sched();
-
err = _cpu_down(cpu, 0);

out:
@@ -382,19 +371,12 @@ int disable_nonboot_cpus(void)
return error;
cpu_maps_update_begin();
first_cpu = cpumask_first(cpu_online_mask);
- /* We take down all of the non-boot CPUs in one shot to avoid races
+ /*
+ * We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);

- for_each_online_cpu(cpu) {
- if (cpu == first_cpu)
- continue;
- set_cpu_active(cpu, false);
- }
-
- synchronize_sched();
-
printk("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
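
Reduced to its principle, the fix is about where the de-activation happens:
doing it inside _cpu_down() means the code that sees the failure can also put
the active bit back. The sketch below is only a toy of that shape; every name
in it is invented, and in the real kernel the rollback runs through the hotplug
notifier machinery rather than an inline assignment.

#include <stdio.h>
#include <stdbool.h>

static bool cpu1_active = true;

/* Stand-in for the CPU_DOWN_PREPARE notifier chain; pretend it refuses. */
static int down_prepare(void)
{
	return -1;
}

static int toy_cpu_down(int cpu)
{
	int err;

	cpu1_active = false;		/* like set_cpu_active(cpu, false) */
	err = down_prepare();
	if (err) {
		cpu1_active = true;	/* failed unplug: re-activate */
		printf("cpu%d: down refused, re-activated\n", cpu);
		return err;
	}
	/* ... actually take the CPU down ... */
	return 0;
}

int main(void)
{
	toy_cpu_down(1);
	printf("cpu1 active after failed unplug: %s\n",
	       cpu1_active ? "yes" : "no");
	return 0;
}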

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:40:03 PM

Commit-ID: e4f4288842ee12747e10c354d72be7d424c0b627
Gitweb: http://git.kernel.org/tip/e4f4288842ee12747e10c354d72be7d424c0b627
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:34 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:55 +0100

sched: Select_task_rq_fair() must honour SD_LOAD_BALANCE

We should skip !SD_LOAD_BALANCE domains.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
CC: sta...@kernel.org
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/sched_fair.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5bedf6e..ec1d271 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1429,6 +1429,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
}

for_each_domain(cpu, tmp) {
+ if (!(tmp->flags & SD_LOAD_BALANCE))
+ continue;
+
/*
* If power savings logic is enabled for a domain, see if we
* are not overloaded, if so, don't balance wider.

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:40:04 PM

Commit-ID: e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0
Gitweb: http://git.kernel.org/tip/e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:33 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:54 +0100

sched: Fix task_hot() test order

Make sure not to access sched_fair fields before verifying it is
indeed a sched_fair task.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
CC: sta...@kernel.org
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/sched.c | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 9c30858..1d8ca25 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2046,6 +2046,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;

+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -2054,9 +2057,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
&p->se == cfs_rq_of(&p->se)->last))
return 1;

- if (p->sched_class != &fair_sched_class)
- return 0;
-
if (sysctl_sched_migration_cost == -1)
return 1;
if (sysctl_sched_migration_cost == 0)
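
The bug pattern here is reading class-specific state before checking the
class. A tiny standalone C illustration of the corrected ordering follows;
the types are invented and have nothing to do with the actual sched_entity
layout.

#include <stdio.h>

enum toy_class { TOY_FAIR, TOY_RT };

struct toy_task {
	enum toy_class cls;
	int is_buddy;		/* only meaningful for TOY_FAIR tasks */
};

static int toy_task_hot(const struct toy_task *p)
{
	if (p->cls != TOY_FAIR)	/* check the class first ... */
		return 0;

	if (p->is_buddy)	/* ... only then touch fair-only state */
		return 1;

	return 0;
}

int main(void)
{
	/* An RT task whose fair-only field holds garbage. */
	struct toy_task rt_task = { .cls = TOY_RT, .is_buddy = 1 };

	printf("rt task considered cache hot? %d\n", toy_task_hot(&rt_task));
	return 0;
}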

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:40:03 PM

Commit-ID: 06b83b5fbea273672822b6ee93e16781046553ec
Gitweb: http://git.kernel.org/tip/06b83b5fbea273672822b6ee93e16781046553ec
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:35 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:55 +0100

sched: Use TASK_WAKING for fork wakups

For later convenience use TASK_WAKING for fresh tasks.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/sched.c | 18 +++++++++---------
1 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 1d8ca25..1672823 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2540,14 +2540,6 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
-
- /*
- * We mark the process as running here, but have not actually
- * inserted it onto the runqueue yet. This guarantees that
- * nobody will actually run it, and a signal or other external
- * event cannot wake it up and insert it on the runqueue either.
- */
- p->state = TASK_RUNNING;
}

/*
@@ -2558,6 +2550,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
int cpu = get_cpu();

__sched_fork(p);
+ /*
+ * We mark the process as waking here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_WAKING;

/*
* Revert to default priority/policy on fork if requested.
@@ -2626,7 +2624,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
struct rq *rq;

rq = task_rq_lock(p, &flags);
- BUG_ON(p->state != TASK_RUNNING);
+ BUG_ON(p->state != TASK_WAKING);
+ p->state = TASK_RUNNING;
update_rq_clock(rq);
activate_task(rq, p, 0);
trace_sched_wakeup_new(rq, p, 1);
@@ -6984,6 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);

__sched_fork(idle);
+ idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();

cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
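
The state hand-off this patch sets up can be modelled in a few lines:
sched_fork() parks the fresh task in the waking state so nothing can run it or
wake it prematurely, and wake_up_new_task() asserts that state before making
the task runnable. The sketch below is a toy model with invented names, not
kernel code.

#include <assert.h>
#include <stdio.h>

enum toy_state { TOY_RUNNING, TOY_WAKING };

struct toy_task {
	enum toy_state state;
};

static void toy_sched_fork(struct toy_task *p)
{
	p->state = TOY_WAKING;		/* fresh task: not yet runnable */
}

static void toy_wake_up_new_task(struct toy_task *p)
{
	/* mirrors BUG_ON(p->state != TASK_WAKING) in the hunk above */
	assert(p->state == TOY_WAKING);
	p->state = TOY_RUNNING;		/* now it may be enqueued and run */
}

int main(void)
{
	struct toy_task child;

	toy_sched_fork(&child);
	toy_wake_up_new_task(&child);
	printf("child is %s\n",
	       child.state == TOY_RUNNING ? "running" : "waking");
	return 0;
}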

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:40:03 PM

Commit-ID: 933b0618d8b2a59c7a0742e43836544e02f1e9bd
Gitweb: http://git.kernel.org/tip/933b0618d8b2a59c7a0742e43836544e02f1e9bd
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:31 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:53 +0100

sched: Mark boot-cpu active before smp_init()

A UP machine has 1 active cpu; not having the boot-cpu in the
active map when starting the scheduler confuses things.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

init/main.c | 7 +------
1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/init/main.c b/init/main.c
index c3db4a9..dac44a9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -369,12 +369,6 @@ static void __init smp_init(void)
{
unsigned int cpu;

- /*
- * Set up the current CPU as possible to migrate to.
- * The other ones will be done by cpu_up/cpu_down()
- */
- set_cpu_active(smp_processor_id(), true);
-
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
@@ -486,6 +480,7 @@ static void __init boot_cpu_init(void)
int cpu = smp_processor_id();
/* Mark the boot cpu "present", "online" etc for SMP and UP case */
set_cpu_online(cpu, true);
+ set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:50:02 PM

Commit-ID: e2912009fb7b715728311b0d8fe327a1432b3f79
Gitweb: http://git.kernel.org/tip/e2912009fb7b715728311b0d8fe327a1432b3f79
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:36 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:56 +0100

sched: Ensure set_task_cpu() is never called on blocked tasks

In order to clean up the set_task_cpu() rq dependencies we need
to ensure it is never called on blocked tasks because such usage
does not pair with consistent rq->lock usage.

This puts the migration burden on ttwu().

Furthermore we need to close a race against changing
->cpus_allowed, since select_task_rq() runs with only preemption
disabled.

For sched_fork() this is safe because the child isn't in the
tasklist yet; for wakeup we fix this by synchronizing
set_cpus_allowed_ptr() against TASK_WAKING, which leaves
sched_exec as the remaining problem.

This also closes a hole in (6ad4c1888 sched: Fix balance vs
hotplug race) where ->select_task_rq() doesn't validate the
result against the sched_domain/root_domain.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 1672823..33d7965 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
/* Must have done schedule() in kthread() before we set_task_cpu */
if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
WARN_ON(1);
return;
}

- raw_spin_lock_irqsave(&rq->lock, flags);
- update_rq_clock(rq);
- set_task_cpu(p, cpu);
p->cpus_allowed = cpumask_of_cpu(cpu);
p->rt.nr_cpus_allowed = 1;
p->flags |= PF_THREAD_BOUND;
- raw_spin_unlock_irqrestore(&rq->lock, flags);
}
EXPORT_SYMBOL(kthread_bind);

@@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
struct cfs_rq *old_cfsrq = task_cfs_rq(p),
*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

+#ifdef CONFIG_SCHED_DEBUG
+ /*
+ * We should never call set_task_cpu() on a blocked task,
+ * ttwu() will sort out the placement.
+ */
+ WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING);
+#endif
+
trace_sched_migrate_task(p, new_cpu);

if (old_cpu != new_cpu) {
@@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)

/*
* If the task is not on a runqueue (and not running), then
- * it is sufficient to simply update the task's cpu field.
+ * the next wake-up will properly place the task.
*/
- if (!p->se.on_rq && !task_running(rq, p)) {
- update_rq_clock(rq);
- set_task_cpu(p, dest_cpu);
+ if (!p->se.on_rq && !task_running(rq, p))
return 0;
- }

init_completion(&req->done);
req->task = p;
@@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p,
}

#ifdef CONFIG_SMP
+/*
+ * Called from:
+ *
+ * - fork, @p is stable because it isn't on the tasklist yet
+ *
+ * - exec, @p is unstable XXX
+ *
+ * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
+ * we should be good.
+ */
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+
+ /*
+ * In order not to call set_task_cpu() on a blocking task we need
+ * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * cpu.
+ *
+ * Since this is common to all placement strategies, this lives here.
+ *
+ * [ this allows ->select_task() to simply return task_cpu(p) and
+ * not worry about this generic constraint ]
+ */
+ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+ !cpu_active(cpu))) {
+
+ cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+ /*
+ * XXX: race against hot-plug modifying cpu_active_mask
+ */
+ BUG_ON(cpu >= nr_cpu_ids);
+ }
+
+ return cpu;
}
#endif

@@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
struct rq *rq;
int ret = 0;

+ /*
+ * Since we rely on wake-ups to migrate sleeping tasks, don't change
+ * the ->cpus_allowed mask from under waking tasks, which would be
+ * possible when we change rq->lock in ttwu(), so synchronize against
+ * TASK_WAKING to avoid that.
+ */
+again:
+ while (p->state == TASK_WAKING)
+ cpu_relax();
+
rq = task_rq_lock(p, &flags);
+
+ if (p->state == TASK_WAKING) {
+ task_rq_unlock(rq, &flags);
+ goto again;
+ }
+
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
@@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
struct rq *rq_dest, *rq_src;
- int ret = 0, on_rq;
+ int ret = 0;

if (unlikely(!cpu_active(dest_cpu)))
return ret;
@@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
goto fail;

- on_rq = p->se.on_rq;
- if (on_rq)
+ /*
+ * If we're not on a rq, the next wake-up will ensure we're
+ * placed properly.
+ */
+ if (p->se.on_rq) {
deactivate_task(rq_src, p, 0);
-
- set_task_cpu(p, dest_cpu);
- if (on_rq) {
+ set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);

tip-bot for Peter Zijlstra
Dec 16, 2009, 1:50:02 PM

Commit-ID: 88ec22d3edb72b261f8628226cd543589a6d5e1b
Gitweb: http://git.kernel.org/tip/88ec22d3edb72b261f8628226cd543589a6d5e1b
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Wed, 16 Dec 2009 18:04:41 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Wed, 16 Dec 2009 19:01:58 +0100

sched: Remove the cfs_rq dependency from set_task_cpu()

In order to remove the cfs_rq dependency from set_task_cpu() we
need to ensure the task is cfs_rq invariant for all callsites.

The simple approach is to subtract cfs_rq->min_vruntime from
se->vruntime on dequeue, and add cfs_rq->min_vruntime on
enqueue.

However, this has the downside of breaking FAIR_SLEEPERS since
we lose the old vruntime, as we only maintain the relative
position.

To solve this, we observe that we only migrate runnable tasks;
we do this using deactivate_task(.sleep=0) and
activate_task(.wakeup=0), so we can restrict the min_vruntime
invariance to that state.

The only other case is wakeup balancing, since we want to
maintain the old vruntime we cannot make it relative on dequeue,
but since we don't migrate inactive tasks, we can do so right
before we activate it again.

This is where we need the new pre-wakeup hook: we need to call
it while still holding the old rq->lock. We could fold it into
->select_task_rq(), but since that has multiple callsites and
would obfuscate the locking requirements, that seems like a
fudge.

This leaves the fork() case: simply make sure that ->task_fork()
leaves ->vruntime in a relative state.

This covers all cases where set_task_cpu() gets called, and
ensures it sees a relative vruntime.

Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

include/linux/sched.h | 2 +-
kernel/sched.c | 6 +----
kernel/sched_fair.c | 50 +++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c9fa1c..973b2b8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1116,7 +1116,7 @@ struct sched_class {
struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*moved_group) (struct task_struct *p);
+ void (*moved_group) (struct task_struct *p, int on_rq);
#endif
};

diff --git a/kernel/sched.c b/kernel/sched.c
index 6c571bd..f92ce63 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
int old_cpu = task_cpu(p);
- struct cfs_rq *old_cfsrq = task_cfs_rq(p),
- *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);

#ifdef CONFIG_SCHED_DEBUG
/*
@@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1, 1, NULL, 0);
}
- p->se.vruntime -= old_cfsrq->min_vruntime -
- new_cfsrq->min_vruntime;

__set_task_cpu(p, new_cpu);
}
@@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk)

#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk);
+ tsk->sched_class->moved_group(tsk, on_rq);
#endif

if (unlikely(running))
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ec1d271..42ac3c9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
}
@@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
se->vruntime = vruntime;
}

+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_MIGRATE 2
+
static void
-enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
+ * Update the normalized vruntime before updating min_vruntime
+ * through callig update_curr().
+ */
+ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
+ se->vruntime += cfs_rq->min_vruntime;
+
+ /*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
account_entity_enqueue(cfs_rq, se);

- if (wakeup) {
+ if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
@@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+
+ /*
+ * Normalize the entity after updating the min_vruntime because the
+ * update can refer to the ->curr item and we need to reflect this
+ * movement in our normalized position.
+ */
+ if (!sleep)
+ se->vruntime -= cfs_rq->min_vruntime;
}

/*
@@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int flags = 0;
+
+ if (wakeup)
+ flags |= ENQUEUE_WAKEUP;
+
+ if (p->state == TASK_WAKING)
+ flags |= ENQUEUE_MIGRATE;

for_each_sched_entity(se) {
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, wakeup);
- wakeup = 1;
+ enqueue_entity(cfs_rq, se, flags);
+ flags = ENQUEUE_WAKEUP;
}

hrtick_update(rq);
@@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)

#ifdef CONFIG_SMP

+static void task_waking_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ se->vruntime -= cfs_rq->min_vruntime;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
@@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p)
resched_task(rq->curr);
}

+ se->vruntime -= cfs_rq->min_vruntime;
+
raw_spin_unlock_irqrestore(&rq->lock, flags);
}

@@ -2031,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq)
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p)
+static void moved_group_fair(struct task_struct *p, int on_rq)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);

update_curr(cfs_rq);
- place_entity(cfs_rq, &p->se, 1);
+ if (!on_rq)
+ place_entity(cfs_rq, &p->se, 1);
}
#endif

@@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = {
.move_one_task = move_one_task_fair,
.rq_online = rq_online_fair,
.rq_offline = rq_offline_fair,
+
+ .task_waking = task_waking_fair,
#endif

.set_curr_task = set_curr_task_fair,
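
The normalization the changelog describes boils down to: make vruntime
relative on the dequeue side of a migration, make it absolute again on
enqueue, and only the task's lag against min_vruntime crosses over. The
standalone sketch below uses doubles and invented struct names purely to show
that the lag is what survives; the kernel of course works on u64 nanoseconds
inside cfs_rq.

#include <stdio.h>

struct toy_cfs_rq {
	double min_vruntime;
};

struct toy_se {
	double vruntime;
};

static void dequeue_for_migration(struct toy_cfs_rq *rq, struct toy_se *se)
{
	se->vruntime -= rq->min_vruntime;	/* now relative: just the lag */
}

static void enqueue_after_migration(struct toy_cfs_rq *rq, struct toy_se *se)
{
	se->vruntime += rq->min_vruntime;	/* absolute on the new queue */
}

int main(void)
{
	struct toy_cfs_rq src = { .min_vruntime = 1000.0 };
	struct toy_cfs_rq dst = { .min_vruntime = 40.0 };
	struct toy_se se = { .vruntime = 1003.5 };	/* 3.5 ahead on src */

	dequeue_for_migration(&src, &se);
	enqueue_after_migration(&dst, &se);

	/* Still 3.5 ahead, now measured against dst's min_vruntime. */
	printf("vruntime on dst = %.1f (lag %.1f)\n",
	       se.vruntime, se.vruntime - dst.min_vruntime);
	return 0;
}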

tip-bot for Ingo Molnar
Dec 17, 2009, 12:20:02 AM

Commit-ID: 416eb39556a03d1c7e52b0791e9052ccd71db241
Gitweb: http://git.kernel.org/tip/416eb39556a03d1c7e52b0791e9052ccd71db241
Author: Ingo Molnar <mi...@elte.hu>
AuthorDate: Thu, 17 Dec 2009 06:05:49 +0100
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Thu, 17 Dec 2009 06:05:49 +0100

sched: Make warning less noisy

Cc: Peter Zijlstra <a.p.zi...@chello.nl>
Cc: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <200912161705...@chello.nl>
Signed-off-by: Ingo Molnar <mi...@elte.hu>
---

kernel/sched.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 8a2bfd3..af7dfa7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2041,7 +2041,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* We should never call set_task_cpu() on a blocked task,
* ttwu() will sort out the placement.
*/
- WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING);
+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING);
#endif

trace_sched_migrate_task(p, new_cpu);
