[PATCH 1/2] sched: normalize sleeper's vruntime during group change

Dima Zavin

unread,

Sep 29, 2010, 2:50:01 AM9/29/10

to

This adds a new callback, prep_move_task, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group.

This allows us to properly normalize a sleeping task's vruntime
when moving it between different cgroups.

Cc: Arve Hjønnevåg <ar...@android.com>
Signed-off-by: Dima Zavin <di...@android.com>
---
include/linux/sched.h | 1 +
kernel/sched.c | 5 +++++
kernel/sched_fair.c | 16 +++++++++++++++-
3 files changed, 21 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

#ifdef CONFIG_FAIR_GROUP_SCHED
void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*prep_move_group) (struct task_struct *p, int on_rq);
#endif
};

diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->prep_move_group)
+ tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
set_task_rq(tsk, task_cpu(tsk));

#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..008fe57 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
static void moved_group_fair(struct task_struct *p, int on_rq)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ struct sched_entity *se = &p->se;

update_curr(cfs_rq);
- if (!on_rq)
+ if (!on_rq) {
+ se->vruntime += cfs_rq->min_vruntime;
place_entity(cfs_rq, &p->se, 1);
+ }
+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{
+ struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ struct sched_entity *se = &p->se;
+
+ /* normalize the runtime of a sleeping task before moving it */
+ if (!on_rq)
+ se->vruntime -= cfs_rq->min_vruntime;
}
#endif

@@ -3883,6 +3896,7 @@ static const struct sched_class fair_sched_class = {

#ifdef CONFIG_FAIR_GROUP_SCHED
.moved_group = moved_group_fair,
+ .prep_move_group = prep_move_group_fair,
#endif
};

--
1.6.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Dima Zavin

unread,

Sep 29, 2010, 2:50:01 AM9/29/10

to

After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead.

Cc: Arve Hjønnevåg <ar...@android.com>
Signed-off-by: Dima Zavin <di...@android.com>
---

kernel/sched_fair.c | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 008fe57..cb24ddb 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -802,6 +802,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
+ u64 min_vruntime;
+
/*
* Update run-time statistics of the 'current'.
*/
@@ -826,6 +828,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);
+
+ min_vruntime = cfs_rq->min_vruntime;
update_min_vruntime(cfs_rq);

/*
@@ -834,7 +838,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* movement in our normalized position.
*/
if (!(flags & DEQUEUE_SLEEP))
- se->vruntime -= cfs_rq->min_vruntime;
+ se->vruntime -= min_vruntime;
}

/*

Pekka Enberg

unread,

Sep 29, 2010, 3:00:02 AM9/29/10

to

On Wed, Sep 29, 2010 at 9:46 AM, Dima Zavin <di...@android.com> wrote:
> This adds a new callback, prep_move_task, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.
>
> Cc: Arve Hjønnevåg <ar...@android.com>
> Signed-off-by: Dima Zavin <di...@android.com>

Nitpick: this changelog probably needs better description of the
problem it's trying to solve or a link to the previous discussion
about it.

Dima Zavin

unread,

Sep 29, 2010, 3:20:01 AM9/29/10

to

This adds a new callback, prep_move_task, to struct sched_class
to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group.

This allows us to properly normalize a sleeping task's vruntime
when moving it between different cgroups.

More details about the problem:
http://lkml.org/lkml/2010/9/28/24

Cc: Arve Hjønnevåg <ar...@android.com>
Signed-off-by: Dima Zavin <di...@android.com>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..008fe57 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c

--

Mike Galbraith

unread,

Sep 29, 2010, 4:20:02 AM9/29/10

to

On Tue, 2010-09-28 at 23:46 -0700, Dima Zavin wrote:
> This adds a new callback, prep_move_task, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.

> diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
> index db3f674..008fe57 100644
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -3827,10 +3827,23 @@ static void set_curr_task_fair(struct rq *rq)
> static void moved_group_fair(struct task_struct *p, int on_rq)
> {
> struct cfs_rq *cfs_rq = task_cfs_rq(p);
> + struct sched_entity *se = &p->se;
>
> update_curr(cfs_rq);
> - if (!on_rq)
> + if (!on_rq) {
> + se->vruntime += cfs_rq->min_vruntime;
> place_entity(cfs_rq, &p->se, 1);
> + }
> +}

I'm still missing the place_entity() rationale. Why is a sleeping task
eligible for pain therapy that a runnable task is spared? That state
can change a usec from now.

Nit: place_entity(cfs_rq, se, 1);

-Mike

Dima Zavin

unread,

Sep 29, 2010, 3:10:03 PM9/29/10

to

I see your point, and I can't think of a reason why we'd want to
penalize the sleeper with START_DEBIT,
but now I am wondering why it was there in the first place.

--Dima

Dima Zavin

unread,

Sep 29, 2010, 5:50:01 PM9/29/10

to

If you switch the cgroup of a sleeping thread, its vruntime does
not get adjusted correctly for the difference between the
min_vruntime values of the two groups.

This patch adds a new callback, prep_move_task, to struct sched_class

to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group. This allows us to properly normalize
a sleeping task's vruntime when moving it between different cgroups.

More details about the problem:
http://lkml.org/lkml/2010/9/28/24

Cc: Arve Hjønnevåg <ar...@android.com>
Signed-off-by: Dima Zavin <di...@android.com>
---
include/linux/sched.h | 1 +
kernel/sched.c | 5 +++++

kernel/sched_fair.c | 14 +++++++++++++-
3 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db..ba3494e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1073,6 +1073,7 @@ struct sched_class {

#ifdef CONFIG_FAIR_GROUP_SCHED
void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*prep_move_group) (struct task_struct *p, int on_rq);
#endif
};

diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb..fe4bb20 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,6 +8297,11 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);

+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (tsk->sched_class->prep_move_group)
+ tsk->sched_class->prep_move_group(tsk, on_rq);
+#endif
+
set_task_rq(tsk, task_cpu(tsk));

#ifdef CONFIG_FAIR_GROUP_SCHED

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674..6ded59f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3827,10 +3827,21 @@ static void set_curr_task_fair(struct rq *rq)

static void moved_group_fair(struct task_struct *p, int on_rq)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
+ struct sched_entity *se = &p->se;

update_curr(cfs_rq);

if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);

+ se->vruntime += cfs_rq->min_vruntime;

+}
+
+static void prep_move_group_fair(struct task_struct *p, int on_rq)
+{

+ struct cfs_rq *cfs_rq = task_cfs_rq(p);

+ struct sched_entity *se = &p->se;

+
+ /* normalize the runtime of a sleeping task before moving it */

+ if (!on_rq)

+ se->vruntime -= cfs_rq->min_vruntime;
}
#endif

@@ -3883,6 +3894,7 @@ static const struct sched_class fair_sched_class = {

#ifdef CONFIG_FAIR_GROUP_SCHED
.moved_group = moved_group_fair,
+ .prep_move_group = prep_move_group_fair,
#endif
};

--
1.6.6

--

Peter Zijlstra

unread,

Sep 30, 2010, 6:50:02 AM9/30/10

to

On Tue, 2010-09-28 at 23:46 -0700, Dima Zavin wrote:

> This adds a new callback, prep_move_task, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group.
>
> This allows us to properly normalize a sleeping task's vruntime
> when moving it between different cgroups.

Don't much like these patches, and changelogs need full descriptions not
fuzzy links out to the interwebs that might or might not stay valid.

If you change a task's cgroup, the task is new in that cgroup and should
be placed like a new task would, carrying over any history it has in the
old cgroup is dubious at best.

Please explain this stuff..

Dima Zavin

unread,

Sep 30, 2010, 3:20:02 PM9/30/10

to

Peter,

>> This adds a new callback, prep_move_task, to struct sched_class
>> to give sched_fair the opportunity to adjust the task's vruntime
>> just before setting its new group.
>>
>> This allows us to properly normalize a sleeping task's vruntime
>> when moving it between different cgroups.
>
> Don't much like these patches, and changelogs need full descriptions not
> fuzzy links out to the interwebs that might or might not stay valid.

Sorry, I'll resend with a better changelog and no tubes.

> If you change a task's cgroup, the task is new in that cgroup and should
> be placed like a new task would, carrying over any history it has in the
> old cgroup is dubious at best.

That is certainly not the behavior today for running tasks as I
understand it. They get their vruntime normalized before being taken
off the run_queue of old group, and then get the new min_vruntime
added when they get re-enqueued. For sleeping tasks it's just plain
broken (see below).

Also, I am skeptical that the behavior you describe is desired. If you
were to just place the task in the new cgroup like it was a brand new
task, it could allow threads to game the system and potentially let
them reset their vruntime back. If I do a bunch of work and then move
myself out of the group and then back onto it, I may get lower
vruntime than by just staying on the group.

> Please explain this stuff..

The situation today is quite bad for sleeping tasks. Currently, when
you move a sleeping thread between cgroups, the thread can retain its
old vruntime value if the old group was far ahead of the new group
since it essentially does a max(se->vruntime, new_vruntime) in
place_entity. This can prevent the task from running for a very long
time. That is what this patch was trying to address. It normalizes the
sleeper thread's vruntime before moving it to the new group.

Thanks in advance.

--Dima

Peter Zijlstra

unread,

Oct 1, 2010, 8:10:03 AM10/1/10

to

On Thu, 2010-09-30 at 12:14 -0700, Dima Zavin wrote:
>
> > Please explain this stuff..
>
> The situation today is quite bad for sleeping tasks. Currently, when
> you move a sleeping thread between cgroups, the thread can retain its
> old vruntime value if the old group was far ahead of the new group
> since it essentially does a max(se->vruntime, new_vruntime) in
> place_entity. This can prevent the task from running for a very long
> time. That is what this patch was trying to address. It normalizes the
> sleeper thread's vruntime before moving it to the new group.
>
>

Hrm,.. ok, I tend to not use this cgroup gunk more that I absolutely
have to, so I'll take your word for it.

But doesn't normal cross-cpu task migration already solve this problem?
Therefore wouldn't it be possible to adapt/extend that code to also deal
with this particular issue?

It would avoid growing more cgroup fudge..

Dima Zavin

unread,

Oct 4, 2010, 3:20:02 PM10/4/10

to

>> > Please explain this stuff..
>>
>> The situation today is quite bad for sleeping tasks. Currently, when
>> you move a sleeping thread between cgroups, the thread can retain its
>> old vruntime value if the old group was far ahead of the new group
>> since it essentially does a max(se->vruntime, new_vruntime) in
>> place_entity. This can prevent the task from running for a very long
>> time. That is what this patch was trying to address. It normalizes the
>> sleeper thread's vruntime before moving it to the new group.
>>
>>
>
> Hrm,.. ok, I tend to not use this cgroup gunk more that I absolutely
> have to, so I'll take your word for it.
>
> But doesn't normal cross-cpu task migration already solve this problem?
> Therefore wouldn't it be possible to adapt/extend that code to also deal
> with this particular issue?

It does, but from what I can tell it does so lazily for sleeping
tasks, i.e. the logic is in try_to_wake_up(). The cgroup attach moves
the task immediately, so when we attempt to wake it up it will already
be too late for the wake_up code to do the right thing since the task
has the new cpu_rq assigned from sched_move_task(). The wakeup logic
will not have the old group info.

--Dima

Dima Zavin

unread,

Oct 6, 2010, 7:10:02 PM10/6/10

to

If you switch the cgroup of a sleeping thread, its vruntime does
not get adjusted correctly for the difference between the
min_vruntime values of the two groups.

The problem becomes most apparent when one has cgroups whose
cpu shares differ greatly, say group A.shares=1024 and group B.shares=52.
After some time, the vruntime of the group with the larger share (A)
will be way ahead of the group with the small share (B). Currently,
when a sleeping task is moved from group A to group B, it will retain its
larger vruntime value and thus will be way ahead of all the other tasks
in its new group. This will prevent this task from executing for an
extended period of time.

This patch adds a new callback, prep_move_task, to struct sched_class

to give sched_fair the opportunity to adjust the task's vruntime
just before setting its new group. This allows us to properly normalize
a sleeping task's vruntime when moving it between different cgroups.

Cc: Arve Hjønnevåg <ar...@android.com>

--

Mike Galbraith

unread,

Oct 6, 2010, 10:30:01 PM10/6/10

to

On Wed, 2010-10-06 at 15:56 -0700, Dima Zavin wrote:
> If you switch the cgroup of a sleeping thread, its vruntime does
> not get adjusted correctly for the difference between the
> min_vruntime values of the two groups.
>
> The problem becomes most apparent when one has cgroups whose
> cpu shares differ greatly, say group A.shares=1024 and group B.shares=52.
> After some time, the vruntime of the group with the larger share (A)
> will be way ahead of the group with the small share (B). Currently,
> when a sleeping task is moved from group A to group B, it will retain its
> larger vruntime value and thus will be way ahead of all the other tasks
> in its new group. This will prevent this task from executing for an
> extended period of time.

Yeah, seems clear that normalization is a must.

Questionable is whether we should apply START_DEBIT. That's part of a
different problem though (SCHED_PROCESS).

> This patch adds a new callback, prep_move_task, to struct sched_class
> to give sched_fair the opportunity to adjust the task's vruntime
> just before setting its new group. This allows us to properly normalize
> a sleeping task's vruntime when moving it between different cgroups.

Acked-by: Mike Galbraith <efa...@gmx.de>

--

Dima Zavin

unread,

Oct 7, 2010, 5:10:01 PM10/7/10

to

Mike,

Thanks for the Ack for patch 1/2, could you take a look at this one too?

Should I re-upload the series as v2 or you can pick the latest from
patch 1 and take this one?

--Dima

Mike Galbraith

unread,

Oct 8, 2010, 3:00:02 AM10/8/10

to

On Thu, 2010-10-07 at 14:00 -0700, Dima Zavin wrote:
> Mike,
>
> Thanks for the Ack for patch 1/2, could you take a look at this one too?

Ok, did that. I'd do it like below instead.

> Should I re-upload the series as v2 or you can pick the latest from
> patch 1 and take this one?

Peter's the merge point, I just help break stuff ;-)

I tested the below with pinned/unpinned mixes of sleepers and hogs, and
saw no ill effects. My thought on the logic is embedded in the comment.

From: Dima Zavin <di...@android.com>
Subject: [PATCH 2/2] sched: use the old min_vruntime when normalizing on dequeue
Date: Tue, 28 Sep 2010 23:46:14 -0700

After pulling the thread off the run-queue during a cgroup change,
the cfs_rq.min_vruntime gets recalculated. The dequeued thread's vruntime
then gets normalized to this new value. This can then lead to the thread
getting an unfair boost in the new group if the vruntime of the next
task in the old run-queue was way further ahead.

Cc: Arve Hjønnevåg <ar...@android.com>
Signed-off-by: Dima Zavin <di...@android.com>
---

kernel/sched_fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -826,15 +826,17 @@ dequeue_entity(struct cfs_rq *cfs_rq, st

if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
account_entity_dequeue(cfs_rq, se);

- update_min_vruntime(cfs_rq);

/*
- * Normalize the entity after updating the min_vruntime because the
- * update can refer to the ->curr item and we need to reflect this
- * movement in our normalized position.
+ * Normalize vruntime prior to updating min_vruntime. Any motion
+ * referring to ->curr will have been captured by update_curr() above.
+ * We don't want to preserve what lag might become as a result of
+ * this dequeue, we want to preserve what lag is at dequeue time.

*/
if (!(flags & DEQUEUE_SLEEP))

se->vruntime -= cfs_rq->min_vruntime;
+

+ update_min_vruntime(cfs_rq);
}

/*

Peter Zijlstra

unread,

Oct 15, 2010, 10:00:01 AM10/15/10

to

On Mon, 2010-10-04 at 12:18 -0700, Dima Zavin wrote:
> >> > Please explain this stuff..
> >>
> >> The situation today is quite bad for sleeping tasks. Currently, when
> >> you move a sleeping thread between cgroups, the thread can retain its
> >> old vruntime value if the old group was far ahead of the new group
> >> since it essentially does a max(se->vruntime, new_vruntime) in
> >> place_entity. This can prevent the task from running for a very long
> >> time. That is what this patch was trying to address. It normalizes the
> >> sleeper thread's vruntime before moving it to the new group.
> >>
> >>
> >
> > Hrm,.. ok, I tend to not use this cgroup gunk more that I absolutely
> > have to, so I'll take your word for it.
> >
> > But doesn't normal cross-cpu task migration already solve this problem?
> > Therefore wouldn't it be possible to adapt/extend that code to also deal
> > with this particular issue?
>
> It does, but from what I can tell it does so lazily for sleeping
> tasks, i.e. the logic is in try_to_wake_up(). The cgroup attach moves
> the task immediately, so when we attempt to wake it up it will already
> be too late for the wake_up code to do the right thing since the task
> has the new cpu_rq assigned from sched_move_task(). The wakeup logic
> will not have the old group info.

Wouldn't something like the below work as expected?

---
Subject: sched, cgroup: Fixup broken cgroup movement
From: Peter Zijlstra <a.p.zi...@chello.nl>
Date: Fri Oct 15 15:24:15 CEST 2010

Reported-by: Dima Zavin <di...@android.com>
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
---
include/linux/sched.h | 2 +-
kernel/sched.c | 8 ++++----
kernel/sched_fair.c | 25 +++++++++++++++++++------
3 files changed, 24 insertions(+), 11 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -8388,12 +8388,12 @@ void sched_move_task(struct task_struct

if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);

- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1073,7 +1073,7 @@ struct sched_class {
struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};

Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c

@@ -3831,13 +3831,26 @@ static void set_curr_task_fair(struct rq
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif

@@ -3889,7 +3902,7 @@ static const struct sched_class fair_sch
.get_rr_interval = get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};

tip-bot for Peter Zijlstra

unread,

Oct 22, 2010, 9:10:02 AM10/22/10

to

Commit-ID: b2b5ce022acf5e9f52f7b78c5579994fdde191d4
Gitweb: http://git.kernel.org/tip/b2b5ce022acf5e9f52f7b78c5579994fdde191d4
Author: Peter Zijlstra <a.p.zi...@chello.nl>
AuthorDate: Fri, 15 Oct 2010 15:24:15 +0200
Committer: Ingo Molnar <mi...@elte.hu>
CommitDate: Fri, 22 Oct 2010 14:16:45 +0200

sched, cgroup: Fixup broken cgroup movement

Dima noticed that we fail to correct the ->vruntime of sleeping tasks
when we move them between cgroups.

Reported-by: Dima Zavin <di...@android.com>
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>

Tested-by: Mike Galbraith <efa...@gmx.de>
LKML-Reference: <1287150604.29097.1513.camel@twins>
Signed-off-by: Ingo Molnar <mi...@elte.hu>

---
include/linux/sched.h | 2 +-
kernel/sched.c | 8 ++++----
kernel/sched_fair.c | 25 +++++++++++++++++++------
3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2cca9a9..be312c1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h

@@ -1073,7 +1073,7 @@ struct sched_class {
struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};

diff --git a/kernel/sched.c b/kernel/sched.c
index 5998222..3fe253e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8498,12 +8498,12 @@ void sched_move_task(struct task_struct *tsk)

if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);

- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 74cccfa..3acc2a4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3866,13 +3866,26 @@ static void set_curr_task_fair(struct rq *rq)

@@ -3924,7 +3937,7 @@ static const struct sched_class fair_sched_class = {