[RFC/RFT PATCH] sched: automated per tty task groups

Mike Galbraith

unread,

Oct 19, 2010, 5:20:02 AM10/19/10

to

Greetings,

Comments, suggestions etc highly welcome.

This patch implements an idea from Linus, to automatically create task groups
per tty, to improve desktop interactivity under hefty load such as kbuild. The
feature is enabled from boot by default, The default setting can be changed via
the boot option ttysched=0, and can be can be turned on or off on the fly via
echo [01] > /proc/sys/kernel/sched_tty_sched_enabled.

A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10

pert/s: 229 >5484.43us: 41 min: 0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
pert/s: 222 >5652.28us: 43 min: 0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
pert/s: 211 >5809.38us: 43 min: 0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
pert/s: 223 >6147.92us: 43 min: 0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
pert/s: 218 >6252.64us: 43 min: 0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

Signed-off-by: Mike Galbraith <efa...@gmx.de>

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h
@@ -1900,6 +1900,20 @@ int sched_rt_handler(struct ctl_table *t

extern unsigned int sysctl_sched_compat_yield;

+#ifdef CONFIG_SCHED_DESKTOP
+int sched_tty_sched_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
+extern unsigned int sysctl_sched_tty_sched_enabled;
+
+void tty_sched_create_group(struct tty_struct *tty);
+void tty_sched_destroy_group(struct tty_struct *tty);
+#else
+static inline void tty_sched_create_group(struct tty_struct *tty) { }
+static inline void tty_sched_destroy_group(struct tty_struct *tty) { }
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
Index: linux-2.6.36.git/include/linux/tty.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/tty.h
+++ linux-2.6.36.git/include/linux/tty.h
@@ -327,6 +327,9 @@ struct tty_struct {
/* If the tty has a pending do_SAK, queue it here - akpm */
struct work_struct SAK_work;
struct tty_port *port;
+#ifdef CONFIG_SCHED_DESKTOP
+ struct task_group *tg;
+#endif
};

/* Each of a tty's open files has private_data pointing to tty_file_private */
Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -78,6 +78,7 @@

#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_tty.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -612,11 +613,16 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+
+ tty_sched_check_attach(p, &tg);
+
+ return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1920,6 +1926,7 @@ static void deactivate_task(struct rq *r
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_tty.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
Index: linux-2.6.36.git/drivers/char/tty_io.c
===================================================================
--- linux-2.6.36.git.orig/drivers/char/tty_io.c
+++ linux-2.6.36.git/drivers/char/tty_io.c
@@ -185,6 +185,7 @@ void free_tty_struct(struct tty_struct *
{
kfree(tty->write_buf);
tty_buffer_free_all(tty);
+ tty_sched_destroy_group(tty);
kfree(tty);
}

@@ -2823,6 +2824,7 @@ void initialize_tty_struct(struct tty_st
tty->ops = driver->ops;
tty->index = idx;
tty_line_name(driver, idx, tty->name);
+ tty_sched_create_group(tty);
}

/**
Index: linux-2.6.36.git/kernel/sched_tty.h
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_tty.h
@@ -0,0 +1,7 @@
+#ifdef CONFIG_SCHED_DESKTOP
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg);
+#else
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg) { }
+#endif
Index: linux-2.6.36.git/kernel/sched_tty.c
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_tty.c
@@ -0,0 +1,128 @@
+#ifdef CONFIG_SCHED_DESKTOP
+#include <linux/tty.h>
+
+unsigned int __read_mostly sysctl_sched_tty_sched_enabled = 1;
+
+void tty_sched_create_group(struct tty_struct *tty)
+{
+ tty->tg = sched_create_group(&init_task_group);
+ if (IS_ERR(tty->tg)) {
+ tty->tg = &init_task_group;
+ WARN_ON(1);
+ }
+}
+EXPORT_SYMBOL(tty_sched_create_group);
+
+void tty_sched_destroy_group(struct tty_struct *tty)
+{
+ if (tty->tg && tty->tg != &init_task_group)
+ sched_destroy_group(tty->tg);
+}
+EXPORT_SYMBOL(tty_sched_destroy_group);
+
+static inline void
+tty_sched_check_attach(struct task_struct *p, struct task_group **tg)
+{
+ struct tty_struct *tty;
+ int attach = 0, enabled = sysctl_sched_tty_sched_enabled;
+
+ rcu_read_lock();
+ tty = p->signal->tty;
+ if (!tty)
+ goto out_unlock;
+
+ if (enabled && *tg == &root_task_group) {
+ *tg = p->signal->tty->tg;
+ attach = 1;
+ } else if (!enabled && *tg == tty->tg) {
+ *tg = &root_task_group;
+ attach = 1;
+ }
+
+ if (attach && !p->se.on_rq) {
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ p->se.vruntime += (*tg)->cfs_rq[task_cpu(p)]->min_vruntime;
+ }
+
+out_unlock:
+ rcu_read_unlock();
+}
+
+void tty_sched_move_task(struct task_struct *p, struct task_group *tg)
+{
+ struct sched_entity *se = &p->se;
+ struct rq *rq;
+ unsigned long flags;
+ int on_rq, running, cpu;
+
+ rq = task_rq_lock(p, &flags);
+
+ running = task_current(rq, p);
+ on_rq = se->on_rq;
+ cpu = rq->cpu;
+
+ if (on_rq)
+ dequeue_task(rq, p, 0);
+ if (unlikely(running))
+ p->sched_class->put_prev_task(rq, p);
+
+ if (!on_rq)
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
+
+ se->cfs_rq = tg->cfs_rq[cpu];
+ se->parent = tg->se[cpu];
+
+ p->rt.rt_rq = tg->rt_rq[cpu];
+ p->rt.parent = tg->rt_se[cpu];
+
+ if (!on_rq)
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
+
+ if (unlikely(running))
+ p->sched_class->set_curr_task(rq);
+ if (on_rq)
+ enqueue_task(rq, p, 0);
+
+ task_rq_unlock(rq, &flags);
+}
+
+int sched_tty_sched_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct task_struct *p, *t;
+ struct task_group *tg;
+ unsigned long flags;
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ read_lock_irqsave(&tasklist_lock, flags);
+
+ rcu_read_lock();
+ for_each_process(p) {
+ tg = task_group(p);
+ tty_sched_move_task(p, tg);
+ list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+ tty_sched_move_task(t, tg);
+ }
+ }
+ rcu_read_unlock();
+
+ read_unlock_irqrestore(&tasklist_lock, flags);
+
+ return 0;
+}
+
+static int __init setup_tty_sched(char *str)
+{
+ unsigned long val;
+
+ val = simple_strtoul(str, NULL, 0);
+ sysctl_sched_tty_sched_enabled = val ? 1 : 0;
+
+ return 1;
+}
+__setup("ttysched=", setup_tty_sched);
+#endif
Index: linux-2.6.36.git/kernel/sysctl.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sysctl.c
+++ linux-2.6.36.git/kernel/sysctl.c
@@ -384,6 +384,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_DESKTOP
+ {
+ .procname = "sched_tty_sched_enabled",
+ .data = &sysctl_sched_tty_sched_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_tty_sched_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
Index: linux-2.6.36.git/init/Kconfig
===================================================================
--- linux-2.6.36.git.orig/init/Kconfig
+++ linux-2.6.36.git/init/Kconfig
@@ -652,6 +652,19 @@ config DEBUG_BLK_CGROUP

endif # CGROUPS

+config SCHED_DESKTOP
+ bool "Desktop centric group scheduling"
+ depends on EXPERIMENTAL
+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED
+ select RT_GROUP_SCHED
+ select BLK_CGROUP
+ help
+ This option optimizes the group scheduler for common desktop workloads,
+ by creating separate per tty groups. This separation of workloads isolates
+ aggressive CPU burners (like build jobs) from desktop applications.
+
config MM_OWNER
bool

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

unread,

Oct 19, 2010, 5:30:01 AM10/19/10

to

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> Greetings,
>
> Comments, suggestions etc highly welcome.

You might be wanting to exclude RT tasks from the tty groups since there
is no interface to grant them any runtime and such :-)

Also, I think tty_sched_move_task and sched_move_task() should be
sharing lots more code -- I recently proposed a fix for
sched_move_task() because the Android people complained, but they
haven't replied back yet..

Peter Zijlstra

unread,

Oct 19, 2010, 5:40:03 AM10/19/10

to

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:

> + read_lock_irqsave(&tasklist_lock, flags);
> +
> + rcu_read_lock();
> + for_each_process(p) {
> + tg = task_group(p);
> + tty_sched_move_task(p, tg);
> + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> + tty_sched_move_task(t, tg);
> + }
> + }
> + rcu_read_unlock();
> +
> + read_unlock_irqrestore(&tasklist_lock, flags);

I don't think you need to disable IRQs for tasklist lock, nor do I think
you actually need it.

If you enable tty groups and then scan all the existing tasks you've
covered them all, new tasks will already be placed right, dying tasks we
don't care about anyway.

Mike Galbraith

unread,

Oct 19, 2010, 5:50:01 AM10/19/10

to

On Tue, 2010-10-19 at 11:29 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> > + read_lock_irqsave(&tasklist_lock, flags);
> > +
> > + rcu_read_lock();
> > + for_each_process(p) {
> > + tg = task_group(p);
> > + tty_sched_move_task(p, tg);
> > + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> > + tty_sched_move_task(t, tg);
> > + }
> > + }
> > + rcu_read_unlock();
> > +
> > + read_unlock_irqrestore(&tasklist_lock, flags);
>
> I don't think you need to disable IRQs for tasklist lock, nor do I think
> you actually need it.

OK, thanks. (No such thing as too paranoid;)

-Mike

Mike Galbraith

unread,

Oct 19, 2010, 5:50:02 AM10/19/10

to

On Tue, 2010-10-19 at 11:43 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
> >
> > Yeah. I should be able to just do sched_move_task(), but it doesn't
> > currently work even with your patch, turning tty_sched on/off can lead
> > to incredible delays before you get box back. With virgin source, it's
> > size infinity for all intents and purposes.
>
> Ah, first feedback I've got on that patch,. but surely since you created
> one that does work we can fix my patch and use the normal path? :-)

Yeah. I wanted to get the RFC out the door, so took a shortcut.

-Mike

Peter Zijlstra

unread,

Oct 19, 2010, 5:50:01 AM10/19/10

to

On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
>
> Yeah. I should be able to just do sched_move_task(), but it doesn't
> currently work even with your patch, turning tty_sched on/off can lead
> to incredible delays before you get box back. With virgin source, it's
> size infinity for all intents and purposes.

Ah, first feedback I've got on that patch,. but surely since you created
one that does work we can fix my patch and use the normal path? :-)

Mike Galbraith

unread,

Oct 19, 2010, 5:50:02 AM10/19/10

to

On Tue, 2010-10-19 at 11:26 +0200, Peter Zijlstra wrote:
> On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:
> > Greetings,
> >
> > Comments, suggestions etc highly welcome.
>
> You might be wanting to exclude RT tasks from the tty groups since there
> is no interface to grant them any runtime and such :-)

:)

> Also, I think tty_sched_move_task and sched_move_task() should be
> sharing lots more code -- I recently proposed a fix for
> sched_move_task() because the Android people complained, but they
> haven't replied back yet..

Yeah. I should be able to just do sched_move_task(), but it doesn't

currently work even with your patch, turning tty_sched on/off can lead
to incredible delays before you get box back. With virgin source, it's
size infinity for all intents and purposes.

-Mike

Mike Galbraith

unread,

Oct 19, 2010, 7:30:02 AM10/19/10

to

It was suggested that I show a bit more info.

On Tue, 2010-10-19 at 11:16 +0200, Mike Galbraith wrote:

> A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10
>
> pert/s: 229 >5484.43us: 41 min: 0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
> pert/s: 222 >5652.28us: 43 min: 0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
> pert/s: 211 >5809.38us: 43 min: 0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
> pert/s: 223 >6147.92us: 43 min: 0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
> pert/s: 218 >6252.64us: 43 min: 0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

The same load without per tty task groups.

pert/s: 31 >40475.37us: 3 min: 0.37 max:48103.60 avg:29573.74 sum/s:916786us overhead:90.24%
pert/s: 23 >41237.70us: 12 min: 0.36 max:56010.39 avg:40187.01 sum/s:924301us overhead:91.99%
pert/s: 24 >42150.22us: 12 min: 8.86 max:61265.91 avg:39459.91 sum/s:947038us overhead:92.20%
pert/s: 26 >42344.91us: 11 min: 3.83 max:52029.60 avg:36164.70 sum/s:940282us overhead:91.12%
pert/s: 24 >44262.90us: 14 min: 5.05 max:82735.15 avg:40314.33 sum/s:967544us overhead:92.22%
^^^^^usecs ^^^^^usecs ^^the competition got

Average service latency is an order of magnitude better with tty_sched.
(Imagine that pert is Xorg or whatnot instead)

Using Mathieu Desnoyers' wakeup-latency testcase (attached):

With taskset -c 3 make -j 10 running..

taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency

without:
maximum latency: 42963.2 µs
average latency: 9077.0 µs
missed timer events: 0

with:
maximum latency: 4160.7 µs
average latency: 149.4 µs
missed timer events: 0

Patch makes a big difference in desktop feel under hefty load here.

-Mike

wakeup-latency.c

Ingo Molnar

unread,

Oct 19, 2010, 8:00:02 AM10/19/10

to

* Mike Galbraith <efa...@gmx.de> wrote:

> Using Mathieu Desnoyers' wakeup-latency testcase (attached):
>
> With taskset -c 3 make -j 10 running..
>
> taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency
>
> without:

> maximum latency: 42963.2 �s
> average latency: 9077.0 �s

> missed timer events: 0
>
> with:

> maximum latency: 4160.7 �s
> average latency: 149.4 �s

> missed timer events: 0
>
> Patch makes a big difference in desktop feel under hefty load here.

That's really nice!

Could this feature realistically do block IO isolation as well? It's always annoying
when some big IO job is making the desktop jerky. Especially as your patch is
selecting the block cgroup feature already:

+ select BLK_CGROUP

Thanks,

Ingo

Mike Galbraith

unread,

Oct 19, 2010, 9:20:02 AM10/19/10

to

On Tue, 2010-10-19 at 13:56 +0200, Ingo Molnar wrote:

> Could this feature realistically do block IO isolation as well? It's always annoying
> when some big IO job is making the desktop jerky. Especially as your patch is
> selecting the block cgroup feature already:
>
> + select BLK_CGROUP

I know my cgroup pgid config helps a bunch with rummaging in my email
while other IO is going on. I've been attributing that to BLK_CGROUP,
but have no proof.

-Mike

Linus Torvalds

unread,

Oct 19, 2010, 11:30:01 AM10/19/10

to

On Tue, Oct 19, 2010 at 4:29 AM, Mike Galbraith <efa...@gmx.de> wrote:
> It was suggested that I show a bit more info.

Yes, I was going to complain that the numbers in the commit message
made no sense without something to compare the numbers to.

> The same load without per tty task groups.

Very impressive. This definitely looks like something people will notice.

That said, I do think we should think carefully about calling this a
"tty" feature. I think we might want to leave the door open to other
heuristics than _just_ the tty group. I think the tty group approach
is wonderful for traditional Unix loads in a desktop environment, but
I suspect we might hit issues with IDE's etc too. I don't know if we
can notice things like that automatically, but I think it's worth
thinking about.

So I think the patch looks pretty good, and the numbers seem to look
just stunningly so, but I'd like to name the feature more along the
lines of "automatic process group scheduling" rather than about tty's
per se.

And you actually did that for the Kconfig option, which makes me quite happy.

The one other thing I do wonder about is how noticeable the group
scheduling overhead is. If people compare with a non-CGROUP_SCHED
kernel, will a desktop-optimized kernel suddenly have horrible pipe
latency due to much higher scheduling cost? Right now that whole
feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
I never timed it when I tried it out long ago..

Linus

Mike Galbraith

unread,

Oct 19, 2010, 2:20:02 PM10/19/10

to

On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> On Tue, Oct 19, 2010 at 4:29 AM, Mike Galbraith <efa...@gmx.de> wrote:

> So I think the patch looks pretty good, and the numbers seem to look
> just stunningly so, but I'd like to name the feature more along the
> lines of "automatic process group scheduling" rather than about tty's
> per se.

Oh, absolutely, that's what it's all about really. What I'd _like_ is
to get per process group scheduling working on the cheap..ish. Your
idea of tty cgoups looked much simpler though, so I figured that would
be a great place to start. It turned out to be much simpler than I
thought it would be, which is encouraging, and it works well in testing
(so far that is).

> And you actually did that for the Kconfig option, which makes me quite happy.

(Ingo's input.. spot on)

> The one other thing I do wonder about is how noticeable the group
> scheduling overhead is.

Very noticeable, cgroups is far from free. It would make no sense for a
performance freak to even think about it. I don't run cgroup enabled
kernels usually, and generally strip to the bone because I favor
throughput very very heavily, but when I look at the desktop under load,
the cost/performance trade-off ~seems to work out.

> If people compare with a non-CGROUP_SCHED
> kernel, will a desktop-optimized kernel suddenly have horrible pipe
> latency due to much higher scheduling cost? Right now that whole
> feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> I never timed it when I tried it out long ago..

The scheduling cost is quite high. But realistically, the cost of a
distro kernel with full featured network stack is (much) higher. I
seriously doubt the cost of cgroups would be noticed by the typical
_desktop_ user. Overall latencies for any switchy microbenchmark will
certainly be considerably higher with the feature enabled.

-Mike

Mike Galbraith

unread,

Oct 19, 2010, 3:00:02 PM10/19/10

to

> On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> >
> > If people compare with a non-CGROUP_SCHED
> > kernel, will a desktop-optimized kernel suddenly have horrible pipe
> > latency due to much higher scheduling cost? Right now that whole
> > feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> > I never timed it when I tried it out long ago..

Q/D test of kernels w/wo, with same .config using pipe-test (pure sched)
gives on my box ~590khz with tty_sched active, 620khz without cgroups
acitve in same kernel/config without patch. last time I measured
stripped down config (not long ago, but not yesterday either) gave max
ctx rate ~690khz on this box.

(note: very Q, very D numbers, no variance testing, ballpark)

Ingo Molnar

unread,

Oct 19, 2010, 11:00:02 PM10/19/10

to

* Mike Galbraith <efa...@gmx.de> wrote:

> > On Tue, 2010-10-19 at 08:28 -0700, Linus Torvalds wrote:
> > >
> > > If people compare with a non-CGROUP_SCHED
> > > kernel, will a desktop-optimized kernel suddenly have horrible pipe
> > > latency due to much higher scheduling cost? Right now that whole
> > > feature is hidden by EXPERIMENTAL, I don't know how much it hurts, and
> > > I never timed it when I tried it out long ago..
>
> Q/D test of kernels w/wo, with same .config using pipe-test (pure sched) gives on
> my box ~590khz with tty_sched active, 620khz without cgroups acitve in same
> kernel/config without patch. last time I measured stripped down config (not long
> ago, but not yesterday either) gave max ctx rate ~690khz on this box.
>
> (note: very Q, very D numbers, no variance testing, ballpark)

That's 5% overhead in context switches. Definitely not in the 'horrible' category.

This would be a rather tempting item for 2.6.37 ... especially as it really mainly
reuses existing group scheduling functionality, in a clever way.

Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and
resend the patch?

I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core
kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather
meaningless distinction. Since the feature is default-n, people will get the old
scheduler by default but can also choose this desktop-centric scheduling mode.

I'd even argue to make it default-y, because this patch clearly cures a form of
kbuild cancer.

Thanks,

Ingo

Markus Trippelsdorf

unread,

Oct 20, 2010, 10:10:02 AM10/20/10

to

Mike Galbraith wrote:

> Comments, suggestions etc highly welcome.

I've tested your patch and it runs smoothly on my machine.
However I had several NULL pointer dereference BUGs that happened when I
left X or rebooted my system. I think this is caused by your patch.
There is nothing in the logs unfortunately, but I scribbled down the
following by hand (not the whole trace, I'm too lazy):

BUG: unable to handle NULL pointer dereference at 0..038
IP: pick_next_task_fair 0xa7/0x1a0
...
Call Trace: schedule
...

Mike Galbraith

unread,

Oct 20, 2010, 10:50:01 AM10/20/10

to

On Wed, 2010-10-20 at 15:55 +0200, Markus Trippelsdorf wrote:
> Mike Galbraith wrote:
>
> > Comments, suggestions etc highly welcome.
>
> I've tested your patch and it runs smoothly on my machine.
> However I had several NULL pointer dereference BUGs that happened when I
> left X or rebooted my system. I think this is caused by your patch.
> There is nothing in the logs unfortunately, but I scribbled down the
> following by hand (not the whole trace, I'm too lazy):
>
> BUG: unable to handle NULL pointer dereference at 0..038
> IP: pick_next_task_fair 0xa7/0x1a0
> ...
> Call Trace: schedule
> ...

Hm. Not much I can do without the trace, but thanks for testing and
reporting anyway, guess I need to do some heavy stress testing. I'm
re-writing it as I write this anyway.

thanks,

-Mike

Mike Galbraith

unread,

Oct 21, 2010, 4:00:02 AM10/21/10

to

On Tue, 2010-10-19 at 11:43 +0200, Peter Zijlstra wrote:

> On Tue, 2010-10-19 at 11:39 +0200, Mike Galbraith wrote:
> >
> > Yeah. I should be able to just do sched_move_task(), but it doesn't
> > currently work even with your patch, turning tty_sched on/off can lead
> > to incredible delays before you get box back. With virgin source, it's
> > size infinity for all intents and purposes.
>
> Ah, first feedback I've got on that patch,. but surely since you created
> one that does work we can fix my patch and use the normal path? :-)

Actually, your patch works just peachy, my fiddling with sleepers
vruntime at attach time was a dainbramaged thing to do.

-Mike

Mike Galbraith

unread,

Oct 21, 2010, 4:20:02 AM10/21/10

to

On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:

> Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and
> resend the patch?

Here she comes. Better/Worse?

Changes:
- tty->autogroup.
- only autogroup fair class tasks.
- removed dainbramaged sleeper vruntime twiddling.
- removed paranoid locking.
- removed noop detatch code.

> I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core
> kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather
> meaningless distinction. Since the feature is default-n, people will get the old
> scheduler by default but can also choose this desktop-centric scheduling mode.
>
> I'd even argue to make it default-y, because this patch clearly cures a form of
> kbuild cancer.

You top dogs can make the default call.. it it's accepted that is ;-)

Marcus: care to try the below? Works fine for me (but so did first
cut). It depends on the attached patch, and applied to virgin shiny new
2.6.36.

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity. This patch implements an idea from Linus,
to automatically create task groups. This patch only implements Linus' per
tty task group suggestion, and only for fair class tasks, but leaves the way
open for enhancement.

How it works: at tty alloc/dealloc time, a task group is created/destroyed,
so there is always a task group active per tty. When we select a runqueue,
if the task has a has a tty association, and no task group, attach it to a
per tty autogroup on the fly.

The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP is
selected, but can be disabled via the boot option noautogroup, and can be
also be turned on/off on the fly via..
echo [01] > /proc/sys/kernel/sched_autogroup_enabled.
..which will automatically move tasks to/from the root task group.

Some numbers.

A 100% hog overhead measurement proggy pinned to the same CPU as a make -j10

About measurement proggy:
pert/sec = perturbations/sec
min/max/avg = scheduler service latencies in usecs
sum/s = time accrued by the competition per sample period (1 sec here)
overhead = %CPU received by the competition per sample period

pert/s: 31 >40475.37us: 3 min: 0.37 max:48103.60 avg:29573.74 sum/s:916786us overhead:90.24%
pert/s: 23 >41237.70us: 12 min: 0.36 max:56010.39 avg:40187.01 sum/s:924301us overhead:91.99%
pert/s: 24 >42150.22us: 12 min: 8.86 max:61265.91 avg:39459.91 sum/s:947038us overhead:92.20%
pert/s: 26 >42344.91us: 11 min: 3.83 max:52029.60 avg:36164.70 sum/s:940282us overhead:91.12%
pert/s: 24 >44262.90us: 14 min: 5.05 max:82735.15 avg:40314.33 sum/s:967544us overhead:92.22%

Same load with this patch applied.

pert/s: 229 >5484.43us: 41 min: 0.15 max:12069.42 avg:2193.81 sum/s:502382us overhead:50.24%
pert/s: 222 >5652.28us: 43 min: 0.46 max:12077.31 avg:2248.56 sum/s:499181us overhead:49.92%
pert/s: 211 >5809.38us: 43 min: 0.16 max:12064.78 avg:2381.70 sum/s:502538us overhead:50.25%
pert/s: 223 >6147.92us: 43 min: 0.15 max:16107.46 avg:2282.17 sum/s:508925us overhead:50.49%
pert/s: 218 >6252.64us: 43 min: 0.16 max:12066.13 avg:2324.11 sum/s:506656us overhead:50.27%

Average service latency is an order of magnitude better with autogroup.
(Imagine that pert were Xorg or whatnot instead)

Using Mathieu Desnoyers' wakeup-latency testcase:

With taskset -c 3 make -j 10 running..

taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency

without:
maximum latency: 42963.2 µs
average latency: 9077.0 µs
missed timer events: 0

with:
maximum latency: 4160.7 µs
average latency: 149.4 µs
missed timer events: 0

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h
@@ -1900,6 +1900,20 @@ int sched_rt_handler(struct ctl_table *t

extern unsigned int sysctl_sched_compat_yield;

+#ifdef CONFIG_SCHED_AUTOGROUP
+int sched_autogroup_handler(struct ctl_table *table, int write,

+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+

+extern unsigned int sysctl_sched_autogroup_enabled;
+
+void sched_autogroup_create_tty(struct tty_struct *tty);
+void sched_autogroup_destroy_tty(struct tty_struct *tty);
+#else
+static inline void sched_autogroup_create_tty(struct tty_struct *tty) { }
+static inline void sched_autogroup_destroy_tty(struct tty_struct *tty) { }

+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
Index: linux-2.6.36.git/include/linux/tty.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/tty.h
+++ linux-2.6.36.git/include/linux/tty.h
@@ -327,6 +327,9 @@ struct tty_struct {
/* If the tty has a pending do_SAK, queue it here - akpm */
struct work_struct SAK_work;
struct tty_port *port;

+#ifdef CONFIG_SCHED_AUTOGROUP

+ struct task_group *tg;
+#endif
};

/* Each of a tty's open files has private_data pointing to tty_file_private */
Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -78,6 +78,7 @@

#include "sched_cpupri.h"
#include "workqueue_sched.h"

+#include "sched_autogroup.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -612,11 +613,16 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+

+ autogroup_check_attach(p, &tg);

+
+ return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1920,6 +1926,7 @@ static void deactivate_task(struct rq *r
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"

+#include "sched_autogroup.c"

#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
Index: linux-2.6.36.git/drivers/char/tty_io.c
===================================================================
--- linux-2.6.36.git.orig/drivers/char/tty_io.c
+++ linux-2.6.36.git/drivers/char/tty_io.c
@@ -185,6 +185,7 @@ void free_tty_struct(struct tty_struct *
{
kfree(tty->write_buf);
tty_buffer_free_all(tty);

+ sched_autogroup_destroy_tty(tty);

kfree(tty);
}

@@ -2823,6 +2824,7 @@ void initialize_tty_struct(struct tty_st
tty->ops = driver->ops;
tty->index = idx;
tty_line_name(driver, idx, tty->name);

+ sched_autogroup_create_tty(tty);
}

/**
Index: linux-2.6.36.git/kernel/sched_autogroup.h
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.h
@@ -0,0 +1,7 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg);
+#else
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg) { }
+#endif

Index: linux-2.6.36.git/kernel/sysctl.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sysctl.c
+++ linux-2.6.36.git/kernel/sysctl.c
@@ -384,6 +384,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},

+#ifdef CONFIG_SCHED_AUTOGROUP
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,

+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,

+ .proc_handler = sched_autogroup_handler,

+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
Index: linux-2.6.36.git/init/Kconfig
===================================================================
--- linux-2.6.36.git.orig/init/Kconfig
+++ linux-2.6.36.git/init/Kconfig
@@ -652,6 +652,19 @@ config DEBUG_BLK_CGROUP

endif # CGROUPS

+config SCHED_AUTOGROUP
+ bool "Automatic process group scheduling"

+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED

+ select BLK_CGROUP
+ help

+ This option optimizes the scheduler for common desktop workloads by
+ automatically creating and populating task groups. This separation
+ of workloads isolates aggressive CPU burners (like build jobs) from
+ desktop applications. Task group autogeneration is currently based
+ upon task tty association.
+
config MM_OWNER
bool

Index: linux-2.6.36.git/kernel/sched_autogroup.c
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.c
@@ -0,0 +1,76 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+#include <linux/tty.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+
+void sched_autogroup_create_tty(struct tty_struct *tty)

+{
+ tty->tg = sched_create_group(&init_task_group);
+ if (IS_ERR(tty->tg)) {
+ tty->tg = &init_task_group;
+ WARN_ON(1);
+ }
+}

+EXPORT_SYMBOL(sched_autogroup_create_tty);
+
+void sched_autogroup_destroy_tty(struct tty_struct *tty)

+{
+ if (tty->tg && tty->tg != &init_task_group)
+ sched_destroy_group(tty->tg);
+}

+EXPORT_SYMBOL(sched_autogroup_destroy_tty);
+
+static void
+autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
+{
+ struct tty_struct *tty = p->signal->tty;
+
+ if (!tty)
+ return;
+

+ *tg = p->signal->tty->tg;
+}

+
+static inline void
+autogroup_check_attach(struct task_struct *p, struct task_group **tg)
+{
+ if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
+ p->sched_class != &fair_sched_class)
+ return;
+
+ rcu_read_lock();
+
+ autogroup_attach_tty(p, tg);

+
+ rcu_read_unlock();
+}
+

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)

+{
+ struct task_struct *p, *t;
+ struct task_group *tg;

+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+

+ for_each_process(p) {
+ tg = task_group(p);

+ sched_move_task(p);

+ list_for_each_entry_rcu(t, &p->thread_group, thread_group) {

+ sched_move_task(t);
+ }
+ }

+
+ return 0;
+}
+

+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;

+
+ return 1;
+}

+__setup("noautogroup", setup_autogroup);
+#endif

cgroup_fixup_broken_cgroup_movement.diff

Ingo Molnar

unread,

Oct 21, 2010, 4:40:01 AM10/21/10

to

* Mike Galbraith <efa...@gmx.de> wrote:

> On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
>
> > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and
> > resend the patch?
>
> Here she comes. Better/Worse?
>
> Changes:
> - tty->autogroup.
> - only autogroup fair class tasks.
> - removed dainbramaged sleeper vruntime twiddling.
> - removed paranoid locking.
> - removed noop detatch code.

I really like the new 'autogroup scheduler' name - as we really dont want to turn
this into anything but an intelligent grouping thing. Via the naming we can resist
heuristics for example.

Btw., how does Xorg fare with this? Can we remove sleeper fairness for example and
simplify other bits of the CFS logic as a side-effect?

Markus Trippelsdorf

unread,

Oct 21, 2010, 4:50:01 AM10/21/10

to

On Thu, Oct 21, 2010 at 10:11:55AM +0200, Mike Galbraith wrote:
> On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
>
> > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and
> > resend the patch?
>
> Here she comes. Better/Worse?
>
> Changes:
> - tty->autogroup.
> - only autogroup fair class tasks.
> - removed dainbramaged sleeper vruntime twiddling.
> - removed paranoid locking.
> - removed noop detatch code.
>
> > I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core
> > kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather
> > meaningless distinction. Since the feature is default-n, people will get the old
> > scheduler by default but can also choose this desktop-centric scheduling mode.
> >
> > I'd even argue to make it default-y, because this patch clearly cures a form of
> > kbuild cancer.
>
> You top dogs can make the default call.. it it's accepted that is ;-)
>
> Marcus: care to try the below? Works fine for me (but so did first
> cut). It depends on the attached patch, and applied to virgin shiny new
> 2.6.36.

Works fine interactively, but I still get the same kernel BUG on reboot
time as before. Would a photo of the trace help you?

--
Markus

Mike Galbraith

unread,

Oct 21, 2010, 4:50:02 AM10/21/10

to

On Thu, 2010-10-21 at 10:31 +0200, Ingo Molnar wrote:

> Btw., how does Xorg fare with this? Can we remove sleeper fairness for example and
> simplify other bits of the CFS logic as a side-effect?

Works a treat for me. As I write this in evolution, I have amarok
playing with a visualization and a make -j100 running. Song switch is
instant, visualization is nice and smooth despite unaccelerated Xorg.

We can't whack fair sleepers though, not without inventing a new
preemption model to take it's place.

-Mike

Mike Galbraith

unread,

Oct 21, 2010, 5:00:03 AM10/21/10

to

On Thu, 2010-10-21 at 10:48 +0200, Markus Trippelsdorf wrote:
> On Thu, Oct 21, 2010 at 10:11:55AM +0200, Mike Galbraith wrote:
> > On Wed, 2010-10-20 at 04:56 +0200, Ingo Molnar wrote:
> >
> > > Mind doing more of the tty->desktop renames/generalizations as Linus suggested, and
> > > resend the patch?
> >
> > Here she comes. Better/Worse?
> >
> > Changes:
> > - tty->autogroup.
> > - only autogroup fair class tasks.
> > - removed dainbramaged sleeper vruntime twiddling.
> > - removed paranoid locking.
> > - removed noop detatch code.
> >
> > > I'd also suggest to move it out of EXPERIMENTAL - we dont really do that for core
> > > kernel features as most distros enable CONFIG_EXPERIMENTAL so it's a rather
> > > meaningless distinction. Since the feature is default-n, people will get the old
> > > scheduler by default but can also choose this desktop-centric scheduling mode.
> > >
> > > I'd even argue to make it default-y, because this patch clearly cures a form of
> > > kbuild cancer.
> >
> > You top dogs can make the default call.. it it's accepted that is ;-)
> >
> > Marcus: care to try the below? Works fine for me (but so did first
> > cut). It depends on the attached patch, and applied to virgin shiny new
> > 2.6.36.
>
> Works fine interactively, but I still get the same kernel BUG on reboot
> time as before. Would a photo of the trace help you?

Odd. Yeah, please send me the photo and your .config.

-Mike

Marco

unread,

Oct 21, 2010, 5:20:02 AM10/21/10

to

Hello, i'm a little confused about tty grouping.
The benefit in terms of latency is only due to the grouping of task
under the appropriate cgroup hierarchy?

Can this issue be solved by userspace daemon (think of libpam-systemd,
libpam-cgroup)? Perhaps tty grouping in kernel space is more
efficient.

I'm missing something?

Marco

Peter Zijlstra

unread,

Oct 21, 2010, 6:30:01 AM10/21/10

to

On Thu, 2010-10-21 at 09:55 +0200, Mike Galbraith wrote:
> Actually, your patch works just peachy, my fiddling with sleepers
> vruntime at attach time was a dainbramaged thing to do.

OK, I'll queue it with a Tested-by from you. Thanks!

Mathieu Desnoyers

unread,

Oct 21, 2010, 7:00:03 AM10/21/10

to

* Mike Galbraith (efa...@gmx.de) wrote:
[...]

Hi Mike,

This per-tty task grouping approach looks very promising. I'll give it a spin
when I find the time. Meanwhile, a little question about locking here: how is
the read lock supposed to protect from p->signal (and p->signal->tty)
modifications ? What's the locking scheme here ? So maybe just simple
rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
required. In all cases, I feel something is missing there.

Thanks!

Mathieu

--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

Peter Zijlstra

unread,

Oct 21, 2010, 7:30:03 AM10/21/10

to

> Meanwhile, a little question about locking here: how is

> the read lock supposed to protect from p->signal (and p->signal->tty)
> modifications ? What's the locking scheme here ? So maybe just simple
> rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> required. In all cases, I feel something is missing there.

Oleg, could you comment?

Mike Galbraith

unread,

Oct 21, 2010, 7:30:03 AM10/21/10

to

On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:

My assumption is that no additional locking is needed. The tty is
refcounted, dropped in release_task()->__exit_signal(), at which point
the task is unhashed, is history. The tty can't go away until the last
task referencing it goes away.

-Mike

Mathieu Desnoyers

unread,

Oct 21, 2010, 12:30:01 PM10/21/10

to

* Markus Trippelsdorf (mar...@trippelsdorf.de) wrote:

> OK here you go. Both are attached.

In the backtrace, the scheduler is called from:

do_group_exit()
__dequeue_signal()
do_exit()

given that task_group() is called from many spots in the scheduler, I wonder if
some checks making sure that tg != NULL in task_group() would be appropriate ?
Also checking that p->signal is non-NULL in autogroup_attach_tty() might help.

Thanks,

Mathieu

--
Mathieu Desnoyers
Operating System Efficiency R&D Consultant
EfficiOS Inc.
http://www.efficios.com

Oleg Nesterov

unread,

Oct 21, 2010, 1:40:01 PM10/21/10

to

On 10/21, Peter Zijlstra wrote:
>
> On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> > * Mike Galbraith (efa...@gmx.de) wrote:
> > [...]
> > > +static void
> > > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > > +{
> > > + struct tty_struct *tty = p->signal->tty;
> > > +
> > > + if (!tty)
> > > + return;
> > > +
> > > + *tg = p->signal->tty->tg;
> > > +}

minor nit, I think in theory this needs barrier(), or

struct tty_struct *tty = ACCESS_ONCE(p->signal->tty);

if (tty)
*tg = tty->tg;

> > > +static inline void
> > > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > > +{
> > > + if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > > + p->sched_class != &fair_sched_class)
> > > + return;
> > > +
> > > + rcu_read_lock();
> > > +
> > > + autogroup_attach_tty(p, tg);
> > > +
> > > + rcu_read_unlock();
> > > +}
> > > +
>
> > Meanwhile, a little question about locking here: how is
> > the read lock supposed to protect from p->signal (and p->signal->tty)
> > modifications ? What's the locking scheme here ? So maybe just simple
> > rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> > required. In all cases, I feel something is missing there.
>
> Oleg, could you comment?

No, I don't understand this ;) But I know nothig about task groups,
most probably this is OK.

It is not clear to me why do we need rcu_read_lock() and how it can help.
The tty can go away right after dereferencing signal->tty.

Even if the task doesn't exit, it (or its sub-thread) can do sys_setsid()
at any moment and free this tty. If any thread was moved to tty->sg, doesn't
this mean that, say, ->cfs_rq will point to the already freed tg->cfs_rq?

From http://marc.info/?l=linux-kernel&m=128764874422614

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct task_struct *p, *t;
+ struct task_group *tg;
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ for_each_process(p) {

Hmm. This needs rcu lock at least?

+ tg = task_group(p);

Why?

+ sched_move_task(p);
+ list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+ sched_move_task(t);
+ }
+ }

Looks like, you can just do

do_each_thread(p, t) {
sched_move_task(t);
} while_each_thread(p, t);

With the same effect.

Oleg.

Mike Galbraith

unread,

Oct 21, 2010, 3:20:02 PM10/21/10

to

On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:
> On 10/21, Peter Zijlstra wrote:
> >
> > On Thu, 2010-10-21 at 06:51 -0400, Mathieu Desnoyers wrote:
> > > * Mike Galbraith (efa...@gmx.de) wrote:
> > > [...]
> > > > +static void
> > > > +autogroup_attach_tty(struct task_struct *p, struct task_group **tg)
> > > > +{
> > > > + struct tty_struct *tty = p->signal->tty;
> > > > +
> > > > + if (!tty)
> > > > + return;
> > > > +
> > > > + *tg = p->signal->tty->tg;
> > > > +}
>
> minor nit, I think in theory this needs barrier(), or
>
> struct tty_struct *tty = ACCESS_ONCE(p->signal->tty);
>
> if (tty)
> *tg = tty->tg;

Thanks.

> > > > +static inline void
> > > > +autogroup_check_attach(struct task_struct *p, struct task_group **tg)
> > > > +{
> > > > + if (!sysctl_sched_autogroup_enabled || *tg != &root_task_group ||
> > > > + p->sched_class != &fair_sched_class)
> > > > + return;
> > > > +
> > > > + rcu_read_lock();
> > > > +
> > > > + autogroup_attach_tty(p, tg);
> > > > +
> > > > + rcu_read_unlock();
> > > > +}
> > > > +
> >
> > > Meanwhile, a little question about locking here: how is
> > > the read lock supposed to protect from p->signal (and p->signal->tty)
> > > modifications ? What's the locking scheme here ? So maybe just simple
> > > rcu_dereference are missing, or maybe the tsk->sighand->siglock might be
> > > required. In all cases, I feel something is missing there.
> >
> > Oleg, could you comment?
>
> No, I don't understand this ;) But I know nothig about task groups,
> most probably this is OK.
>
> It is not clear to me why do we need rcu_read_lock() and how it can help.
> The tty can go away right after dereferencing signal->tty.

It was inherited.

> Even if the task doesn't exit, it (or its sub-thread) can do sys_setsid()
> at any moment and free this tty. If any thread was moved to tty->sg, doesn't
> this mean that, say, ->cfs_rq will point to the already freed tg->cfs_rq?

Ah, so isn't as safe as it looked. Thanks!

> >From http://marc.info/?l=linux-kernel&m=128764874422614
>
> +int sched_autogroup_handler(struct ctl_table *table, int write,
> + void __user *buffer, size_t *lenp, loff_t *ppos)
> +{
> + struct task_struct *p, *t;
> + struct task_group *tg;
> + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
> +
> + if (ret || !write)
> + return ret;
> +
> + for_each_process(p) {
>
> Hmm. This needs rcu lock at least?

(used to be paranoid locking there.. vs required locking)

> + tg = task_group(p);
>
> Why?

A cleanup leftover.

>
> + sched_move_task(p);
> + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> + sched_move_task(t);
> + }
> + }
>
> Looks like, you can just do
>
> do_each_thread(p, t) {
> sched_move_task(t);
> } while_each_thread(p, t);
>
> With the same effect.

Yeah.

So in theory, the tty can go away on me. I knew this was too easy.

-Mike

Marco

unread,

Oct 21, 2010, 6:20:01 PM10/21/10

to

I've made some tests to give a try to tty grouping patches.

Kernel 2.6.36+CONFIG_SCHED_AUTOGROUP+Improve_tick_preemption.patch

These are the result of :
taskset -c 0 ./wakeup-latency& sleep 30;killall wakeup-latency (owner marco)
vs
taskset -c 0 make -j 10 bzImage

Vanilla 2.6.36
maximum latency: 52879.7 µs
average latency: 4217.5 µs
missed timer events: 0

Kernel 2.6.36+autogroup enabled
maximum latency: 2840.8 µs
average latency: 15.5 µs
missed timer events: 0

Kernel 2.6.36+libpam-systemd enabled
maximum latency: 26361.6 µs
average latency: 4548.6 µs
missed timer events: 0

Kernel 2.6.36+START-NICE sched feature(v3)
maximum latency: 76182.8 µs
average latency: 4930.1 µs
missed timer events: 0

Quite impressive.....
Now, i can play supertuxkart while make'ing -j 10 and stellarium shows
the highest fps i've seen so far.
Very good job, guys.

Marco

Mike Galbraith

unread,

Oct 26, 2010, 3:10:01 AM10/26/10

to

On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:

> It is not clear to me why do we need rcu_read_lock() and how it can help.
> The tty can go away right after dereferencing signal->tty.

Which was Marcus' crash. Didn't happen here only because I didn't have
CONFIG_PREEMPT set.

Changes since v2:
- drop

Mike Galbraith

unread,

Oct 26, 2010, 3:30:02 AM10/26/10

to

On Tue, 2010-10-26 at 09:07 +0200, Mike Galbraith wrote:
> On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:
>
> > It is not clear to me why do we need rcu_read_lock() and how it can help.
> > The tty can go away right after dereferencing signal->tty.
>
> Which was Marcus' crash. Didn't happen here only because I didn't have
> CONFIG_PREEMPT set.
>
> Changes since v2:
> - drop

Bumped mouse, message escaped.

Doesn't matter though, damn thing just blew up during enable/disable
plus hackbench stress test, despite holding a reference to the tty at
every place tty changes (under sighand lock), and moving the task with
that reference held.

CONFIG_PREEMPT is being a little S.O.B.

-Mike

Linus Torvalds

unread,

Oct 26, 2010, 11:50:02 AM10/26/10

to

On Tue, Oct 26, 2010 at 12:29 AM, Mike Galbraith <mgalb...@suse.de> wrote:
> On Tue, 2010-10-26 at 09:07 +0200, Mike Galbraith wrote:
>> On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:
>>
>> > It is not clear to me why do we need rcu_read_lock() and how it can help.
>> > The tty can go away right after dereferencing signal->tty.
>>
>> Which was Marcus' crash. �Didn't happen here only because I didn't have
>> CONFIG_PREEMPT set.
>>
>> Changes since v2:
>> � - drop
>
> Bumped mouse, message escaped.
>
> Doesn't matter though, damn thing just blew up during enable/disable
> plus hackbench stress test, despite holding a reference to the tty at
> every place tty changes (under sighand lock), and moving the task with
> that reference held.

So I have a suggestion that may not be popular with you, because it
does end up changing the approach of your patch a lot.

And I have to say, I like how your last patch looked. It was
surprisingly small, simple, and clean. So I hate saying "I think it
should perhaps do things a bit differently". That said, I would
suggest:

- don't depend on "tsk->signal->tty" at all.

- INSTEAD, introduce a "tsk->signal->sched_group" pointer that points
to whatever the current auto-task_group is. Remember, long-term, we'd
want to maybe have other heuristics than just the tty groups, so we'd
want this separate from the tty logic _anyway_

- at fork time, just copy the task_group pointer in copy_signal() if
it is non-NULL, and increment the refcount (I don't think struct
task_group is refcounted now, but this would require it).

- at free_signal_struct(), just do a
"put_task_group(sig->task_group);" before freeing it.

- make the scheduler use the "tsk->signal->sched_group" as the
default group if nothing else exists.

Now, all the basic logic is _entirely_ unaware of any tty logic, and
it's generic. And none of it has any races with some odd tty release
logic or anything like that.

Now, after this, the only thing you'd need to do is hook into
__proc_set_tty(), which already holds the sighand lock, and _there_
you would attach the task_group to the process. Notice how it would
never be attached to a tty at all, so tty_release etc would never be
involved in any taskgroup thing - it's not really the tty that owns
the taskgroup, it's simply the act of becoming a tty task group leader
that attaches the task to a new scheduling group.

It also means, for example, that if a process loses its tty (and
doesn't get a new one - think hangup), it still remains in whatever
scheduling group it started out with. The tty really is immaterial.

And the nice thing about this is that it should be trivial to make
other things than tty's trigger this same thing, if we find a pattern
(or create some new interface to let people ask for it) for something
that should create a new group (like perhaps spawning a graphical
application from the window manager rather than from a tty).

Comments?

Linus

Mike Galbraith

unread,

Oct 26, 2010, 10:00:01 PM10/26/10

to

On Tue, 2010-10-26 at 08:47 -0700, Linus Torvalds wrote:
> On Tue, Oct 26, 2010 at 12:29 AM, Mike Galbraith <mgalb...@suse.de> wrote:
> > On Tue, 2010-10-26 at 09:07 +0200, Mike Galbraith wrote:
> >> On Thu, 2010-10-21 at 18:29 +0200, Oleg Nesterov wrote:
> >>
> >> > It is not clear to me why do we need rcu_read_lock() and how it can help.
> >> > The tty can go away right after dereferencing signal->tty.
> >>
> >> Which was Marcus' crash. Didn't happen here only because I didn't have
> >> CONFIG_PREEMPT set.
> >>
> >> Changes since v2:
> >> - drop
> >
> > Bumped mouse, message escaped.
> >
> > Doesn't matter though, damn thing just blew up during enable/disable
> > plus hackbench stress test, despite holding a reference to the tty at
> > every place tty changes (under sighand lock), and moving the task with
> > that reference held.
>
> So I have a suggestion that may not be popular with you, because it
> does end up changing the approach of your patch a lot.

Suggestions highly welcome. The raciness is driving me nuts. I can't
afford additional locking, and barriers ain't working.

Much more tasteful than what I was about to do as a last resort funky
race killer, namely make my on/off switch a machine wide atomic bomb :)

Thanks!

-Mike

Mike Galbraith

unread,

Nov 11, 2010, 10:30:01 AM11/11/10

to

Greetings from sunny Arizona!

On Tue, 2010-10-26 at 08:47 -0700, Linus Torvalds wrote:

I _finally_ got back to this yesterday, and implemented your suggestion,
though with a couple minor variations. Putting the autogroup pointer in
the signal struct didn't look right to me, so I plugged it into the task
struct instead. I also didn't refcount taskgroups, wanted the patchlet
to be as self-contained as possible, so refcounted the autogroup struct
instead. I also left group movement on tty disassociation in place, but
may nuke it.

The below has withstood an all night thrashing in my laptop with a
PREEMPT_RT kernel, and looks kinda presentable to me, so...

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity. This patch implements an idea from Linus,
to automatically create task groups. This patch only implements Linus' per
tty task group suggestion, and only for fair class tasks, but leaves the way
open for enhancement.

Implementation: each task struct contains an inherited pointer to a refcounted
autogroup struct containing a task group pointer, the default for all tasks
pointing to the init_task_group. When a task calls __proc_set_tty(), the
task's reference to the default group is dropped, a new task group is created,
and the task is moved out of the old group and into the new. Children thereafter
inherit this task group, and increase it's refcount. Calls to __tty_hangup()
and proc_clear_tty() move the caller back to the init_task_group, and possibly
destroy the task group. On exit, reference to the current task group is dropped,
and the task group is potentially destroyed. At runqueue selection time, iff
a task has no cgroup assignment, it's current autogroup is used.

Some numbers.

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h

@@ -1159,6 +1159,7 @@ struct sched_rt_entity {
};

struct rcu_node;
+struct autogroup;

struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
@@ -1181,6 +1182,10 @@ struct task_struct {
struct sched_entity se;
struct sched_rt_entity rt;

+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
@@ -1900,6 +1905,21 @@ int sched_rt_handler(struct ctl_table *t

extern unsigned int sysctl_sched_compat_yield;

+#ifdef CONFIG_SCHED_AUTOGROUP

+extern unsigned int sysctl_sched_autogroup_enabled;
+

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+

+extern void sched_autogroup_create_attach(struct task_struct *p);
+extern void sched_autogroup_detatch(struct task_struct *p);
+extern void sched_autogroup_exit(struct task_struct *p);
+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }
+static inline void sched_autogroup_detatch(struct task_struct *p) { }
+static inline void sched_autogroup_exit(struct task_struct *p) { }

+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);

Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c
@@ -78,6 +78,7 @@

#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -612,11 +613,16 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+

+ autogroup_task_group(p, &tg);

+
+ return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1920,6 +1926,7 @@ static void deactivate_task(struct rq *r
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif

@@ -2569,6 +2576,7 @@ void sched_fork(struct task_struct *p, i
* Silence PROVE_RCU.
*/
rcu_read_lock();
+ autogroup_fork(p);
set_task_cpu(p, cpu);
rcu_read_unlock();

@@ -7749,7 +7757,7 @@ void __init sched_init(void)
#ifdef CONFIG_CGROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
-
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */

#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
@@ -8279,15 +8287,11 @@ void sched_destroy_group(struct task_gro
/* change task's runqueue when it moves between groups.
* The caller of this function should have put the task in its new group
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
+ * reflect its new group. Called with the runqueue lock held.
*/
-void sched_move_task(struct task_struct *tsk)
+void __sched_move_task(struct task_struct *tsk, struct rq *rq)
{
int on_rq, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);

running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
@@ -8308,7 +8312,15 @@ void sched_move_task(struct task_struct
tsk->sched_class->set_curr_task(rq);
if (on_rq)
enqueue_task(rq, tsk, 0);
+}
+
+void sched_move_task(struct task_struct *tsk)
+{
+ struct rq *rq;
+ unsigned long flags;

+ rq = task_rq_lock(tsk, &flags);
+ __sched_move_task(tsk, rq);
task_rq_unlock(rq, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */

Index: linux-2.6.36.git/drivers/char/tty_io.c
===================================================================
--- linux-2.6.36.git.orig/drivers/char/tty_io.c
+++ linux-2.6.36.git/drivers/char/tty_io.c

@@ -580,6 +580,7 @@ void __tty_hangup(struct tty_struct *tty
spin_lock_irq(&p->sighand->siglock);
if (p->signal->tty == tty) {
p->signal->tty = NULL;
+ sched_autogroup_detatch(p);
/* We defer the dereferences outside fo
the tasklist lock */
refs++;
@@ -3070,6 +3071,7 @@ void proc_clear_tty(struct task_struct *
spin_lock_irqsave(&p->sighand->siglock, flags);
tty = p->signal->tty;
p->signal->tty = NULL;
+ sched_autogroup_detatch(p);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
tty_kref_put(tty);
}
@@ -3089,12 +3091,14 @@ static void __proc_set_tty(struct task_s
tty->session = get_pid(task_session(tsk));
if (tsk->signal->tty) {
printk(KERN_DEBUG "tty not NULL!!\n");
+ sched_autogroup_detatch(tsk);
tty_kref_put(tsk->signal->tty);
}
}
put_pid(tsk->signal->tty_old_pgrp);
tsk->signal->tty = tty_kref_get(tty);
tsk->signal->tty_old_pgrp = NULL;
+ sched_autogroup_create_attach(tsk);
}

static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty)
Index: linux-2.6.36.git/kernel/exit.c
===================================================================
--- linux-2.6.36.git.orig/kernel/exit.c
+++ linux-2.6.36.git/kernel/exit.c
@@ -174,6 +174,7 @@ repeat:
write_lock_irq(&tasklist_lock);
tracehook_finish_release_task(p);
__exit_signal(p);
+ sched_autogroup_exit(p);

/*
* If we are the last non-leader member of the thread

Index: linux-2.6.36.git/kernel/sched_autogroup.h
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.h

@@ -0,0 +1,10 @@

+#ifdef CONFIG_SCHED_AUTOGROUP
+static inline void

+autogroup_task_group(struct task_struct *p, struct task_group **tg);
+static void __sched_move_task(struct task_struct *tsk, struct rq *rq);
+#else /* !CONFIG_SCHED_AUTOGROUP */
+static inline void autogroup_init(struct task_struct *init_task) { }
+static inline void autogroup_fork(struct task_struct *p) { }
+static inline void
+autogroup_task_group(struct task_struct *p, struct task_group **tg) { }
+#endif /* CONFIG_SCHED_AUTOGROUP */

Index: linux-2.6.36.git/kernel/sched_autogroup.c
===================================================================
--- /dev/null
+++ linux-2.6.36.git/kernel/sched_autogroup.c

@@ -0,0 +1,161 @@
+#ifdef CONFIG_SCHED_AUTOGROUP

+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+

+struct autogroup {
+ struct kref kref;
+ struct task_group *tg;
+};
+
+static struct autogroup autogroup_default;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &init_task_group;
+ kref_init(&autogroup_default.kref);
+ init_task->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+ sched_destroy_group(ag->tg);
+ kfree(ag);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+ kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+ kref_get(&ag->kref);
+ return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+ struct autogroup *ag = kmalloc(sizeof(*ag), GFP_KERNEL);
+
+ if (!ag)
+ goto out_fail;
+
+ ag->tg = sched_create_group(&init_task_group);
+ kref_init(&ag->kref);
+
+ if (!(IS_ERR(ag->tg)))
+ return ag;
+
+out_fail:
+ if (ag) {
+ kfree(ag);
+ WARN_ON(1);
+ } else
+ WARN_ON(1);
+
+ return autogroup_kref_get(&autogroup_default);
+}
+
+static void autogroup_fork(struct task_struct *p)
+{
+ p->autogroup = autogroup_kref_get(current->autogroup);

+}
+
+static inline void

+autogroup_task_group(struct task_struct *p, struct task_group **tg)
+{
+ int enabled = sysctl_sched_autogroup_enabled;
+
+ enabled &= (*tg == &root_task_group);
+ enabled &= (p->sched_class == &fair_sched_class);
+ enabled &= (!(p->flags & PF_EXITING));
+
+ if (enabled)
+ *tg = p->autogroup->tg;
+}
+
+static void
+autogroup_move_task(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct rq *rq;
+ unsigned long flags;
+
+ rq = task_rq_lock(p, &flags);
+ prev = p->autogroup;
+ if (prev == ag) {
+ task_rq_unlock(rq, &flags);
+ return;
+ }
+
+ p->autogroup = autogroup_kref_get(ag);
+ __sched_move_task(p, rq);
+ task_rq_unlock(rq, &flags);
+
+ autogroup_kref_put(prev);
+}
+
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ autogroup_move_task(p, autogroup_create());
+
+ /*
+ * Correct freshly allocated group's refcount.
+ * Move takes a reference on destination, but
+ * create already initialized refcount to 1.
+ */
+ if (p->autogroup != &autogroup_default)
+ autogroup_kref_put(p->autogroup);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+void sched_autogroup_detatch(struct task_struct *p)
+{
+ autogroup_move_task(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detatch);
+
+void sched_autogroup_exit(struct task_struct *p)
+{
+ autogroup_kref_put(p->autogroup);
+}
+

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct task_struct *p, *t;

+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+

+ /*
+ * Exclude cgroup, task group and task create/destroy
+ * during global classification.
+ */
+ cgroup_lock();
+ spin_lock(&task_group_lock);
+ read_lock(&tasklist_lock);
+
+ do_each_thread(p, t) {
+ sched_move_task(t);
+ } while_each_thread(p, t);
+
+ read_unlock(&tasklist_lock);
+ spin_unlock(&task_group_lock);
+ cgroup_unlock();

+
+ return 0;
+}
+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+#endif

@@ -652,6 +652,18 @@ config DEBUG_BLK_CGROUP

endif # CGROUPS

+config SCHED_AUTOGROUP
+ bool "Automatic process group scheduling"
+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED

+ help
+ This option optimizes the scheduler for common desktop workloads by
+ automatically creating and populating task groups. This separation
+ of workloads isolates aggressive CPU burners (like build jobs) from
+ desktop applications. Task group autogeneration is currently based
+ upon task tty association.
+
config MM_OWNER
bool

Index: linux-2.6.36.git/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.36.git.orig/Documentation/kernel-parameters.txt
+++ linux-2.6.36.git/Documentation/kernel-parameters.txt
@@ -1610,6 +1610,8 @@ and is between 256 and 4096 characters.
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.

+ noautogroup Disable scheduler automatic task group creation.
+
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.

Ingo Molnar

unread,

Nov 11, 2010, 1:10:02 PM11/11/10

to

* Mike Galbraith <efa...@gmx.de> wrote:

> I _finally_ got back to this yesterday, and implemented your suggestion, though
> with a couple minor variations. Putting the autogroup pointer in the signal
> struct didn't look right to me, so I plugged it into the task struct instead. I
> also didn't refcount taskgroups, wanted the patchlet to be as self-contained as
> possible, so refcounted the autogroup struct instead. I also left group movement
> on tty disassociation in place, but may nuke it.
>
> The below has withstood an all night thrashing in my laptop with a PREEMPT_RT
> kernel, and looks kinda presentable to me, so...

The patch and the diffstat gives me warm fuzzy feelings:

Very well contained, minimally invasive to anything else!

( Noticed only one very small detail: sched_autogroup.h has an illness of lack of
newlines which makes it a bit hard to read - but this is cured easily. )

I'll test and apply this patch to the scheduler tree, so if anyone has objections
please holler now :-)

Linus, does this look OK to you too, can i add your Acked-by?

Thanks,

Ingo

Linus Torvalds

unread,

Nov 11, 2010, 1:50:02 PM11/11/10

to

On Thu, Nov 11, 2010 at 7:26 AM, Mike Galbraith <efa...@gmx.de> wrote:
>
> I _finally_ got back to this yesterday, and implemented your suggestion,
> though with a couple minor variations. Putting the autogroup pointer in
> the signal struct didn't look right to me, so I plugged it into the task
> struct instead. I also didn't refcount taskgroups, wanted the patchlet
> to be as self-contained as possible, so refcounted the autogroup struct
> instead. I also left group movement on tty disassociation in place, but
> may nuke it.

Ok, the patch looks fine, but I do have a few comments:

- the reason I suggested the signal struct was really that I thought
it would avoid extra (unnecessary) cost in thread creation/teardown.

Maybe I should have made that clear, but this seems to
unnecessarily do the whole atomic_inc/dec for each thread. That seems
a bit sad.

That said, if not having to dereference ->signal simplifies the
scheduler interaction, I guess the extra atomic ref at thread
creation/deletion is fine. So I don't think this is wrong, it's just
something I wanted to bring up.

- You misspelled "detach". That just drives me wild. Please fix.

- What I _do_ think is wrong is how I think you're trying to be "too
precise". I think that's fundamentally wrong, because I think we
should make it very clear that it's a heuristic. So I dislike seeing
these functions: sched_autogroup_handler() - we shouldn't care about
old state, sched_autogroup_detach() - even with the fixed spelling I
don't really see why a tty hangup should cause the process to go back
to the default group, for example.

IOW, I think you tried a bit _too_ hard to make it a 1:1 relationship
with the tty. I don't think it needs to be. Just because a process
loses its tty because of a hangup, I don't think that that should have
any real implications for the auto-group scheduling. Or maybe it
should, but that decision should be based on "does it help scheduling
behavior" rather than on "it always matches the tty". See what I'm
saying?

That said, I do love how the patch looks. I think this is absolutely
the right thing to do. My issues are small details. I'd Ack it even in
this form (well, as long as spelling is fixed, that really does rub me
the wrong way), and the other things are more details that are about
how I'm thinking about it rather than "you need to do it this way".

Linus

Mike Galbraith

unread,

Nov 11, 2010, 2:10:02 PM11/11/10

to

On Thu, 2010-11-11 at 10:34 -0800, Linus Torvalds wrote:
> On Thu, Nov 11, 2010 at 7:26 AM, Mike Galbraith <efa...@gmx.de> wrote:
> >
> > I _finally_ got back to this yesterday, and implemented your suggestion,
> > though with a couple minor variations. Putting the autogroup pointer in
> > the signal struct didn't look right to me, so I plugged it into the task
> > struct instead. I also didn't refcount taskgroups, wanted the patchlet
> > to be as self-contained as possible, so refcounted the autogroup struct
> > instead. I also left group movement on tty disassociation in place, but
> > may nuke it.
>
> Ok, the patch looks fine, but I do have a few comments:
>
> - the reason I suggested the signal struct was really that I thought
> it would avoid extra (unnecessary) cost in thread creation/teardown.
>
> Maybe I should have made that clear, but this seems to
> unnecessarily do the whole atomic_inc/dec for each thread. That seems
> a bit sad.
>
> That said, if not having to dereference ->signal simplifies the
> scheduler interaction, I guess the extra atomic ref at thread
> creation/deletion is fine. So I don't think this is wrong, it's just
> something I wanted to bring up.

Ah, ok. Anything that cuts overhead is worth doing.

> - You misspelled "detach". That just drives me wild. Please fix.

(well, _somebody_ has to keep the speeling police occupied;)

> - What I _do_ think is wrong is how I think you're trying to be "too
> precise". I think that's fundamentally wrong, because I think we
> should make it very clear that it's a heuristic. So I dislike seeing
> these functions: sched_autogroup_handler() - we shouldn't care about
> old state, sched_autogroup_detach() - even with the fixed spelling I
> don't really see why a tty hangup should cause the process to go back
> to the default group, for example.
>
> IOW, I think you tried a bit _too_ hard to make it a 1:1 relationship
> with the tty. I don't think it needs to be. Just because a process
> loses its tty because of a hangup, I don't think that that should have
> any real implications for the auto-group scheduling. Or maybe it
> should, but that decision should be based on "does it help scheduling
> behavior" rather than on "it always matches the tty". See what I'm
> saying?

Yeah, and it doesn't in the common case at least. The handler's
classifier was because a 100% pinned hog would never obey the user's
wishes, but I can whack it along with the hangup. Less is more in the
scheduler.

Thanks for the comments.

-Mike

Markus Trippelsdorf

unread,

Nov 11, 2010, 2:20:03 PM11/11/10

to

On 2010.11.11 at 08:26 -0700, Mike Galbraith wrote:
> I _finally_ got back to this yesterday, and implemented your suggestion,
> though with a couple minor variations. Putting the autogroup pointer in
> the signal struct didn't look right to me, so I plugged it into the task
> struct instead. I also didn't refcount taskgroups, wanted the patchlet
> to be as self-contained as possible, so refcounted the autogroup struct
> instead. I also left group movement on tty disassociation in place, but
> may nuke it.

...

>
> With taskset -c 3 make -j 10 running..
>
> taskset -c 3 ./wakeup-latency& sleep 30;killall wakeup-latency
>
> without:

> maximum latency: 42963.2 盜
> average latency: 9077.0 盜

> missed timer events: 0
>
> with:

> maximum latency: 4160.7 盜
> average latency: 149.4 盜
> missed timer events: 0

Just to add some data; here are the results from my machine (AMD 4
cores) running a -j4 kernel build, while I browsed the web:

1) perf sched record sleep 30

without:
total_wakeups: 44306
avg_wakeup_latency (ns): 36784
min_wakeup_latency (ns): 0
max_wakeup_latency (ns): 9378852

with:
total_wakeups: 43836
avg_wakeup_latency (ns): 67607
min_wakeup_latency (ns): 0
max_wakeup_latency (ns): 8983036

2) perf record -a -e sched:sched_switch -e sched:sched_wakeup sleep 10

without:
total_wakeups: 13195
avg_wakeup_latency (ns): 48484
min_wakeup_latency (ns): 0
max_wakeup_latency (ns): 8722497

with:
total_wakeups: 14106
avg_wakeup_latency (ns): 92532
min_wakeup_latency (ns): 20
max_wakeup_latency (ns): 5642393

So the avg_wakeup_latency nearly doubled with your patch, while the
max_wakeup_latency is lowered by a good amount.

--
Markus

Linus Torvalds

unread,

Nov 11, 2010, 2:40:02 PM11/11/10

to

On Thu, Nov 11, 2010 at 11:08 AM, Mike Galbraith <efa...@gmx.de> wrote:
>
>> �- the reason I suggested the signal struct was really that I thought

>> it would avoid extra (unnecessary) cost in thread creation/teardown.
>>
>> � �Maybe I should have made that clear, but this seems to
>> unnecessarily do the whole atomic_inc/dec for each thread. That seems
>> a bit sad.
>>
>> � �That said, if not having to dereference ->signal simplifies the
>> scheduler interaction, I guess the extra atomic ref at thread
>> creation/deletion is fine. So I don't think this is wrong, it's just
>> something I wanted to bring up.
>
> Ah, ok. �Anything that cuts overhead is worth doing.

Well, it cuts both ways. Maybe your approach is simpler and avoids
overhead at scheduling time. And "tsk->signal" may not be reliable due
to races with exit etc, so it may well be that going through the
signal struct could end up being a source of nasty races. I didn't
look whether the scheduler already derefenced ->signal for some other
reason, for example.

So your patch may well have done the exact right thing.

Linus

Mike Galbraith

unread,

Nov 11, 2010, 2:40:03 PM11/11/10

to

When you say with/without, does that mean enabled/disabled, or
patched/virgin and/or cgroups/nocgroups?

-Mike

Markus Trippelsdorf

unread,

Nov 11, 2010, 2:40:03 PM11/11/10

to

Patched/virgin and nocgroups
--
Markus

Mike Galbraith

unread,

Nov 11, 2010, 3:00:02 PM11/11/10

to

On Thu, 2010-11-11 at 20:38 +0100, Markus Trippelsdorf wrote:
> On 2010.11.11 at 12:35 -0700, Mike Galbraith wrote:

> > When you say with/without, does that mean enabled/disabled, or
> > patched/virgin and/or cgroups/nocgroups?
>
> Patched/virgin and nocgroups

Figures. As you can see, group scheduling is not wonderful for extreme
switchers. Fortunately, most apps do a bit of work in between.

-Mike

Oleg Nesterov

unread,

Nov 11, 2010, 3:40:02 PM11/11/10

to

I didn't read this patch carefully (yet) but,

On 11/11, Mike Galbraith wrote:
>
> @@ -2569,6 +2576,7 @@ void sched_fork(struct task_struct *p, i
> * Silence PROVE_RCU.
> */
> rcu_read_lock();
> + autogroup_fork(p);

Surely this doesn't need rcu.

But the real problem is that copy_process() can fail after that,
and in this case we have the unbalanced kref_get().

> +++ linux-2.6.36.git/kernel/exit.c
> @@ -174,6 +174,7 @@ repeat:
> write_lock_irq(&tasklist_lock);
> tracehook_finish_release_task(p);
> __exit_signal(p);
> + sched_autogroup_exit(p);

This doesn't look right. Note that "p" can run/sleep after that
(or in parallel), set_task_rq() can use the freed ->autogroup.

Btw, I can't apply this patch...

Oleg.

Oleg Nesterov

unread,

Nov 11, 2010, 3:40:02 PM11/11/10

to

On 11/11, Linus Torvalds wrote:
>
> Well, it cuts both ways. Maybe your approach is simpler and avoids
> overhead at scheduling time. And "tsk->signal" may not be reliable due
> to races with exit etc, so it may well be that going through the
> signal struct could end up being a source of nasty races. I didn't
> look whether the scheduler already derefenced ->signal for some other
> reason, for example.

Just in case, starting from 2.6.35 tsk->signal is reliable.

Oleg.

Mike Galbraith

unread,

Nov 11, 2010, 5:30:02 PM11/11/10

to

On Thu, 2010-11-11 at 21:27 +0100, Oleg Nesterov wrote:
> I didn't read this patch carefully (yet) but,
>
> On 11/11, Mike Galbraith wrote:
> >
> > @@ -2569,6 +2576,7 @@ void sched_fork(struct task_struct *p, i
> > * Silence PROVE_RCU.
> > */
> > rcu_read_lock();
> > + autogroup_fork(p);
>
> Surely this doesn't need rcu.

No, it was just a convenient spot.

> But the real problem is that copy_process() can fail after that,
> and in this case we have the unbalanced kref_get().

Memory leak, will fix.

> > +++ linux-2.6.36.git/kernel/exit.c
> > @@ -174,6 +174,7 @@ repeat:
> > write_lock_irq(&tasklist_lock);
> > tracehook_finish_release_task(p);
> > __exit_signal(p);
> > + sched_autogroup_exit(p);
>
> This doesn't look right. Note that "p" can run/sleep after that
> (or in parallel), set_task_rq() can use the freed ->autogroup.

So avoiding refcounting rcu released task_group backfired. Crud.

> Btw, I can't apply this patch...

It depends on the patch below from Peter, or manual fixup.

Subject: sched, cgroup: Fixup broken cgroup movement
From: Peter Zijlstra <a.p.zi...@chello.nl>
Date: Fri Oct 15 15:24:15 CEST 2010

Reported-by: Dima Zavin <di...@android.com>
Signed-off-by: Peter Zijlstra <a.p.zi...@chello.nl>
---
include/linux/sched.h | 2 +-
kernel/sched.c | 8 ++++----
kernel/sched_fair.c | 25 +++++++++++++++++++------
3 files changed, 24 insertions(+), 11 deletions(-)

Index: linux-2.6.36.git/kernel/sched.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched.c
+++ linux-2.6.36.git/kernel/sched.c

@@ -8297,12 +8297,12 @@ void sched_move_task(struct task_struct
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);

- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));

if (unlikely(running))
tsk->sched_class->set_curr_task(rq);

Index: linux-2.6.36.git/include/linux/sched.h
===================================================================
--- linux-2.6.36.git.orig/include/linux/sched.h
+++ linux-2.6.36.git/include/linux/sched.h

@@ -1072,7 +1072,7 @@ struct sched_class {
struct task_struct *task);

#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*moved_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};

Index: linux-2.6.36.git/kernel/sched_fair.c
===================================================================
--- linux-2.6.36.git.orig/kernel/sched_fair.c
+++ linux-2.6.36.git/kernel/sched_fair.c
@@ -3824,13 +3824,26 @@ static void set_curr_task_fair(struct rq
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif

@@ -3882,7 +3895,7 @@ static const struct sched_class fair_sch
.get_rr_interval = get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};

Oleg Nesterov

unread,

Nov 12, 2010, 1:30:03 PM11/12/10

to

On 11/11, Mike Galbraith wrote:
>
> On Thu, 2010-11-11 at 21:27 +0100, Oleg Nesterov wrote:
>
> > But the real problem is that copy_process() can fail after that,
> > and in this case we have the unbalanced kref_get().
>
> Memory leak, will fix.
>
> > > +++ linux-2.6.36.git/kernel/exit.c
> > > @@ -174,6 +174,7 @@ repeat:
> > > write_lock_irq(&tasklist_lock);
> > > tracehook_finish_release_task(p);
> > > __exit_signal(p);
> > > + sched_autogroup_exit(p);
> >
> > This doesn't look right. Note that "p" can run/sleep after that
> > (or in parallel), set_task_rq() can use the freed ->autogroup.
>
> So avoiding refcounting rcu released task_group backfired. Crud.

Just in case, the lock order may be wrong. sched_autogroup_exit()
takes task_group_lock under write_lock(tasklist), while
sched_autogroup_handler() takes them in reverse order.

I am not sure, but perhaps this can be simpler?
wake_up_new_task() does autogroup_fork(), and do_exit() does
sched_autogroup_exit() before the last schedule. Possible?

> > Btw, I can't apply this patch...
>
> It depends on the patch below from Peter, or manual fixup.

Thanks. It also applies cleanly to 2.6.36.

Very basic question. Currently sched_autogroup_create_attach()
has the only caller, __proc_set_tty(). It is a bit strange that
signal->tty change is process-wide, but sched_autogroup_create_attach()
move the single thread, the caller. What about other threads in
this thread group? The same for proc_clear_tty().

> +void sched_autogroup_create_attach(struct task_struct *p)
> +{
> + autogroup_move_task(p, autogroup_create());
> +
> + /*
> + * Correct freshly allocated group's refcount.
> + * Move takes a reference on destination, but
> + * create already initialized refcount to 1.
> + */
> + if (p->autogroup != &autogroup_default)
> + autogroup_kref_put(p->autogroup);
> +}

OK, but I don't understand "p->autogroup != &autogroup_default"
check. This is true if autogroup_create() succeeds. Otherwise
autogroup_create() does autogroup_kref_get(autogroup_default),
doesn't this mean we need unconditional _put ?

And can't resist, minor cosmetic nit,

> static inline struct task_group *task_group(struct task_struct *p)
> {
> + struct task_group *tg;
> struct cgroup_subsys_state *css;
>
> css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
> lockdep_is_held(&task_rq(p)->lock));
> - return container_of(css, struct task_group, css);
> + tg = container_of(css, struct task_group, css);
> +
> + autogroup_task_group(p, &tg);

Fell free to ignore, but imho

return autogroup_task_group(p, tg);

looks a bit better. Why autogroup_task_group() returns its
result via pointer?

Oleg.

Mike Galbraith

unread,

Nov 13, 2010, 6:50:02 AM11/13/10

to

On Fri, 2010-11-12 at 19:12 +0100, Oleg Nesterov wrote:
> On 11/11, Mike Galbraith wrote:
> >
> > On Thu, 2010-11-11 at 21:27 +0100, Oleg Nesterov wrote:
> >
> > > But the real problem is that copy_process() can fail after that,
> > > and in this case we have the unbalanced kref_get().
> >
> > Memory leak, will fix.
> >
> > > > +++ linux-2.6.36.git/kernel/exit.c
> > > > @@ -174,6 +174,7 @@ repeat:
> > > > write_lock_irq(&tasklist_lock);
> > > > tracehook_finish_release_task(p);
> > > > __exit_signal(p);
> > > > + sched_autogroup_exit(p);
> > >
> > > This doesn't look right. Note that "p" can run/sleep after that
> > > (or in parallel), set_task_rq() can use the freed ->autogroup.
> >
> > So avoiding refcounting rcu released task_group backfired. Crud.
>
> Just in case, the lock order may be wrong. sched_autogroup_exit()
> takes task_group_lock under write_lock(tasklist), while
> sched_autogroup_handler() takes them in reverse order.

Bug self destructs when global classifier goes away.

> I am not sure, but perhaps this can be simpler?
> wake_up_new_task() does autogroup_fork(), and do_exit() does
> sched_autogroup_exit() before the last schedule. Possible?

That's what I was going to do. That said, I couldn't have had the
problem if I'd tied final put directly to life of container, and am
thinking I should do that instead when I go back to p->signal.

> Very basic question. Currently sched_autogroup_create_attach()
> has the only caller, __proc_set_tty(). It is a bit strange that
> signal->tty change is process-wide, but sched_autogroup_create_attach()
> move the single thread, the caller. What about other threads in
> this thread group? The same for proc_clear_tty().

Yeah, I really should (will) move all on the spot, though it doesn't
seem to matter in general practice, forks afterward land in the right
bucket. With per tty or p->signal, migration will pick up stragglers
lazily.. unless they're pinned.

> > +void sched_autogroup_create_attach(struct task_struct *p)
> > +{
> > + autogroup_move_task(p, autogroup_create());
> > +
> > + /*
> > + * Correct freshly allocated group's refcount.
> > + * Move takes a reference on destination, but
> > + * create already initialized refcount to 1.
> > + */
> > + if (p->autogroup != &autogroup_default)
> > + autogroup_kref_put(p->autogroup);
> > +}
>
> OK, but I don't understand "p->autogroup != &autogroup_default"
> check. This is true if autogroup_create() succeeds. Otherwise
> autogroup_create() does autogroup_kref_get(autogroup_default),
> doesn't this mean we need unconditional _put ?

D'oh, target fixation :) Thanks.

> And can't resist, minor cosmetic nit,
>
> > static inline struct task_group *task_group(struct task_struct *p)
> > {
> > + struct task_group *tg;
> > struct cgroup_subsys_state *css;
> >
> > css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
> > lockdep_is_held(&task_rq(p)->lock));
> > - return container_of(css, struct task_group, css);
> > + tg = container_of(css, struct task_group, css);
> > +
> > + autogroup_task_group(p, &tg);
>
> Fell free to ignore, but imho
>
> return autogroup_task_group(p, tg);
>
> looks a bit better. Why autogroup_task_group() returns its
> result via pointer?

No particularly good reason, I'll do the cosmetic change.

Thanks,

-Mike

Mike Galbraith

unread,

Nov 14, 2010, 12:20:02 PM11/14/10

to

On Sat, 2010-11-13 at 04:42 -0700, Mike Galbraith wrote:
> On Fri, 2010-11-12 at 19:12 +0100, Oleg Nesterov wrote:
> > On 11/11, Mike Galbraith wrote:
> > >
> > > On Thu, 2010-11-11 at 21:27 +0100, Oleg Nesterov wrote:
> > >
> > > > But the real problem is that copy_process() can fail after that,
> > > > and in this case we have the unbalanced kref_get().
> > >
> > > Memory leak, will fix.
> > >
> > > > > +++ linux-2.6.36.git/kernel/exit.c
> > > > > @@ -174,6 +174,7 @@ repeat:
> > > > > write_lock_irq(&tasklist_lock);
> > > > > tracehook_finish_release_task(p);
> > > > > __exit_signal(p);
> > > > > + sched_autogroup_exit(p);
> > > >
> > > > This doesn't look right. Note that "p" can run/sleep after that
> > > > (or in parallel), set_task_rq() can use the freed ->autogroup.
> > >
> > > So avoiding refcounting rcu released task_group backfired. Crud.
> >
> > Just in case, the lock order may be wrong. sched_autogroup_exit()
> > takes task_group_lock under write_lock(tasklist), while
> > sched_autogroup_handler() takes them in reverse order.
>
> Bug self destructs when global classifier goes away.

I didn't nuke the handler, but did hide it under a debug option since it
is useful for testing. If the user enables it, and turns autogroup off,
imho off should means off NOW, so I stuck with it as is. I coded up a
lazy (tick time check) move to handle pinned tasks not otherwise being
moved, but that was too much for even my (lack of) taste to handle.

The locking should be fine as it was now, since autogroup_exit() isn't
under the tasklist lock any more. (surprising i didn't hit any problems
with this or use after free in rt kernel given how hard i beat on it)

Pondering adding some debug bits to identify autogroup tasks, maybe
in /proc/N/cgroup or such.

> > I am not sure, but perhaps this can be simpler?
> > wake_up_new_task() does autogroup_fork(), and do_exit() does
> > sched_autogroup_exit() before the last schedule. Possible?
>
> That's what I was going to do. That said, I couldn't have had the
> problem if I'd tied final put directly to life of container, and am
> thinking I should do that instead when I go back to p->signal.

I ended up tying it directly to p->signal's life, and beat on it with
CONFIG_PREEMPT. I wanted to give it a thrashing in PREEMPT_RT, but
when I snagged your signal patches, I apparently didn't snag quite
enough, as the rt kernel with those patches is early boot doorstop.

> > Very basic question. Currently sched_autogroup_create_attach()
> > has the only caller, __proc_set_tty(). It is a bit strange that
> > signal->tty change is process-wide, but sched_autogroup_create_attach()
> > move the single thread, the caller. What about other threads in
> > this thread group? The same for proc_clear_tty().
>

> Yeah, I really should (will) move all on the spot...

Did that, and the rest. This patch will apply to tip or .git.

patchlet:

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity. This patch implements an idea from Linus,
to automatically create task groups. This patch only implements Linus' per
tty task group suggestion, and only for fair class tasks, but leaves the way
open for enhancement.

Implementation: each task's signal struct contains an inherited pointer to a

refcounted autogroup struct containing a task group pointer, the default for
all tasks pointing to the init_task_group. When a task calls __proc_set_tty(),

the process wide reference to the default group is dropped, a new task group is
created, and the process is moved into the new task group. Children thereafter
inherit this task group, and increase it's refcount. On exit, a reference to the
current task group is dropped when the last reference to each signal struct is
dropped. The task group is destroyed when the last signal struct referencing
it is freed. At runqueue selection time, IFF a task has no cgroup assignment,

it's current autogroup is used.

The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP is
selected, but can be disabled via the boot option noautogroup, and can be

also be turned on/off on the fly if CONFIG_SCHED_AUTOGROUP is enabled via..

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -509,6 +509,8 @@ struct thread_group_cputimer {
spinlock_t lock;
};

+struct autogroup;
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -576,6 +578,9 @@ struct signal_struct {

struct tty_struct *tty; /* NULL if no tty */

+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif

/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.
@@ -1931,6 +1936,23 @@ int sched_rt_handler(struct ctl_table *t

extern unsigned int sysctl_sched_compat_yield;

+#ifdef CONFIG_SCHED_AUTOGROUP

+#ifdef CONFIG_SCHED_AUTOGROUP_DEBUG
+extern unsigned int sysctl_sched_autogroup_enabled;

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);

+#endif

+extern void sched_autogroup_create_attach(struct task_struct *p);

+extern void sched_autogroup_detach(struct task_struct *p);
+extern void sched_autogroup_fork(struct signal_struct *sig);
+extern void sched_autogroup_exit(struct signal_struct *sig);

+#else
+static inline void sched_autogroup_create_attach(struct task_struct *p) { }

+static inline void sched_autogroup_detach(struct task_struct *p) { }
+static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+static inline void sched_autogroup_exit(struct signal_struct *sig) { }

+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c

@@ -78,6 +78,7 @@

#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

@@ -605,11 +606,14 @@ static inline int cpu_of(struct rq *rq)
*/

static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+

+ return autogroup_task_group(p, tg);

}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */

@@ -2006,6 +2010,7 @@ static void sched_irq_time_avg_update(st

#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"

#include "sched_stoptask.c"

#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"

@@ -7979,7 +7984,7 @@ void __init sched_init(void)

#ifdef CONFIG_CGROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
-
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */

#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP

@@ -8509,15 +8514,11 @@ void sched_destroy_group(struct task_gro

/* change task's runqueue when it moves between groups.
* The caller of this function should have put the task in its new group
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
+ * reflect its new group. Called with the runqueue lock held.
*/
-void sched_move_task(struct task_struct *tsk)
+void __sched_move_task(struct task_struct *tsk, struct rq *rq)
{
int on_rq, running;
- unsigned long flags;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &flags);

running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;

@@ -8538,7 +8539,15 @@ void sched_move_task(struct task_struct

tsk->sched_class->set_curr_task(rq);
if (on_rq)
enqueue_task(rq, tsk, 0);
+}

+void sched_move_task(struct task_struct *tsk)
+{
+ struct rq *rq;
+ unsigned long flags;
+
+ rq = task_rq_lock(tsk, &flags);
+ __sched_move_task(tsk, rq);
task_rq_unlock(rq, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */

Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c
+++ linux-2.6/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(st

static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (atomic_dec_and_test(&sig->sigcnt)) {
+ sched_autogroup_exit(sig);
free_signal_struct(sig);
+ }
}

void __put_task_struct(struct task_struct *tsk)
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clo
posix_cpu_timers_init_group(sig);

tty_audit_fork(sig);
+ sched_autogroup_fork(sig);

sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
Index: linux-2.6/drivers/tty/tty_io.c
===================================================================
--- linux-2.6.orig/drivers/tty/tty_io.c
+++ linux-2.6/drivers/tty/tty_io.c
@@ -3160,6 +3160,7 @@ static void __proc_set_tty(struct task_s

put_pid(tsk->signal->tty_old_pgrp);
tsk->signal->tty = tty_kref_get(tty);
tsk->signal->tty_old_pgrp = NULL;
+ sched_autogroup_create_attach(tsk);
}

static void proc_set_tty(struct task_struct *tsk, struct tty_struct *tty)

Index: linux-2.6/kernel/sched_autogroup.h
===================================================================
--- /dev/null
+++ linux-2.6/kernel/sched_autogroup.h
@@ -0,0 +1,18 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+

+static void __sched_move_task(struct task_struct *tsk, struct rq *rq);
+

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+

+static inline void autogroup_init(struct task_struct *init_task) { }
+

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ return tg;
+}
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
Index: linux-2.6/kernel/sched_autogroup.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/sched_autogroup.c
@@ -0,0 +1,170 @@

+#ifdef CONFIG_SCHED_AUTOGROUP
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+
+struct autogroup {
+ struct kref kref;
+ struct task_group *tg;
+};
+
+static struct autogroup autogroup_default;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+ autogroup_default.tg = &init_task_group;
+ kref_init(&autogroup_default.kref);

+ init_task->signal->autogroup = &autogroup_default;

+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+ struct autogroup *ag = container_of(kref, struct autogroup, kref);

+ struct task_group *tg = ag->tg;
+
+ kfree(ag);
+ sched_destroy_group(tg);

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ enabled &= (tg == &root_task_group);

+ enabled &= (p->sched_class == &fair_sched_class);
+ enabled &= (!(p->flags & PF_EXITING));
+
+ if (enabled)

+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)

+{
+ struct autogroup *prev;

+ struct task_struct *t;

+ struct rq *rq;
+ unsigned long flags;
+
+ rq = task_rq_lock(p, &flags);

+ prev = p->signal->autogroup;

+ if (prev == ag) {
+ task_rq_unlock(rq, &flags);
+ return;
+ }
+

+ p->signal->autogroup = autogroup_kref_get(ag);

+ __sched_move_task(p, rq);
+ task_rq_unlock(rq, &flags);
+

+ rcu_read_lock();

+ list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+ sched_move_task(t);
+ }

+ rcu_read_unlock();

+
+ autogroup_kref_put(prev);
+}
+

+void sched_autogroup_create_attach(struct task_struct *p)
+{

+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+ /* drop extra refrence added by autogroup_create() */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+ sig->autogroup = autogroup_kref_get(current->signal->autogroup);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+ autogroup_kref_put(sig->autogroup);
+}
+
+#ifdef CONFIG_SCHED_AUTOGROUP_DEBUG

+int sched_autogroup_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct task_struct *p, *t;
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;

+
+ /*

+ * Exclude cgroup, task group and task create/destroy
+ * during global classification.
+ */
+ cgroup_lock();
+ spin_lock(&task_group_lock);
+ read_lock(&tasklist_lock);
+
+ do_each_thread(p, t) {
+ sched_move_task(t);
+ } while_each_thread(p, t);
+

+ spin_unlock(&task_group_lock);
+ cgroup_unlock();
+
+ return 0;
+}

+#endif

+
+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+#endif

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -382,6 +382,17 @@ static struct ctl_table kern_table[] = {

.mode = 0644,
.proc_handler = proc_dointvec,
},

+#ifdef CONFIG_SCHED_AUTOGROUP_DEBUG

+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_autogroup_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",

Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig
@@ -728,6 +728,26 @@ config NET_NS

endif # NAMESPACES

+config SCHED_AUTOGROUP
+ bool "Automatic process group scheduling"
+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED
+ help
+ This option optimizes the scheduler for common desktop workloads by
+ automatically creating and populating task groups. This separation
+ of workloads isolates aggressive CPU burners (like build jobs) from
+ desktop applications. Task group autogeneration is currently based
+ upon task tty association.
+

+config SCHED_AUTOGROUP_DEBUG
+ bool "Enable Autogroup debugging"
+ depends on SCHED_AUTOGROUP
+ default n
+ help
+ This option allows the user to enable/disable autogroup on the fly
+ via echo [10] > /proc/sys/kernel/sched_autogroup_enabled.
+
config MM_OWNER
bool

Index: linux-2.6/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.orig/Documentation/kernel-parameters.txt
+++ linux-2.6/Documentation/kernel-parameters.txt
@@ -1622,6 +1622,8 @@ and is between 256 and 4096 characters.

noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.

+ noautogroup Disable scheduler automatic task group creation.
+
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.

Markus Trippelsdorf

unread,

Nov 14, 2010, 12:50:02 PM11/14/10

to

Unfortunately it won't. There's a missing "+" at line 507:

markus@arch linux % patch -p1 --dry-run < /home/markus/tty_autogroup2.patch
patching file include/linux/sched.h
Hunk #3 succeeded at 1935 (offset -1 lines).
patching file kernel/sched.c
Hunk #2 succeeded at 607 (offset 1 line).
Hunk #3 succeeded at 2011 with fuzz 2 (offset 1 line).
Hunk #4 succeeded at 7959 (offset -25 lines).
Hunk #5 succeeded at 8489 (offset -25 lines).
Hunk #6 succeeded at 8514 (offset -25 lines).
patching file kernel/fork.c
patching file drivers/tty/tty_io.c
patching file kernel/sched_autogroup.h
patching file kernel/sched_autogroup.c
patch: **** malformed patch at line 507: Index: linux-2.6/kernel/sysctl.c
--
Markus

Mike Galbraith

unread,

Nov 14, 2010, 1:20:02 PM11/14/10

to

Oh well, yesterday ping (one of sister's laptop loving cats) opened
_187_ screenshots while I was away from the keyboard. I'll blame any
spelling errors on him too while I'm at it :)

Here's a fresh copy, no cats is sight.

A recurring complaint from CFS users is that parallel kbuild has a negative
impact on desktop interactivity. This patch implements an idea from Linus,
to automatically create task groups. This patch only implements Linus' per
tty task group suggestion, and only for fair class tasks, but leaves the way
open for enhancement.

Implementation: each task's signal struct contains an inherited pointer to a
refcounted autogroup struct containing a task group pointer, the default for
all tasks pointing to the init_task_group. When a task calls __proc_set_tty(),
the process wide reference to the default group is dropped, a new task group is
created, and the process is moved into the new task group. Children thereafter
inherit this task group, and increase it's refcount. On exit, a reference to the
current task group is dropped when the last reference to each signal struct is
dropped. The task group is destroyed when the last signal struct referencing
it is freed. At runqueue selection time, IFF a task has no cgroup assignment,
it's current autogroup is used.

The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP is
selected, but can be disabled via the boot option noautogroup, and can be

also be turned on/off on the fly via..

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

+ read_unlock(&tasklist_lock);

Linus Torvalds

unread,

Nov 14, 2010, 2:30:02 PM11/14/10

to

On Sun, Nov 14, 2010 at 10:10 AM, Mike Galbraith <efa...@gmx.de> wrote:
>
> Oh well, yesterday ping (one of sister's laptop loving cats) opened
> _187_ screenshots while I was away from the keyboard. �I'll blame any
> spelling errors on him too while I'm at it :)
>
> Here's a fresh copy, no cats is sight.

This catless version looks very good to me. Assuming testing and
approval by the scheduler people, this has an enthusiastic "ack" from
me.

Linus

Markus Trippelsdorf

unread,

Nov 14, 2010, 3:30:02 PM11/14/10

to

On 2010.11.14 at 12:20 -0800, Linus Torvalds wrote:

> On Sun, Nov 14, 2010 at 11:28 AM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > This catless version looks very good to me
>

> Oh, btw, maybe one comment: is CONFIG_SCHED_AUTOGROUP_DEBUG even worth
> having as a config option? The only thing it enables is the small
> /proc interface, and I don't see any downside to just always having
> that if you have AUTOGROUP enabled in the first place.
>
> It isn't like it's some traditional kernel debug feature that makes
> things slower or adds tons of debug output.

It also enables the sched_autogroup_handler, that you disliked seeing in
the previous version.
--
Markus

Linus Torvalds

unread,

Nov 14, 2010, 3:50:02 PM11/14/10

to

On Sun, Nov 14, 2010 at 12:27 PM, Markus Trippelsdorf
<mar...@trippelsdorf.de> wrote:
> On 2010.11.14 at 12:20 -0800, Linus Torvalds wrote:
>>
>> It isn't like it's some traditional kernel debug feature that makes
>> things slower or adds tons of debug output.
>
> It also enables the sched_autogroup_handler, that you disliked seeing in
> the previous version.

Yes, but that one exists only to make things "exact". I don't really
see why it exists. What's the point of doing the task movement, if the
group information is just going to be ignored anyway?

IOW, my dislike of the sched_autogroup_handler is not about the debug
option per se, it's about the same thing I said about tty hangups etc:
why do we care so deeply?

I think it would be perfectly acceptably to make the "enable" bit just
decide whether or not to take that autogroup information into account
for scheduling decision. Why do we care so deeply about having to move
the groups around right _then_?

Maybe there is some really deep reason for actually having to do the
reclassification and the sched_move_task() thing for each thread
synchronously when disabling/enabling that thing. If so, I'm just not
seeing it. Why can't we just let things be, and next time things get
scheduled they'll move on their own?

So my objection was really about the same "why do we have to try so
hard to keep the autogroups so 1:1 with the tty's"? It's just a
heuristic, trying to be exact about it seems to miss the point.

Linus

Mike Galbraith

unread,

Nov 14, 2010, 6:50:02 PM11/14/10

to

On Sun, 2010-11-14 at 12:48 -0800, Linus Torvalds wrote:
> On Sun, Nov 14, 2010 at 12:27 PM, Markus Trippelsdorf
> <mar...@trippelsdorf.de> wrote:
> > On 2010.11.14 at 12:20 -0800, Linus Torvalds wrote:
> >>
> >> It isn't like it's some traditional kernel debug feature that makes
> >> things slower or adds tons of debug output.
> >
> > It also enables the sched_autogroup_handler, that you disliked seeing in
> > the previous version.
>
> Yes, but that one exists only to make things "exact". I don't really
> see why it exists. What's the point of doing the task movement, if the
> group information is just going to be ignored anyway?

Not only. pinned tasks would stay in their autogroup until somebody
moved them to a cgroup. Them wandering back over time would be fine,
and all but pinned tasks will.

-Mike

Mike Galbraith

unread,

Nov 14, 2010, 7:10:02 PM11/14/10

to

On Sun, 2010-11-14 at 12:20 -0800, Linus Torvalds wrote:
> On Sun, Nov 14, 2010 at 11:28 AM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > This catless version looks very good to me
>
> Oh, btw, maybe one comment: is CONFIG_SCHED_AUTOGROUP_DEBUG even worth
> having as a config option? The only thing it enables is the small
> /proc interface, and I don't see any downside to just always having
> that if you have AUTOGROUP enabled in the first place.

If the on/off is available, you can silently strand pinned tasks forever
in an autogroup. So, I tied the on/off switch to the global move so the
user won't be surprised.

-Mike

Linus Torvalds

unread,

Nov 14, 2010, 7:20:01 PM11/14/10

to

On Sun, Nov 14, 2010 at 3:43 PM, Mike Galbraith <efa...@gmx.de> wrote:
>
> Not only. pinned tasks would stay in their autogroup until somebody
> moved them to a cgroup. �Them wandering back over time would be fine,
> and all but pinned tasks will.

But why is a problem?

That's kind of my point. Why do we care about some special case that
(a) likely doesn't happen and (b) even if it happens, what's the
problem with it happening?

Let me put it this way: the autogroup thing modifies the scheduler in
some subtle ways, but it doesn't make any _semantic_ difference. And
it's not like enabling/disabling autogroup scheduling is something
critical to begin with. It's a heuristic.

THAT is why I think it's so silly to try to be so strict and walk over
all processes while holding a couple of spinlocks.

Ok, so you disable autogroups after having used them, _and_ having
done some really specific things, and some processes that used to be
in a group may end up still scheduling in that group. Why care? It's
not like random people can enable/disable autogroup scheduling and try
to use this as a way to get unfair scheduling.

In fact, I'd go as far as saying that for the tty-based autogroup
scheduling, if you want the autogroup policy to take effect, it would
be perfectly ok if it really only affected the actual group
_allocation_. So when you turn it on, old tty associations that
existed before autogroup being turned on would not actually be in
their own groups at all. And when you turn it off, existing tty groups
would continue to exist, and you'd actually have to open a new window
to get processes to no longer be part of any autogroup behavior.

See what I'm saying? I still think you're bending over backwards to be
"careful" in ways that don't seem to make much sense to me.

Now, if there is some really fundamental reason why processes _have_
to be disassociated with the group when the autogroup policy changes,
that would be a different issue. If the scheduler oopses when it hits
a process that was in a tty autogroup but autogrouping has been turned
off, _that_ would be a reason to say "recalculate everything when you
disable/enable policy groups". But I don't think that's the case.

Linus

Linus Torvalds

unread,

Nov 14, 2010, 7:30:02 PM11/14/10

to

On Sun, Nov 14, 2010 at 4:15 PM, Linus Torvalds
<torv...@linux-foundation.org> wrote:
>
> THAT is why I think it's so silly to try to be so strict and walk over
> all processes while holding a couple of spinlocks.

Btw, let me say that I think the patch is great even with that thing
in. It looks clean, the thing I'm complaining about is not a big deal,
and it seems to perform very much as advertized. The difference with
autogroup scheduling is very noticeable with a simple "make -j64"
kernel compile.

So I really don't think it's a big deal. The sysctl handler isn't even
complicated. But boy does it hurt my eyes to see a spinlock held
around a "do_each_thread()". And I do get the feeling that the
simplest way to fix it would be to just remove the code entirely, and
just say that "enabling/disabling may be delayed for old processes
with existing autogroups".

Mike Galbraith

unread,

Nov 14, 2010, 8:20:02 PM11/14/10

to

On Sun, 2010-11-14 at 16:26 -0800, Linus Torvalds wrote:
> On Sun, Nov 14, 2010 at 4:15 PM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > THAT is why I think it's so silly to try to be so strict and walk over
> > all processes while holding a couple of spinlocks.
>
> Btw, let me say that I think the patch is great even with that thing
> in. It looks clean, the thing I'm complaining about is not a big deal,
> and it seems to perform very much as advertized. The difference with
> autogroup scheduling is very noticeable with a simple "make -j64"
> kernel compile.
>
> So I really don't think it's a big deal. The sysctl handler isn't even
> complicated. But boy does it hurt my eyes to see a spinlock held
> around a "do_each_thread()". And I do get the feeling that the
> simplest way to fix it would be to just remove the code entirely, and
> just say that "enabling/disabling may be delayed for old processes
> with existing autogroups".

Which is what I just did. If the oddball case isn't a big deal, the
patch shrinks, which is a good thing. I just wanted to cover all bases.

Patchlet with handler whacked:

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -509,6 +509,8 @@ struct thread_group_cputimer {
spinlock_t lock;
};

+struct autogroup;
+
/*
* NOTE! "signal_struct" does not have it's own
* locking, because a shared signal_struct always
@@ -576,6 +578,9 @@ struct signal_struct {

struct tty_struct *tty; /* NULL if no tty */

+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.

@@ -1931,6 +1936,20 @@ int sched_rt_handler(struct ctl_table *t

extern unsigned int sysctl_sched_compat_yield;

+#ifdef CONFIG_SCHED_AUTOGROUP

+extern unsigned int sysctl_sched_autogroup_enabled;
+

@@ -0,0 +1,140 @@

+static int __init setup_autogroup(char *str)
+{
+ sysctl_sched_autogroup_enabled = 0;
+
+ return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+#endif
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -382,6 +382,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_SCHED_AUTOGROUP

+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,

+ .proc_handler = proc_dointvec,

+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
Index: linux-2.6/init/Kconfig
===================================================================
--- linux-2.6.orig/init/Kconfig
+++ linux-2.6/init/Kconfig

@@ -728,6 +728,18 @@ config NET_NS

endif # NAMESPACES

+config SCHED_AUTOGROUP
+ bool "Automatic process group scheduling"
+ select CGROUPS
+ select CGROUP_SCHED
+ select FAIR_GROUP_SCHED
+ help
+ This option optimizes the scheduler for common desktop workloads by
+ automatically creating and populating task groups. This separation
+ of workloads isolates aggressive CPU burners (like build jobs) from
+ desktop applications. Task group autogeneration is currently based
+ upon task tty association.
+

config MM_OWNER
bool

Index: linux-2.6/Documentation/kernel-parameters.txt
===================================================================
--- linux-2.6.orig/Documentation/kernel-parameters.txt
+++ linux-2.6/Documentation/kernel-parameters.txt
@@ -1622,6 +1622,8 @@ and is between 256 and 4096 characters.
noapic [SMP,APIC] Tells the kernel to not make use of any
IOAPICs that may be present in the system.

+ noautogroup Disable scheduler automatic task group creation.
+
nobats [PPC] Do not use BATs for mapping kernel lowmem
on "Classic" PPC cores.

Linus Torvalds

unread,

Nov 14, 2010, 10:20:03 PM11/14/10

to

On Sun, Nov 14, 2010 at 5:13 PM, Mike Galbraith <efa...@gmx.de> wrote:
>
> Which is what I just did. If the oddball case isn't a big deal, the
> patch shrinks, which is a good thing. I just wanted to cover all bases.

Yeah. And I have to say that I'm (very happily) surprised by just how
small that patch really ends up being, and how it's not intrusive or
ugly either.

I'm also very happy with just what it does to interactive performance.
Admittedly, my "testcase" is really trivial (reading email in a
web-browser, scrolling around a bit, while doing a "make -j64" on the
kernel at the same time), but it's a test-case that is very relevant
for me. And it is a _huge_ improvement.

It's an improvement for things like smooth scrolling around, but what
I found more interesting was how it seems to really make web pages
load a lot faster. Maybe it shouldn't have been surprising, but I
always associated that with network performance. But there's clearly
enough of a CPU load when loading a new web page that if you have a
load average of 50+ at the same time, you _will_ be starved for CPU in
the loading process, and probably won't get all the http requests out
quickly enough.

So I think this is firmly one of those "real improvement" patches.
Good job. Group scheduling goes from "useful for some specific server
loads" to "that's a killer feature".

Linus

Peter Zijlstra

unread,

Nov 15, 2010, 4:00:02 AM11/15/10

to

On Sun, 2010-11-14 at 18:13 -0700, Mike Galbraith wrote:
> +static inline struct task_group *
> +autogroup_task_group(struct task_struct *p, struct task_group *tg)
> +{
> + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
> +
> + enabled &= (tg == &root_task_group);
> + enabled &= (p->sched_class == &fair_sched_class);
> + enabled &= (!(p->flags & PF_EXITING));
> +
> + if (enabled)
> + return p->signal->autogroup->tg;
> +
> + return tg;
> +}

That made me go wtf.. curious way of writing things.

I guess the usual way of writing that (which is admittedly a little more
verbose) would be something like:

static bool
task_wants_autogroup(struct task_struct *tsk, struct task_group *tg)
{
if (tg != &root_task_group)
return false;

if (tsk->sched_class != &fair_sched_class)
return false;

if (tsk->flags & PF_EXITING)
return false;

return true;
}

static inline struct task_group *
autogroup_task_group(struct task_struct *tsk, struct task_group *tg)
{
if (task_wants_autogroup(tsk, tg))
return tsk->signal->autogroup->tg;

return tg;

Mike Galbraith

unread,

Nov 15, 2010, 6:40:02 AM11/15/10

to

That is a bit easier on the eye, modulo my hysterical attachment to "p".

Ok, so it's hopefully now ready to either take wing or go splat.

Baked:

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+

+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{

+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ enabled &= (tg == &root_task_group);
+ enabled &= (p->sched_class == &fair_sched_class);
+ enabled &= (!(p->flags & PF_EXITING));
+
+ if (enabled)
+ return p->signal->autogroup->tg;
+
+ return tg;
+}

Mike Galbraith

unread,

Nov 15, 2010, 6:50:02 AM11/15/10

to

The cat forgot to quilt refresh :)

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

kernel/fork.c | 5 -
kernel/sched.c | 25 ++++-
kernel/sched_autogroup.c | 151 ++++++++++++++++++++++++++++++++++++

kernel/sched_autogroup.h | 18 ++++
kernel/sysctl.c | 11 ++

9 files changed, 235 insertions(+), 9 deletions(-)

@@ -0,0 +1,151 @@

+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (p->flags & PF_EXITING)
+ return false;
+
+ return true;

+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+

+ if (enabled && task_wants_autogroup(p, tg))

Oleg Nesterov

unread,

Nov 15, 2010, 8:10:02 AM11/15/10

to

I continue to play the advocatus diaboli ;)

On 11/15, Mike Galbraith wrote:
>
> +static inline bool
> +task_wants_autogroup(struct task_struct *p, struct task_group *tg)
> +{
> + if (tg != &root_task_group)
> + return false;
> +
> + if (p->sched_class != &fair_sched_class)
> + return false;
> +
> + if (p->flags & PF_EXITING)
> + return false;

Hmm, why? Perhaps PF_EXITING was needed in the previous version to
avoid the race with release_task(). But now it is always safe to
use signal->autogroup.

And the exiting task can do a lot before it disappears, probably
we shouldn't ignore ->autogroup.

> +static void
> +autogroup_move_group(struct task_struct *p, struct autogroup *ag)
> +{
> + struct autogroup *prev;
> + struct task_struct *t;
> + struct rq *rq;
> + unsigned long flags;
> +
> + rq = task_rq_lock(p, &flags);
> + prev = p->signal->autogroup;
> + if (prev == ag) {
> + task_rq_unlock(rq, &flags);
> + return;
> + }
> +
> + p->signal->autogroup = autogroup_kref_get(ag);
> + __sched_move_task(p, rq);
> + task_rq_unlock(rq, &flags);
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> + sched_move_task(t);
> + }
> + rcu_read_unlock();

Not sure I understand why do we need rq->lock...

It can't protect the change of signal->autogroup, multiple callers
can use different rq's.

However. Currently the only callers holds ->siglock, so we are safe.
Perhaps we should just document that autogroup_move_group() needs
->siglock.

This also mean the patch can be simplified even more, __sched_move_task()
is not needed.

> +void sched_autogroup_fork(struct signal_struct *sig)
> +{
> + sig->autogroup = autogroup_kref_get(current->signal->autogroup);
> +}

Well, in theory this can race with another thread doing autogroup_move_group().
We can read the old ->autogroup, and then use it after it was already freed.

Probably this needs ->siglock too.

Oleg.

Mike Galbraith

unread,

Nov 15, 2010, 9:10:02 AM11/15/10

to

On Sun, 2010-11-14 at 19:12 -0800, Linus Torvalds wrote:

> I'm also very happy with just what it does to interactive performance.
> Admittedly, my "testcase" is really trivial (reading email in a
> web-browser, scrolling around a bit, while doing a "make -j64" on the
> kernel at the same time), but it's a test-case that is very relevant
> for me. And it is a _huge_ improvement.

Next logical auto-step would be to try to subvert cfq. At a glance,
io_context looks highly hackable for !CONFIG_CFQ_GROUP_IOSCHED case.
The other case may get interesting for someone not very familiar with
cfq innards, but I see a tempting looking subversion point.

-Mike

Mike Galbraith

unread,

Nov 15, 2010, 4:30:01 PM11/15/10

to

On Mon, 2010-11-15 at 13:57 +0100, Oleg Nesterov wrote:
> I continue to play the advocatus diaboli ;)

Much appreciated.

> On 11/15, Mike Galbraith wrote:
> >
> > +static inline bool
> > +task_wants_autogroup(struct task_struct *p, struct task_group *tg)
> > +{
> > + if (tg != &root_task_group)
> > + return false;
> > +
> > + if (p->sched_class != &fair_sched_class)
> > + return false;
> > +
> > + if (p->flags & PF_EXITING)
> > + return false;
>
> Hmm, why? Perhaps PF_EXITING was needed in the previous version to
> avoid the race with release_task(). But now it is always safe to
> use signal->autogroup.

That came into existence when I stress tested previous version in
PREEMPT_RT (boom). I see no good reason to bother an exiting task
though, so would prefer to leave it as is.

> And the exiting task can do a lot before it disappears, probably
> we shouldn't ignore ->autogroup.

I doubt it would add value.

> Not sure I understand why do we need rq->lock...

Indeed, should have been whacked. Now gone.

> It can't protect the change of signal->autogroup, multiple callers
> can use different rq's.

Guaranteed live ->autogroup should be good enough for heuristic use, and
had better be so. Having to take ->siglock in the fast path would kill
using ->signal.

> However. Currently the only callers holds ->siglock, so we are safe.
> Perhaps we should just document that autogroup_move_group() needs
> ->siglock.

Done.

> This also mean the patch can be simplified even more, __sched_move_task()
> is not needed.

(shrinkage:)

> > +void sched_autogroup_fork(struct signal_struct *sig)
> > +{
> > + sig->autogroup = autogroup_kref_get(current->signal->autogroup);
> > +}
>
> Well, in theory this can race with another thread doing autogroup_move_group().
> We can read the old ->autogroup, and then use it after it was already freed.
>
> Probably this needs ->siglock too.

Another landmine. Done.

Some numbers.

Signed-off-by: Mike Galbraith <efa...@gmx.de>

kernel/sched.c | 9 +-
kernel/sched_autogroup.c | 150 ++++++++++++++++++++++++++++++++++++
kernel/sched_autogroup.h | 16 +++
kernel/sysctl.c | 11 ++
9 files changed, 222 insertions(+), 3 deletions(-)

@@ -0,0 +1,16 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+

+static inline struct task_group *

+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+

+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) { }
+
+static inline struct task_group *

+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{

+ return tg;
+}
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
Index: linux-2.6/kernel/sched_autogroup.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/sched_autogroup.c

@@ -0,0 +1,150 @@

+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+ if (tg != &root_task_group)
+ return false;
+
+ if (p->sched_class != &fair_sched_class)
+ return false;
+
+ if (p->flags & PF_EXITING)
+ return false;

+
+ return true;
+}
+
+static inline struct task_group *

+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{

+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+

+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+ struct autogroup *prev;
+ struct task_struct *t;
+

+ prev = p->signal->autogroup;
+ if (prev == ag)

+ return;

+
+ p->signal->autogroup = autogroup_kref_get(ag);

+ sched_move_task(p);

+
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
+ sched_move_task(t);
+ }
+ rcu_read_unlock();

+
+ autogroup_kref_put(prev);
+}
+

+/* Must be called with siglock held */

+void sched_autogroup_create_attach(struct task_struct *p)
+{
+ struct autogroup *ag = autogroup_create();
+
+ autogroup_move_group(p, ag);
+ /* drop extra refrence added by autogroup_create() */
+ autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+

+/* Must be called with siglock held. Currently has no users */

+void sched_autogroup_detach(struct task_struct *p)
+{
+ autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+

+void sched_autogroup_fork(struct signal_struct *sig)
+{

+ struct sighand_struct *sighand = current->sighand;
+
+ spin_lock(&sighand->siglock);

+ sig->autogroup = autogroup_kref_get(current->signal->autogroup);

+ spin_unlock(&sighand->siglock);

Peter Zijlstra

unread,

Nov 15, 2010, 5:50:01 PM11/15/10

to

On Mon, 2010-11-15 at 14:25 -0700, Mike Galbraith wrote:
> > However. Currently the only callers holds ->siglock, so we are safe.
> > Perhaps we should just document that autogroup_move_group() needs
> > ->siglock.
>
> Done.

lockdep_assert_held() is a good way to document these things.

Valdis.K...@vt.edu

unread,

Nov 15, 2010, 5:50:02 PM11/15/10

to

On Thu, 11 Nov 2010 08:26:40 MST, Mike Galbraith said:

> Implementation: each task struct contains an inherited pointer to a refcounted

> autogroup struct containing a task group pointer, the default for all tasks
> pointing to the init_task_group. When a task calls __proc_set_tty(), the

> task's reference to the default group is dropped, a new task group is created,
> and the task is moved out of the old group and into the new. Children thereafter
> inherit this task group, and increase it's refcount. Calls to __tty_hangup()
> and proc_clear_tty() move the caller back to the init_task_group, and possibly
> destroy the task group. On exit, reference to the current task group is dropped,
> and the task group is potentially destroyed. At runqueue selection time, iff

> a task has no cgroup assignment, it's current autogroup is used.

So the set of all tasks that never call proc_set_tty() ends up in the same one
big default group, correct? Do we have any provisions for making sure that if
a user has 8 or 10 windows open doing heavy work, the default group (with a lot
of important daemons/etc in it) doesn't get starved with only a 1/10th share of
the CPU? Or am I missing something here?

> +extern void sched_autogroup_detatch(struct task_struct *p);

sched_autogroup_detach() instead?

Linus Torvalds

unread,

Nov 15, 2010, 6:30:02 PM11/15/10

to

On Mon, Nov 15, 2010 at 2:41 PM, <Valdis.K...@vt.edu> wrote:
>
> So the set of all tasks that never call proc_set_tty() ends up in the same one
> big default group, correct?

Well, yes and no.

Yes, that's what the code currently does. But I did ask Mike (and he
delivered) to try to make the code look and work in a way where the
whole "tty thing" is just one of the heuristics.

It's not clear exactly what the non-tty heuristics would be, but I do
have a few suggestions:

- I think it might be a good idea to associate a task group with the
current "cred" of a process, and fall back on it in the absense of a
tty-provided one.

Now, for desktop use, that probably doesn't often matter, but even
for just the desktop it would mean that "system daemons" would at
least get a group of their own, rather than be grouped with
"everything else"

(As is, I think the autogroup thing already ends up protecting
system daemons more than _not_ having the autogroup, since it will
basically mean that a "make -j64" won't be stealing all the CPU time
from everybody else - even if the system daemons all end up in that
single "default" group together with the non-tty graphical apps.)

- I suspect we could make kernel daemons be a group of their own.

>�Do we have any provisions for making sure that if

> a user has 8 or 10 windows open doing heavy work, the default group (with a lot
> of important daemons/etc in it) doesn't get starved with only a 1/10th share of
> the CPU? Or am I missing something here?

I think you're missing the fact that without the autogroups, it's
_worse_. If you do "make -j64" without autogroups, those important
daemons get starved by all that work. With the autogroups, they end up
being protected by being in the default group.

So the fact that they are in the default group with lots of other
users doesn't hurt, quite the reverse. User apps are likely to be in
their own groups, so they affect system apps less than they do now.

But I do agree that we might be able to make that all much more explicit.

Linus

Mike Galbraith

unread,

Nov 15, 2010, 6:50:02 PM11/15/10

to

Yes, all tasks never having had a tty association are relegated to the
root task group, and no, there is no provision for the root task group
getting more than it's fair share of CPU.

The patch is only intended to (hopefully) better suit the general case
desktop. One size has zero chance of fitting all ;-)

> > +extern void sched_autogroup_detatch(struct task_struct *p);
>
> sched_autogroup_detach() instead?

Hm, why?

-Mike

Linus Torvalds

unread,

Nov 15, 2010, 7:00:02 PM11/15/10

to

On Mon, Nov 15, 2010 at 3:46 PM, Mike Galbraith <efa...@gmx.de> wrote:
> On Mon, 2010-11-15 at 17:41 -0500, Valdis.K...@vt.edu wrote:
>
>> > +extern void sched_autogroup_detatch(struct task_struct *p);
>>
>> sched_autogroup_detach() instead?
>
> Hm, why?

You really aren't a good speller of that word, are you?

Linus

Mike Galbraith

unread,

Nov 15, 2010, 7:10:01 PM11/15/10

to

On Mon, 2010-11-15 at 15:50 -0800, Linus Torvalds wrote:
> On Mon, Nov 15, 2010 at 3:46 PM, Mike Galbraith <efa...@gmx.de> wrote:
> > On Mon, 2010-11-15 at 17:41 -0500, Valdis.K...@vt.edu wrote:
> >
> >> > +extern void sched_autogroup_detatch(struct task_struct *p);
> >>
> >> sched_autogroup_detach() instead?
> >
> > Hm, why?
>
> You really aren't a good speller of that word, are you?

<stare> d e t a c h... d e t a t....t? aw crap. Guess not.

Couldn't even see it.

-Mike

Linus Torvalds

unread,

Nov 15, 2010, 8:20:01 PM11/15/10

to

Hmm. Just found a bug. I'm not sure if it's the autogroup patches
themselves, or whether it's just the cgroup code that the autogroup
patch enables for me.

When I do

echo t > /proc/sysrq-trigger

(or "w") I get a NULL pointer dereference (offset 0x38 - decimal 56)
in "cgroup_path+0x7", with a call trace of sched_debug_show,
show_state_filter, sysrq_handle_showstate_blocked. I don't have the
whole oops, because the machine is really dead at that point
(presumably died holding the runqueue lock or some other critical
resource), but if required I could take a photo of it. However, I bet
it is repeatable, so I doubt you need it.

Anyway, that "cgroup_path+0x7" is the very first memory dereference:

movq 56(%rdi), %rsi # cgrp_5(D)->dentry, _________p1

so sched_debug_show() is apparently calling cgroup_path() with a NULL
cgroup. I think it's "print_task()" that is to blame, it does

cgroup_path(task_group(p)->css.cgroup, ..

without checking whether there _is_ any css.cgroup.

Peter, that looks like your code (commit d19ca30874f2)

Guys?

Linus

Paul Menage

unread,

Nov 15, 2010, 9:00:02 PM11/15/10

to

On Mon, Nov 15, 2010 at 5:18 PM, Linus Torvalds
<torv...@linux-foundation.org> wrote:
>
> so sched_debug_show() is apparently calling cgroup_path() with a NULL
> cgroup. I think it's "print_task()" that is to blame, it does
>
> cgroup_path(task_group(p)->css.cgroup, ..
>
> without checking whether there _is_ any css.cgroup.

Right - previously the returned task_group would be always associated
with a cgroup. Now, it may not be.

The original task_group() should be made accessible for anything that
wants a real cgroup in the scheduler hierarchy, and called from the
new task_group() function. Not sure what the best naming convention
would be, maybe task_group() and effective_task_group() ?

Paul

Vivek Goyal

unread,

Nov 15, 2010, 9:00:03 PM11/15/10

to

On Mon, Nov 15, 2010 at 02:25:50PM -0700, Mike Galbraith wrote:

[..]

>
> A recurring complaint from CFS users is that parallel kbuild has a negative
> impact on desktop interactivity. This patch implements an idea from Linus,
> to automatically create task groups. This patch only implements Linus' per
> tty task group suggestion, and only for fair class tasks, but leaves the way
> open for enhancement.
>
> Implementation: each task's signal struct contains an inherited pointer to a
> refcounted autogroup struct containing a task group pointer, the default for
> all tasks pointing to the init_task_group. When a task calls __proc_set_tty(),
> the process wide reference to the default group is dropped, a new task group is
> created, and the process is moved into the new task group. Children thereafter
> inherit this task group, and increase it's refcount. On exit, a reference to the
> current task group is dropped when the last reference to each signal struct is
> dropped. The task group is destroyed when the last signal struct referencing
> it is freed. At runqueue selection time, IFF a task has no cgroup assignment,
> it's current autogroup is used.

Mike,

IIUC, this automatically created task group is invisible to user space? I
mean generally there is a task group associated with a cgroup and user space
tools can walk through cgroup hierarchy to figure out how system is
configured. Will that be possible with this patch.

I am wondering what will happen to things like some kind of per cgroup
stats. For example block controller keeps track of number of sectors
transferred per cgroup. Hence this information will not be available for
these internal task groups?

Looks like everybody likes the idea but let me still ask the following
question.

Should this kind of thing be done in user space? I mean what we are
essentially doing providing isolation between two groups. That's why
this cgroup infrastructure is in place. Just that currently how cgroups
are created is fully depends on user space and kernel does not create
cgroups of its own by default (ecept root cgroup).

I think systemd does something similar in the sense every system service
it puts in a cgroup of its own on system startup.

libcgroup daemon has the facility to listen for kernel events (through
netlink socket), and then put newly created tasks in cgroups as per
the user spcefied rules in a config file. For example, if one wants
isolation between tasks of two user ids, one can just write a rule and
once the user logs in, its login session will be automatically placed
in right cgroup. Hence one will be able to achieve isolation between
two users. I think now it also has rules for classifying executables
based on names/paths. So one can put "firefox" in one cgroup and say
"make -j64" in a separate cgroup and provide isolation between two
applications. It is just a matter of putting right rule in the config file.

This patch sounds like an extension to user id problem where we want
isolation between the processes of same user (process groups using
different terminals). Would it make sense to generate some kind of kernel
event for this and let user space execute the rules instead of creating
this functionality in kernel.

This way once we extend this functionality to other subsystems, we can
make it more flexible in user space. For example, create these groups
for cpu controller but lets say not for block controller. Otherwise
we will end up creating more kernel tunables for achieving same effect.

Thanks
Vivek

Linus Torvalds

unread,

Nov 15, 2010, 9:20:01 PM11/15/10

to

On Mon, Nov 15, 2010 at 5:56 PM, Vivek Goyal <vgo...@redhat.com> wrote:
>
> Should this kind of thing be done in user space?

Almost certainly not.

First off, user-space is a fragmented mess. Just from a "let's get it
done" angle, it just doesn't work. There are lots of different thing
that create new tty's, and you can't have them all fixed. Plus it
would be _way_ more code in user space than it is in kernel space.

Secondly, user-space daemons are a total mess. We've tried it many
many times, and every time the _intention_ is to make things simpler
to debug and deploy. And it almost never is. The interfaces end up
being too painful, and the "part of the code is in kernel space, part
of it is in user space" means that things just break all the time.

Finally, the whole "user space is more flexible" is just a lie. It
simply doesn't end up being true. It will be _harder_ to configure
some user-space daemon than it is to just set a flag in /sys or
whatever. The "flexibility" tends to be more a flexibility to get
things wrong than any actual advantage.

Just look at the patch in question. It's simple, it's clean, and it
"just works". Doing the same thing in user space? It would be a total
nightmare, and exactly _because_ it would be a total nightmare, the
code would never be that simple or clean.

Linus

Mike Galbraith

unread,

Nov 16, 2010, 8:00:02 AM11/16/10

to

On Mon, 2010-11-15 at 17:55 -0800, Paul Menage wrote:
> On Mon, Nov 15, 2010 at 5:18 PM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > so sched_debug_show() is apparently calling cgroup_path() with a NULL
> > cgroup. I think it's "print_task()" that is to blame, it does
> >
> > cgroup_path(task_group(p)->css.cgroup, ..
> >
> > without checking whether there _is_ any css.cgroup.
>
> Right - previously the returned task_group would be always associated
> with a cgroup. Now, it may not be.
>
> The original task_group() should be made accessible for anything that
> wants a real cgroup in the scheduler hierarchy, and called from the
> new task_group() function. Not sure what the best naming convention
> would be, maybe task_group() and effective_task_group() ?

effective_task_group() works for me. Autogroup (currently at least)
only needs to interface with set_task_rq().

A tasty alternative would be to have autogroup be it's own subsystem,
with full cgroup userspace visibility/tweakability. I have no idea if
that's feasible though. I glanced at cgroup.c, but didn't see the dirt
simple I wanted, and quickly ran away.

---
kernel/sched.c | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c

@@ -606,27 +606,33 @@ static inline int cpu_of(struct rq *rq)

*/
static inline struct task_group *task_group(struct task_struct *p)
{

- struct task_group *tg;

struct cgroup_subsys_state *css;

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));

- tg = container_of(css, struct task_group, css);
+ return container_of(css, struct task_group, css);
+}

- return autogroup_task_group(p, tg);
+static inline struct task_group *effective_task_group(struct task_struct *p)
+{
+ return autogroup_task_group(p, task_group(p));

}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */

static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+#if (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
+ struct task_group *tg = effective_task_group(p);
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
- p->se.parent = task_group(p)->se[cpu];
+ p->se.cfs_rq = tg->cfs_rq[cpu];
+ p->se.parent = tg->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
- p->rt.rt_rq = task_group(p)->rt_rq[cpu];
- p->rt.parent = task_group(p)->rt_se[cpu];
+ p->rt.rt_rq = tg->rt_rq[cpu];
+ p->rt.parent = tg->rt_se[cpu];
#endif

Oleg Nesterov

unread,

Nov 16, 2010, 8:20:03 AM11/16/10

to

On 11/15, Mike Galbraith wrote:
>
> On Mon, 2010-11-15 at 13:57 +0100, Oleg Nesterov wrote:
>
> > And the exiting task can do a lot before it disappears, probably
> > we shouldn't ignore ->autogroup.

I don't really understand what makes the exiting task different,
but OK.

However, I must admit I dislike this check. Because, looking at this
code, it is not clear why do we check PF_EXITING. It looks as if it
is needed for correctness.

OK, this is minor. I think the patch is correct, just one nit below.

> > It can't protect the change of signal->autogroup, multiple callers
> > can use different rq's.
>
> Guaranteed live ->autogroup should be good enough for heuristic use, and
> had better be so. Having to take ->siglock in the fast path would kill
> using ->signal.

Yes, sure, rq->lock should ensure signal->autogroup can't go away.
(even if it can be changed under us). And it does, we are moving all
threads before kref_put().

> +static void
> +autogroup_move_group(struct task_struct *p, struct autogroup *ag)
> +{
> + struct autogroup *prev;
> + struct task_struct *t;
> +
> + prev = p->signal->autogroup;
> + if (prev == ag)
> + return;
> +
> + p->signal->autogroup = autogroup_kref_get(ag);
> + sched_move_task(p);
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> + sched_move_task(t);
> + }
> + rcu_read_unlock();
> +
> + autogroup_kref_put(prev);
> +}

Well, this looks a bit strange (but correct).

We are changing ->autogroup assuming the caller holds ->siglock.
But if we hold ->siglock we do not need rcu_read_lock() to iterate
over the thread_group, we can just do

p->signal->autogroup = autogroup_kref_get(ag);

t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);

Again, this is minor, I won't insist.

Oleg.

Mike Galbraith

unread,

Nov 16, 2010, 9:10:02 AM11/16/10

to

On Mon, 2010-11-15 at 20:56 -0500, Vivek Goyal wrote:

> Mike,
>
> IIUC, this automatically created task group is invisible to user space? I
> mean generally there is a task group associated with a cgroup and user space
> tools can walk through cgroup hierarchy to figure out how system is
> configured. Will that be possible with this patch.

No, it's dirt simple automation only at this point.

> I am wondering what will happen to things like some kind of per cgroup
> stats. For example block controller keeps track of number of sectors
> transferred per cgroup. Hence this information will not be available for
> these internal task groups?

No, it won't. The target audience is those folks who don't _do_ the
configuration they _could_ do, folks who don't use SCHED_IDLE or nice,
or the power available through userspace cgroup tools.. folks who expect
their box to "just work", out of the box.

> Looks like everybody likes the idea but let me still ask the following
> question.
>
> Should this kind of thing be done in user space? I mean what we are
> essentially doing providing isolation between two groups. That's why
> this cgroup infrastructure is in place. Just that currently how cgroups
> are created is fully depends on user space and kernel does not create
> cgroups of its own by default (ecept root cgroup).

I was of the same mind when Linus first broached the subject, but Ingo
convinced me it was worth exploring because of the simple fact that
people are not using the available tools.

Sadly, this includes distros.

AFAIK, no distro has cgroups configured and ready for aunt tilly, no
distro has taught the GUI to use cgroups _at all_, even though it's
trivial to launch self-reaping task groups from userspace.

> I think systemd does something similar in the sense every system service
> it puts in a cgroup of its own on system startup.
>
> libcgroup daemon has the facility to listen for kernel events (through
> netlink socket), and then put newly created tasks in cgroups as per
> the user spcefied rules in a config file. For example, if one wants
> isolation between tasks of two user ids, one can just write a rule and
> once the user logs in, its login session will be automatically placed
> in right cgroup. Hence one will be able to achieve isolation between
> two users. I think now it also has rules for classifying executables
> based on names/paths. So one can put "firefox" in one cgroup and say
> "make -j64" in a separate cgroup and provide isolation between two
> applications. It is just a matter of putting right rule in the config file.

I fiddled with configuring my system, but found options lacking. For
instance, I found no way to automate per tty (or pgid in my case). I
had to cobble scripts together to get the job done. Nothing that my
distro delivered could just make it just happen for me.

When the tool mature, and distros use them, in kernel automation may
well become more or less moot, but in the here and now, there is a
target audience with a need that is not being serviced.

> This patch sounds like an extension to user id problem where we want
> isolation between the processes of same user (process groups using
> different terminals). Would it make sense to generate some kind of kernel
> event for this and let user space execute the rules instead of creating
> this functionality in kernel.

Per user isn't very useful. The typical workstation has one user
whacking away on the kbd/mouse. While you can identify firefox etc,
it's not being done, and requires identifying every application. Heck,
cgroups is built in, but userspace doesn't even mount. Nothing but
nothing uses cgroups.

> This way once we extend this functionality to other subsystems, we can
> make it more flexible in user space. For example, create these groups
> for cpu controller but lets say not for block controller. Otherwise
> we will end up creating more kernel tunables for achieving same effect.

I see your arguments, and agree to a large extent. As Linus noted,
there are other advantages to in kernel automation, but for me, it all
boils down to the fact that userspace is doing nothing with the tools.

ATM, cgroups is an enterprise or power user tool. The out of the box
distro kernel user sees zero benefit for the overhead investment.

-Mike

Peter Zijlstra

unread,

Nov 16, 2010, 9:10:02 AM11/16/10

to

On Mon, 2010-11-15 at 14:25 -0700, Mike Galbraith wrote:

> > > + if (p->flags & PF_EXITING)
> > > + return false;
> >
> > Hmm, why? Perhaps PF_EXITING was needed in the previous version to
> > avoid the race with release_task(). But now it is always safe to
> > use signal->autogroup.
>
> That came into existence when I stress tested previous version in
> PREEMPT_RT (boom). I see no good reason to bother an exiting task
> though, so would prefer to leave it as is.

PREEMPT_RT has a slightly different exit path IIRC. If that was the only
thing you saw it explode on we could leave the check out for now and
revisit it in the -rt patches when and if it pops up. Hmm?

Peter Zijlstra

unread,

Nov 16, 2010, 9:10:02 AM11/16/10

to

On Mon, 2010-11-15 at 17:55 -0800, Paul Menage wrote:

> On Mon, Nov 15, 2010 at 5:18 PM, Linus Torvalds
> <torv...@linux-foundation.org> wrote:
> >
> > so sched_debug_show() is apparently calling cgroup_path() with a NULL
> > cgroup. I think it's "print_task()" that is to blame, it does
> >
> > cgroup_path(task_group(p)->css.cgroup, ..
> >
> > without checking whether there _is_ any css.cgroup.
>
> Right - previously the returned task_group would be always associated
> with a cgroup. Now, it may not be.
>
> The original task_group() should be made accessible for anything that
> wants a real cgroup in the scheduler hierarchy, and called from the
> new task_group() function. Not sure what the best naming convention
> would be, maybe task_group() and effective_task_group() ?

Right, that doesn't solve the full problem though.

/proc/sched_debug should show these automagic task_groups, its just that
there's currently no way to properly name them, we can of course add
something like a name field to the struct autogroup thing, but what do
we fill it with? "autogroup-%d" and keep a sequence number for each
autogroup?

Then the below task_group_path() thing can try the autogroup name scheme
if it finds a NULL css.

Something like the below might avoid the explosion:

---
kernel/sched_debug.c | 28 ++++++++++++++--------------
1 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d1..9b5560f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -87,6 +87,19 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
}
#endif

+#if defined(CONFIG_CGROUP_SCHED) && \
+ (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
+static void task_group_path(struct task_group *tg, char *buf, int buflen)
+{
+ /* may be NULL if the underlying cgroup isn't fully-created yet */
+ if (!tg->css.cgroup) {
+ buf[0] = '\0';
+ return;
+ }
+ cgroup_path(tg->css.cgroup, buf, buflen);
+}
+#endif
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
@@ -115,7 +128,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
char path[64];

rcu_read_lock();
- cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
+ task_group_path(task_group(p), path, sizeof(path));
rcu_read_unlock();
SEQ_printf(m, " %s", path);
}
@@ -147,19 +160,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_unlock_irqrestore(&tasklist_lock, flags);
}

-#if defined(CONFIG_CGROUP_SCHED) && \
- (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
- /* may be NULL if the underlying cgroup isn't fully-created yet */
- if (!tg->css.cgroup) {
- buf[0] = '\0';
- return;
- }
- cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,

Mike Galbraith

unread,

Nov 16, 2010, 9:20:01 AM11/16/10

to

On Tue, 2010-11-16 at 14:04 +0100, Oleg Nesterov wrote:
> On 11/15, Mike Galbraith wrote:
> >
> > On Mon, 2010-11-15 at 13:57 +0100, Oleg Nesterov wrote:
> >
> > > And the exiting task can do a lot before it disappears, probably
> > > we shouldn't ignore ->autogroup.
>
> I don't really understand what makes the exiting task different,
> but OK.
>
> However, I must admit I dislike this check. Because, looking at this
> code, it is not clear why do we check PF_EXITING. It looks as if it
> is needed for correctness.

Is _not_ needed I presume.

I'll remove it, I'm not overly attached (a t t a..;) to it.

> OK, this is minor. I think the patch is correct, just one nit below.
>
> > > It can't protect the change of signal->autogroup, multiple callers
> > > can use different rq's.
> >
> > Guaranteed live ->autogroup should be good enough for heuristic use, and
> > had better be so. Having to take ->siglock in the fast path would kill
> > using ->signal.
>
> Yes, sure, rq->lock should ensure signal->autogroup can't go away.
> (even if it can be changed under us). And it does, we are moving all
> threads before kref_put().

(yeah)

> > +static void
> > +autogroup_move_group(struct task_struct *p, struct autogroup *ag)
> > +{
> > + struct autogroup *prev;
> > + struct task_struct *t;
> > +
> > + prev = p->signal->autogroup;
> > + if (prev == ag)
> > + return;
> > +
> > + p->signal->autogroup = autogroup_kref_get(ag);
> > + sched_move_task(p);
> > +
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(t, &p->thread_group, thread_group) {
> > + sched_move_task(t);
> > + }
> > + rcu_read_unlock();
> > +
> > + autogroup_kref_put(prev);
> > +}
>
> Well, this looks a bit strange (but correct).

My mouse copied it.

> We are changing ->autogroup assuming the caller holds ->siglock.
> But if we hold ->siglock we do not need rcu_read_lock() to iterate
> over the thread_group, we can just do
>
> p->signal->autogroup = autogroup_kref_get(ag);
>
> t = p;
> do {
> sched_move_task(t);
> } while_each_thread(p, t);
>
> Again, this is minor, I won't insist.

I'll do it that way. I was pondering adding the option to move one or
all as cgroups does, but don't think that will ever be needed.

Thanks,

-Mike

Peter Zijlstra

unread,

Nov 16, 2010, 9:20:01 AM11/16/10

to

On Tue, 2010-11-16 at 07:02 -0700, Mike Galbraith wrote:
> While you can identify firefox etc,
> it's not being done, and requires identifying every application. Heck,
> cgroups is built in, but userspace doesn't even mount. Nothing but
> nothing uses cgroups.

The yet another init rewrite called systemd is supposedly cgroup happy..
No idea if its going to be useful though, I doubt its going to have an
effect on me launching a konsole or the like, or screen creating a bunch
of ttys.

Mike Galbraith

unread,

Nov 16, 2010, 9:30:05 AM11/16/10

to

On Tue, 2010-11-16 at 14:59 +0100, Peter Zijlstra wrote:
> On Mon, 2010-11-15 at 17:55 -0800, Paul Menage wrote:
> > On Mon, Nov 15, 2010 at 5:18 PM, Linus Torvalds
> > <torv...@linux-foundation.org> wrote:
> > >
> > > so sched_debug_show() is apparently calling cgroup_path() with a NULL
> > > cgroup. I think it's "print_task()" that is to blame, it does
> > >
> > > cgroup_path(task_group(p)->css.cgroup, ..
> > >
> > > without checking whether there _is_ any css.cgroup.
> >
> > Right - previously the returned task_group would be always associated
> > with a cgroup. Now, it may not be.
> >
> > The original task_group() should be made accessible for anything that
> > wants a real cgroup in the scheduler hierarchy, and called from the
> > new task_group() function. Not sure what the best naming convention
> > would be, maybe task_group() and effective_task_group() ?
>
> Right, that doesn't solve the full problem though.
>
> /proc/sched_debug should show these automagic task_groups, its just that
> there's currently no way to properly name them, we can of course add
> something like a name field to the struct autogroup thing, but what do
> we fill it with? "autogroup-%d" and keep a sequence number for each
> autogroup?

I was considering exactly that for /proc/N/cgroup visibility. Might get
thumped if I re-use that file though.

Mike Galbraith

unread,

Nov 16, 2010, 9:30:04 AM11/16/10

to

On Tue, 2010-11-16 at 15:01 +0100, Peter Zijlstra wrote:
> On Mon, 2010-11-15 at 14:25 -0700, Mike Galbraith wrote:
> > > > + if (p->flags & PF_EXITING)
> > > > + return false;
> > >
> > > Hmm, why? Perhaps PF_EXITING was needed in the previous version to
> > > avoid the race with release_task(). But now it is always safe to
> > > use signal->autogroup.
> >
> > That came into existence when I stress tested previous version in
> > PREEMPT_RT (boom). I see no good reason to bother an exiting task
> > though, so would prefer to leave it as is.
>
> PREEMPT_RT has a slightly different exit path IIRC. If that was the only
> thing you saw it explode on we could leave the check out for now and
> revisit it in the -rt patches when and if it pops up. Hmm?

Yeah, I'm going to whack it. (and add your lockdep thingy)

-Mike

Dhaval Giani

unread,

Nov 16, 2010, 9:50:02 AM11/16/10

to

On Tue, Nov 16, 2010 at 3:11 PM, Peter Zijlstra <a.p.zi...@chello.nl> wrote:
> On Tue, 2010-11-16 at 07:02 -0700, Mike Galbraith wrote:
>> While you can identify firefox etc,
>> it's not being done, and requires identifying every application. Heck,
>> cgroups is built in, but userspace doesn't even mount. Nothing but
>> nothing uses cgroups.
>
> The yet another init rewrite called systemd is supposedly cgroup happy..
> No idea if its going to be useful though, I doubt its going to have an
> effect on me launching a konsole or the like, or screen creating a bunch
> of ttys.

systemd uses cgroups only for process tracking. No resource
management. Though afaik, Lennart has some plans of doing resource
management using systemd. I wonder how autogroups will interact with
systemd in that case.

Dhaval

Oleg Nesterov

unread,

Nov 16, 2010, 10:20:02 AM11/16/10

to

On 11/16, Mike Galbraith wrote:
>
> On Tue, 2010-11-16 at 14:04 +0100, Oleg Nesterov wrote:
> > However, I must admit I dislike this check. Because, looking at this
> > code, it is not clear why do we check PF_EXITING. It looks as if it
> > is needed for correctness.
>
> Is _not_ needed I presume.
>
> I'll remove it, I'm not overly attached (a t t a..;) to it.

Argh!

I was wrong, it _is_ needed for correctness. Yes, it is always safe
to read the pointer, but

> > Yes, sure, rq->lock should ensure signal->autogroup can't go away.
> > (even if it can be changed under us). And it does, we are moving all
> > threads before kref_put().
>
> (yeah)

Exactly. And this means we can _only_ assume it can't go away if
autogroup_move_group() can see us on ->thread_group list.

Perhaps this deserve a commen (unless I missed something again).

Mike, sorry for confusion.

Oleg.

Mike Galbraith

unread,

Nov 16, 2010, 10:50:02 AM11/16/10

to

On Tue, 2010-11-16 at 16:03 +0100, Oleg Nesterov wrote:
> On 11/16, Mike Galbraith wrote:
> >
> > On Tue, 2010-11-16 at 14:04 +0100, Oleg Nesterov wrote:
> > > However, I must admit I dislike this check. Because, looking at this
> > > code, it is not clear why do we check PF_EXITING. It looks as if it
> > > is needed for correctness.
> >
> > Is _not_ needed I presume.
> >
> > I'll remove it, I'm not overly attached (a t t a..;) to it.
>
> Argh!
>
> I was wrong, it _is_ needed for correctness. Yes, it is always safe
> to read the pointer, but
>
> > > Yes, sure, rq->lock should ensure signal->autogroup can't go away.
> > > (even if it can be changed under us). And it does, we are moving all
> > > threads before kref_put().
> >
> > (yeah)
>
> Exactly. And this means we can _only_ assume it can't go away if
> autogroup_move_group() can see us on ->thread_group list.

Aha!

> Perhaps this deserve a commen (unless I missed something again).
>
> Mike, sorry for confusion.

Oh no, thank you. I hadn't figured it out yet, was going to go back and
poke rt kernel with sharp sticks. (exit can be one scary beast)

-Mike

Linus Torvalds

unread,

Nov 16, 2010, 12:20:02 PM11/16/10

to

On Tue, Nov 16, 2010 at 9:03 AM, Lennart Poettering
<mzxr...@0pointer.de> wrote:
>
> Binding something like this to TTYs is just backwards.

Numbers talk, bullshit walks.

The numbers have been quoted. The clear interactive behavior has been seen.

And you're just full of bullshit.

Come back when you have something working and with numbers and better
interactive performance. Until then, nobody cares.

Linus

Ingo Molnar

unread,

Nov 16, 2010, 12:30:02 PM11/16/10

to

* Mike Galbraith <efa...@gmx.de> wrote:

> > Exactly. And this means we can _only_ assume it can't go away if
> > autogroup_move_group() can see us on ->thread_group list.
>
> Aha!

Mike,

Mind sending a new patch with a separate v2 announcement in a new thread, once you
have something i could apply to the scheduler tree (for a v2.6.38 merge)?

You sent a couple of iterations in this discussion and i'd rather not fish out the
wrong one.

Thanks,

Ingo

Mike Galbraith

unread,

Nov 16, 2010, 12:50:02 PM11/16/10

to

On Tue, 2010-11-16 at 18:28 +0100, Ingo Molnar wrote:
> * Mike Galbraith <efa...@gmx.de> wrote:
>
> > > Exactly. And this means we can _only_ assume it can't go away if
> > > autogroup_move_group() can see us on ->thread_group list.
> >
> > Aha!
>
> Mike,
>
> Mind sending a new patch with a separate v2 announcement in a new thread, once you
> have something i could apply to the scheduler tree (for a v2.6.38 merge)?

Will do. (v3->936 became wider than expected;)

-Mike

Peter Zijlstra

unread,

Nov 16, 2010, 1:10:04 PM11/16/10

to

On Tue, 2010-11-16 at 18:03 +0100, Lennart Poettering wrote:
> Binding something like this to TTYs is just backwards. No graphical
> session has a TTY attached anymore. And there might be multiple TTYs
> used in the same session.

Using a group per tty makes sense for us console jockeys..

Anyway, nobody uses systemd yet and afaik not all distro's even plan on
using it (I know I'm not waiting to learn yet another init variant).

Lennart Poettering

unread,

Nov 16, 2010, 1:20:02 PM11/16/10

to

On Tue, 16.11.10 09:11, Linus Torvalds (torv...@linux-foundation.org) wrote:

>
> On Tue, Nov 16, 2010 at 9:03 AM, Lennart Poettering
> <mzxr...@0pointer.de> wrote:
> >
> > Binding something like this to TTYs is just backwards.
>
> Numbers talk, bullshit walks.
>
> The numbers have been quoted. The clear interactive behavior has been seen.

Here's my super-complex patch btw, to achieve exactly the same thing
from userspace without involving any kernel or systemd patching and
kernel-side logic. Simply edit your own ~/.bashrc and add this to the end:

if [ "$PS1" ] ; then
mkdir -m 0700 /sys/fs/cgroup/cpu/user/$$
echo $$ > /sys/fs/cgroup/cpu/user/$$/tasks
fi

Then, as the superuser do this:

mount -t cgroup cgroup /sys/fs/cgroup/cpu -o cpu
mkdir -m 0777 /sys/fs/cgroup/cpu/user

Done. Same effect. However: not crazy.

I am not sure I myself will find the time to prep some 'numbers' for
you. They'd be the same as with the kernel patch anyway. But I am sure
somebody else will do it for you...

Lennart

--
Lennart Poettering - Red Hat, Inc.

Paul Menage

unread,

Nov 16, 2010, 1:30:02 PM11/16/10

to

On Tue, Nov 16, 2010 at 4:58 AM, Mike Galbraith <efa...@gmx.de> wrote:
>
> A tasty alternative would be to have autogroup be it's own subsystem,
> with full cgroup userspace visibility/tweakability.

What exactly do you envisage by that? Having autogroup (in its current
incarnation) be a subsystem wouldn't really make sense - there's
already a cgroup subsystem for partitioning CPU scheduler groups. If
autogroups were integrated with cgroups I think that it would be as a
way of automatically creating (and destroying?) groups based on tty
connectedness.

We tried something like this with the ns subsystem, which would
create/enter a new cgroup whenever a new namespace was created; in the
end it turned out to be more of a nuisance than anything else.

People have proposed all sorts of in-kernel approaches for
auto-creation of cgroups based on things like userid, process name,
now tty, etc.

The previous effort for kernel process grouping (CKRM) started off
with a complex in-kernel rules engine that was ultimately dropped and
moved to userspace. My feeling is that userspace is a better place for
this - as Lennart pointed out, you can get a similar effect with a few
lines tweaking in a bash login script or a pam module that's much more
configurable from userspace and keeps all the existing cgroup stats
available.

Paul

Peter Zijlstra

unread,

Nov 16, 2010, 1:30:02 PM11/16/10

to

On Tue, 2010-11-16 at 19:16 +0100, Lennart Poettering wrote:
> On Tue, 16.11.10 09:11, Linus Torvalds (torv...@linux-foundation.org) wrote:
>
> >
> > On Tue, Nov 16, 2010 at 9:03 AM, Lennart Poettering
> > <mzxr...@0pointer.de> wrote:
> > >
> > > Binding something like this to TTYs is just backwards.
> >
> > Numbers talk, bullshit walks.
> >
> > The numbers have been quoted. The clear interactive behavior has been seen.
>
> Here's my super-complex patch btw, to achieve exactly the same thing
> from userspace without involving any kernel or systemd patching and
> kernel-side logic. Simply edit your own ~/.bashrc and add this to the end:
>
> if [ "$PS1" ] ; then
> mkdir -m 0700 /sys/fs/cgroup/cpu/user/$$
> echo $$ > /sys/fs/cgroup/cpu/user/$$/tasks
> fi
>
> Then, as the superuser do this:
>
> mount -t cgroup cgroup /sys/fs/cgroup/cpu -o cpu
> mkdir -m 0777 /sys/fs/cgroup/cpu/user
>
> Done. Same effect. However: not crazy.
>
> I am not sure I myself will find the time to prep some 'numbers' for
> you. They'd be the same as with the kernel patch anyway. But I am sure
> somebody else will do it for you...

Not quite the same, you're nesting one level deeper. But the reality is,
not a lot of people will change their userspace.

Paul Menage

unread,

Nov 16, 2010, 1:40:03 PM11/16/10

to

On Tue, Nov 16, 2010 at 10:21 AM, Peter Zijlstra <a.p.zi...@chello.nl> wrote:
>
> Not quite the same, you're nesting one level deeper. But the reality is,
> not a lot of people will change their userspace.

That's a weak argument - not a lot of people will (explicitly) change
their kernel either - they'll get a fresh kernel via their distro
updates, as they would get userspace updates. So it's only a few
people (distros) that actually need to make such a change.

Paul

Linus Torvalds

unread,

Nov 16, 2010, 2:00:04 PM11/16/10

to

On Tue, Nov 16, 2010 at 10:16 AM, Lennart Poettering
<mzxr...@0pointer.de> wrote:
>
> Here's my super-complex patch btw, to achieve exactly the same thing
> from userspace without involving any kernel or systemd patching and
> kernel-side logic. Simply edit your own ~/.bashrc and add this to the end:

Right. And that's basically how this "patch" was actually tested
originally - by doing this by hand, without actually having a patch in
hand. I told people: this seems to work really well. Mike made it work
automatically.

Because it's something we want to do it for all users, and for all
shells, and make sure it gets done automatically. Including for users
that have old distributions etc, and make it easy to do in one place.
And then you do it for all the other heuristics we can see easily in
the kernel. And then you do it magically without users even having to
_notice_.

Suddenly it doesn't seem that wonderful any more to play with bashrc, does it?

That's the point. We can push out the kernel change, and everything
will "just work". We can make that feature we already have in the
kernel actually be _useful_.

User-level configuration for something that should just work is
annoying. We can do better.

Put another way: if we find a better way to do something, we should
_not_ say "well, if users want it, they can do this <technical thing
here>". If it really is a better way to do something, we should just
do it. Requiring user setup is _not_ a feature.

Now, I'm not saying that we shouldn't allow users to use cgroups. Of
course they can do things manually too. But we shouldn't require users
to do silly things that we can more easily do ourselves.

If the choice is between telling everybody "you should do this", and
"we should just do this for you", I'll take the second one every time.
We know it should be done. Why should we then tell somebody else to do
it for us?

Linus

da...@lang.hm

unread,

Nov 16, 2010, 2:00:02 PM11/16/10

to

On Tue, 16 Nov 2010, Paul Menage wrote:

> On Tue, Nov 16, 2010 at 10:21 AM, Peter Zijlstra <a.p.zi...@chello.nl> wrote:
>>
>> Not quite the same, you're nesting one level deeper. But the reality is,
>> not a lot of people will change their userspace.
>
> That's a weak argument - not a lot of people will (explicitly) change
> their kernel either - they'll get a fresh kernel via their distro
> updates, as they would get userspace updates. So it's only a few
> people (distros) that actually need to make such a change.

what is the downside of this patch going to be?

people who currently expect all the processes to compete equally will now
find that they no longer do so. If I am understanding this correctly, this
could mean that a box that was dedicated to running one application will
now have that application no longer dominate the system, instead it will
'share equally' with the housekeeping apps on the system.

what would need to be done to revert to the prior situation?

David Lang

Stephen Clark

unread,

Nov 16, 2010, 2:00:03 PM11/16/10

to

On 11/16/2010 01:16 PM, Lennart Poettering wrote:
> On Tue, 16.11.10 09:11, Linus Torvalds (torv...@linux-foundation.org) wrote:
>
>
>> On Tue, Nov 16, 2010 at 9:03 AM, Lennart Poettering
>> <mzxr...@0pointer.de> wrote:
>>
>>> Binding something like this to TTYs is just backwards.
>>>
>> Numbers talk, bullshit walks.
>>
>> The numbers have been quoted. The clear interactive behavior has been seen.
>>
> Here's my super-complex patch btw, to achieve exactly the same thing
> from userspace without involving any kernel or systemd patching and
> kernel-side logic. Simply edit your own ~/.bashrc and add this to the end:
>
> if [ "$PS1" ] ; then
> mkdir -m 0700 /sys/fs/cgroup/cpu/user/$$
> echo $$> /sys/fs/cgroup/cpu/user/$$/tasks
> fi
>
> Then, as the superuser do this:
>
> mount -t cgroup cgroup /sys/fs/cgroup/cpu -o cpu
> mkdir -m 0777 /sys/fs/cgroup/cpu/user
>
> Done. Same effect. However: not crazy.
>
> I am not sure I myself will find the time to prep some 'numbers' for
> you. They'd be the same as with the kernel patch anyway. But I am sure
> somebody else will do it for you...
>
> Lennart
>
>

So you have tested this and have a nice demo and numbers to back it up?

--

"They that give up essential liberty to obtain temporary safety,
deserve neither liberty nor safety." (Ben Franklin)

"The course of history shows that as a government grows, liberty
decreases." (Thomas Jefferson)