[RFC] sched: implement the exclusive wait queue as a LIFO queue

Changli Gao

unread,

Apr 28, 2010, 1:30:25 AM4/28/10

to Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org, Changli Gao

implement the exclusive wait queue as a LIFO queue

If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
process who goes to sleep recently, will be woke up first. As its memory is
more likely in cache, we will get better performance. And when there are many
processes waiting on a exclusive wait queue, some of them may not be woke up,
if the others can handle the workload, and it will reduce the load of
the scheduler.

Note: before applying this patch, you need my previous patch patched first.
https://patchwork.kernel.org/patch/95600/

Signed-off-by: Changli Gao <xia...@gmail.com>
----
fs/eventpoll.c | 3 +--
include/linux/wait.h | 17 +++++++----------
kernel/sched.c | 8 ++++----
kernel/wait.c | 9 +++------
4 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5..e9b3ebe 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue(&ep->wq, &wait);
+ __add_wait_queue_ex(&ep->wq, &wait);

for (;;) {
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b..95c127d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -30,8 +30,6 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);

struct __wait_queue {
- unsigned int flags;
-#define WQ_FLAG_EXCLUSIVE 0x01
void *private;
wait_queue_func_t func;
struct list_head task_list;
@@ -50,6 +48,7 @@ struct wait_bit_queue {
struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list;
+ struct list_head task_list_ex;
};
typedef struct __wait_queue_head wait_queue_head_t;

@@ -69,7 +68,8 @@ struct task_struct;

#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
- .task_list = { &(name).task_list, &(name).task_list } }
+ .task_list = { &(name).task_list, &(name).task_list }, \
+ .task_list_ex = { &(name).task_list_ex, &(name).task_list_ex } }

#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -97,7 +97,6 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *)

static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
- q->flags = 0;
q->private = p;
q->func = default_wake_function;
}
@@ -105,14 +104,13 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
static inline void init_waitqueue_func_entry(wait_queue_t *q,
wait_queue_func_t func)
{
- q->flags = 0;
q->private = NULL;
q->func = func;
}

static inline int waitqueue_active(wait_queue_head_t *q)
{
- return !list_empty(&q->task_list);
+ return !list_empty(&q->task_list) || !list_empty(&q->task_list);
}

extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
@@ -127,10 +125,10 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
/*
* Used for wake-one threads:
*/
-static inline void __add_wait_queue_tail(wait_queue_head_t *head,
+static inline void __add_wait_queue_ex(wait_queue_head_t *head,
wait_queue_t *new)
{
- list_add_tail(&new->task_list, &head->task_list);
+ list_add(&new->task_list, &head->task_list_ex);
}

static inline void __remove_wait_queue(wait_queue_head_t *head,
@@ -409,8 +407,7 @@ do { \
static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
wait_queue_t * wait)
{
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
}

/*
diff --git a/kernel/sched.c b/kernel/sched.c
index be5ab70..59b1534 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3903,11 +3903,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
{
wait_queue_t *curr, *next;

- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list)
+ curr->func(curr, mode, wake_flags, key);

- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ list_for_each_entry_safe(curr, next, &q->task_list_ex, task_list) {
+ if (curr->func(curr, mode, wake_flags, key) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d8..a0559df 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -15,6 +15,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
spin_lock_init(&q->lock);
lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
+ INIT_LIST_HEAD(&q->task_list_ex);
}

EXPORT_SYMBOL(__init_waitqueue_head);
@@ -23,7 +24,6 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
@@ -34,9 +34,8 @@ void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
@@ -69,7 +68,6 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
@@ -83,10 +81,9 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;

- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Changli Gao

unread,

Apr 28, 2010, 2:23:18 AM4/28/10

to Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org, Changli Gao

On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <xia...@gmail.com> wrote:
>
> static inline int waitqueue_active(wait_queue_head_t *q)
> {
> - return !list_empty(&q->task_list);
> + return !list_empty(&q->task_list) || !list_empty(&q->task_list);
> }
>

I am sorry. the later task_list should be task_list_ex. The updated
patch is attached. And I will replace task_list list with hlist for
saving space.

--
Regards，
Changli Gao(xia...@gmail.com)

wq-ex-lifo.diff

Xiaotian Feng

unread,

Apr 28, 2010, 3:48:06 AM4/28/10

to Changli Gao, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <xia...@gmail.com> wrote:

> implement the exclusive wait queue as a LIFO queue
>
> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
> process who goes to sleep recently, will be woke up first. As its memory is
> more likely in cache, we will get better performance. And when there are many
> processes waiting on a exclusive wait queue, some of them may not be woke up,
> if the others can handle the workload, and it will reduce the load of
> the scheduler.
>

Starve some processes for performance?

Changli Gao

unread,

Apr 28, 2010, 3:52:28 AM4/28/10

to Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 3:47 PM, Xiaotian Feng <xtf...@gmail.com> wrote:
> On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <xia...@gmail.com> wrote:
>> implement the exclusive wait queue as a LIFO queue
>>
>> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
>> process who goes to sleep recently, will be woke up first. As its memory is
>> more likely in cache, we will get better performance. And when there are many
>> processes waiting on a exclusive wait queue, some of them may not be woke up,
>> if the others can handle the workload, and it will reduce the load of
>> the scheduler.
>>
>
> Starve some processes for performance?
>

Starve? Oh, No. If we don't need these processes, and we can do better
without them, why we wake them up?

--
Regards，
Changli Gao(xia...@gmail.com)

Changli Gao

unread,

Apr 28, 2010, 4:06:29 AM4/28/10

to Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org, Changli Gao

On Wed, Apr 28, 2010 at 2:22 PM, Changli Gao <xia...@gmail.com> wrote:
>
> I am sorry. the later task_list should be task_list_ex. The updated
> patch is attached. And I will replace task_list list with hlist for
> saving space.
>

This version replaces list with hlist.

--
Regards，
Changli Gao(xia...@gmail.com)

wq-ex-lifo.diff

Yong Zhang

unread,

Apr 28, 2010, 4:17:56 AM4/28/10

to Changli Gao, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 03:52:01PM +0800, Changli Gao wrote:
> On Wed, Apr 28, 2010 at 3:47 PM, Xiaotian Feng <xtf...@gmail.com> wrote:
> > On Wed, Apr 28, 2010 at 1:03 PM, Changli Gao <xia...@gmail.com> wrote:
> >> implement the exclusive wait queue as a LIFO queue
> >>
> >> If the exclusive wait queue is also a LIFO queue as the normal wait queue, the
> >> process who goes to sleep recently, will be woke up first. As its memory is
> >> more likely in cache, we will get better performance. And when there are many
> >> processes waiting on a exclusive wait queue, some of them may not be woke up,
> >> if the others can handle the workload, and it will reduce the load of
> >> the scheduler.
> >>
> >
> > Starve some processes for performance?
> >
>
> Starve? Oh, No. If we don't need these processes, and we can do better

What do you mean "we don't need these processes"?

> without them, why we wake them up?

So some processs(at the tail of exclusive list)will be treated abnormally
and it will sleep for a long time, is this reasonable?

Changli Gao

unread,

Apr 28, 2010, 4:24:21 AM4/28/10

to Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 4:15 PM, Yong Zhang <yong....@windriver.com> wrote:
>
> What do you mean "we don't need these processes"?

If the work is less than the workers, we don't need the workers at the
tail of the exculsive list.

>
>> without them, why we wake them up?
>
> So some processs(at the tail of exclusive list)will be treated abnormally
> and it will sleep for a long time, is this reasonable?
>

If there isn't enough work to be done, we'd better not disrupt them
and leave them sleeping forever to keep the scheduler happier. Do we
have reason to keep fair to all the workers? Does it have benefit?

Johannes Weiner

unread,

Apr 28, 2010, 5:26:57 AM4/28/10

to Changli Gao, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, David Howells, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 04:23:52PM +0800, Changli Gao wrote:
> On Wed, Apr 28, 2010 at 4:15 PM, Yong Zhang <yong....@windriver.com> wrote:
> >
> > What do you mean "we don't need these processes"?
>
> If the work is less than the workers, we don't need the workers at the
> tail of the exculsive list.

Have you checked how exclusive waitqueues are even used?

> > So some processs(at the tail of exclusive list)will be treated abnormally
> > and it will sleep for a long time, is this reasonable?
> >
>
> If there isn't enough work to be done, we'd better not disrupt them
> and leave them sleeping forever to keep the scheduler happier. Do we
> have reason to keep fair to all the workers? Does it have benefit?

How about starving lock contenders? See wait_on_bit_lock() and grep
for the users e.g.

Changli Gao

unread,

Apr 28, 2010, 7:17:55 AM4/28/10

to David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 5:29 PM, David Howells <dhow...@redhat.com> wrote:
> Changli Gao <xia...@gmail.com> wrote:
>
>> If there isn't enough work to be done, we'd better not disrupt them
>> and leave them sleeping forever to keep the scheduler happier. Do we
>> have reason to keep fair to all the workers? Does it have benefit?
>

> You've made one important assumption: the processes on the wait queue are
> sleeping waiting to service things... but what if the wait queue governs
> access to a resource, and all the processes on that wait queue need access to
> that resource to do things? Some of the processes waiting for it may never
> get a go, and so necessary work may be left undone.
>

You are right. I made the wrong assumption. But we indeed need some
primitive to add wait_queue at the head of the wait_queue_head, and I
know epoll needs it, at least.

fs/eventpoll.c: 1443.
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(&ep->wq, &wait);

Jamie Lokier

unread,

Apr 28, 2010, 9:22:54 AM4/28/10

to Changli Gao, David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

Changli Gao wrote:
> On Wed, Apr 28, 2010 at 5:29 PM, David Howells <dhow...@redhat.com> wrote:
> > Changli Gao <xia...@gmail.com> wrote:
> >
> >> If there isn't enough work to be done, we'd better not disrupt them
> >> and leave them sleeping forever to keep the scheduler happier. Do we
> >> have reason to keep fair to all the workers? Does it have benefit?
> >
> > You've made one important assumption: the processes on the wait queue are
> > sleeping waiting to service things... but what if the wait queue governs
> > access to a resource, and all the processes on that wait queue need access to
> > that resource to do things? Some of the processes waiting for it may never
> > get a go, and so necessary work may be left undone.
> >
>
> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);

The same thing about assumptions applies here. The userspace process
may be waiting for an epoll condition to get access to a resource,
rather than being a worker thread interchangeable with others.

For example, userspace might be using a pipe as a signal-safe lock, or
signal-safe multi-token semaphore, and epoll to wait for that pipe.

WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
pointless thundering herd. It doesn't mean unfairness is ok.

The LIFO idea _might_ make sense for interchangeable worker-thread
situations - including userspace. It would make sense for pipe
waiters, socket waiters (especially accept), etc.

Do you have any measurements which showing the LIFO mode performing
better than FIFO, and by how much?

-- Jamie

David Howells

unread,

Apr 28, 2010, 9:23:27 AM4/28/10

to Changli Gao, dhow...@redhat.com, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

Can you split the wait queue code and differentiate exclusive wait queues with
LIFO functionality from wait queues with FIFO functionality. I can see why
your suggestion is desirable.

David

Changli Gao

unread,

Apr 28, 2010, 9:42:39 AM4/28/10

to Jamie Lokier, David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <ja...@shareable.org> wrote:
> Changli Gao wrote:
>>
>> fs/eventpoll.c: 1443.
>> wait.flags |= WQ_FLAG_EXCLUSIVE;
>> __add_wait_queue(&ep->wq, &wait);
>
> The same thing about assumptions applies here. The userspace process
> may be waiting for an epoll condition to get access to a resource,
> rather than being a worker thread interchangeable with others.

Oh, the lines above are the current ones. So the assumptions applies
and works here.

>
> For example, userspace might be using a pipe as a signal-safe lock, or
> signal-safe multi-token semaphore, and epoll to wait for that pipe.
>
> WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> pointless thundering herd. It doesn't mean unfairness is ok.

The users should not make any assumption about the waking up sequence,
neither LIFO nor FIFO.

>
> The LIFO idea _might_ make sense for interchangeable worker-thread
> situations - including userspace. It would make sense for pipe
> waiters, socket waiters (especially accept), etc.

Yea, and my following patches are for socket waiters.

>
> Do you have any measurements which showing the LIFO mode performing
> better than FIFO, and by how much?
>

I didn't do any test yet. But some work done by LSE project years ago
showed that it is better.

http://lse.sourceforge.net/io/aionotes.txt

" Also in view of
better cache utilization the wake queue mechanism is LIFO by default.
(A new exclusive LIFO wakeup option has been introduced for this purpose)"

--
Regards，
Changli Gao(xia...@gmail.com)

Changli Gao

unread,

Apr 28, 2010, 9:56:58 AM4/28/10

to David Howells, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 5:32 PM, David Howells <dhow...@redhat.com> wrote:
> Changli Gao <xia...@gmail.com> wrote:
>

>> @@ -50,6 +48,7 @@ struct wait_bit_queue {
>> struct __wait_queue_head {
>> spinlock_t lock;
>> struct list_head task_list;
>> + struct list_head task_list_ex;
>

> It would be preferable it if you could avoid making struct __wait_queue_head
> bigger. That will increase the size of a lot of things.
>

I don't know how to do that, as maybe there are non-exclusive and
exclusive wait queues in the same wait queue head. If we want to
enqueue exclusive wait queues at the head of exclusive queues, we have
to know where the head is, otherwise, we have to loop to find the head
when enqueuing.

--
Regards，
Changli Gao(xia...@gmail.com)

David Howells

unread,

Apr 28, 2010, 10:10:36 AM4/28/10

to Changli Gao, dhow...@redhat.com, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

Changli Gao <xia...@gmail.com> wrote:

> I don't know how to do that, as maybe there are non-exclusive and
> exclusive wait queues in the same wait queue head. If we want to
> enqueue exclusive wait queues at the head of exclusive queues, we have
> to know where the head is, otherwise, we have to loop to find the head
> when enqueuing.

I suspect you really want to have the semantics defined per-queue. _Either_ a
queue is FIFO (such as processes waiting for a resource so they can do
something with it) _or_ it is LIFO (such as a pool of processes waiting to be
given work).

How often do the two actually mix? And if they do, is that really an error?

David

Changli Gao

unread,

Apr 28, 2010, 10:53:51 AM4/28/10

to David Howells, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 10:06 PM, David Howells <dhow...@redhat.com> wrote:
>
> I suspect you really want to have the semantics defined per-queue. _Either_ a
> queue is FIFO (such as processes waiting for a resource so they can do
> something with it) _or_ it is LIFO (such as a pool of processes waiting to be
> given work).
>
> How often do the two actually mix? And if they do, is that really an error?
>

The sock.sk_sleep is used by exclusive and non-exclusive wait queues.
exclusive and non-exclusive is identified by wait queues, not wait
queue heads. Maybe there is a historical reason. It is much like a
hack.

--
Regards，
Changli Gao(xia...@gmail.com)

David Howells

unread,

Apr 28, 2010, 11:02:42 AM4/28/10

to Changli Gao, dhow...@redhat.com, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

Changli Gao <xia...@gmail.com> wrote:

> > How often do the two actually mix? And if they do, is that really an
> > error?
>

> The sock.sk_sleep is used by exclusive and non-exclusive wait queues.
> exclusive and non-exclusive is identified by wait queues, not wait
> queue heads. Maybe there is a historical reason. It is much like a
> hack.

But is it actually used in both modes at the same time?

Jamie Lokier

unread,

Apr 28, 2010, 11:26:19 AM4/28/10

to Changli Gao, David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

Changli Gao wrote:
> On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <ja...@shareable.org> wrote:
> > Changli Gao wrote:
> >>
> >> fs/eventpoll.c: 1443.
> >> wait.flags |= WQ_FLAG_EXCLUSIVE;
> >> __add_wait_queue(&ep->wq, &wait);
> >
> > The same thing about assumptions applies here. The userspace process
> > may be waiting for an epoll condition to get access to a resource,
> > rather than being a worker thread interchangeable with others.
>
> Oh, the lines above are the current ones. So the assumptions applies
> and works here.

No, because WQ_FLAG_EXCLUSIVE doesn't have your LIFO semantic at the moment.

Your patch changes the behaviour of epoll, though I don't know if it
matters. Perhaps all programs which have multiple tasks waiting on
the same epoll fd are "interchangeable worker thread" types anyway :-)

> > For example, userspace might be using a pipe as a signal-safe lock, or
> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
> >
> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> > pointless thundering herd. It doesn't mean unfairness is ok.
>
> The users should not make any assumption about the waking up sequence,
> neither LIFO nor FIFO.

Correct, but they should be able to assume non-starvation (eventual
progress) for all waiters.

It's one of those subtle things, possibly a unixy thing: Non-RT tasks
should always make progress when the competition is just other non-RT
tasks, even if the progress is slow.

Starvation can spread out beyond the starved process, to cause
priority inversions in other tasks that are waiting on a resource
locked by the starved process. Among other things, that can cause
higher priority tasks, and RT priority tasks, to block permanently.
Very unpleasant.

> > The LIFO idea _might_ make sense for interchangeable worker-thread
> > situations - including userspace. It would make sense for pipe
> > waiters, socket waiters (especially accept), etc.
>
> Yea, and my following patches are for socket waiters.

Occasionally unix socketpairs are occasionally used in the above ways too.

I'm not against your patch, but I worry that starvation is a new
semantic, and it may have a significant effect on something - either
in the kernel, or in userspace which is harder to check.

> > Do you have any measurements which showing the LIFO mode performing
> > better than FIFO, and by how much?
>
> I didn't do any test yet. But some work done by LSE project years ago
> showed that it is better.
>
> http://lse.sourceforge.net/io/aionotes.txt
>
> " Also in view of
> better cache utilization the wake queue mechanism is LIFO by default.
> (A new exclusive LIFO wakeup option has been introduced for this purpose)"

I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
prevent starvation while getting some of the locality benefit.
Something like add-LIFO and increment a small counter in the next wait
entry, but never add in front of an entry whose counter has reached
MAX_LIFO_WAITERS? :-)

-- Jamie

Changli Gao

unread,

Apr 28, 2010, 11:34:13 AM4/28/10

to David Howells, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 11:00 PM, David Howells <dhow...@redhat.com> wrote:
>
> But is it actually used in both modes at the same time?
>

It should be seldom in good applications. So I only need to add a
function, like this:

add_wait_queue_head_exclusive(wq, wait)
first_exclusive = find_the_first_exclusive(wq); // it is fast, as
there should no or few non-exclusive wait queues
list_add(wait, first_exclusive);

--
Regards，
Changli Gao(xia...@gmail.com)

Changli Gao

unread,

Apr 28, 2010, 11:50:16 AM4/28/10

to Jamie Lokier, David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Davide Libenzi, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, linux-...@vger.kernel.org

On Wed, Apr 28, 2010 at 11:25 PM, Jamie Lokier <ja...@shareable.org> wrote:
> Changli Gao wrote:
>> On Wed, Apr 28, 2010 at 9:21 PM, Jamie Lokier <ja...@shareable.org> wrote:
>> > Changli Gao wrote:
>> >>
>> >> fs/eventpoll.c: 1443.
>> >> wait.flags |= WQ_FLAG_EXCLUSIVE;
>> >> __add_wait_queue(&ep->wq, &wait);
>> >
>> > The same thing about assumptions applies here. The userspace process
>> > may be waiting for an epoll condition to get access to a resource,
>> > rather than being a worker thread interchangeable with others.
>>
>> Oh, the lines above are the current ones. So the assumptions applies
>> and works here.
>
> No, because WQ_FLAG_EXCLUSIVE doesn't have your LIFO semantic at the moment.
>
> Your patch changes the behaviour of epoll, though I don't know if it
> matters. Perhaps all programs which have multiple tasks waiting on
> the same epoll fd are "interchangeable worker thread" types anyway :-)
>

No. You are wrong. I meant epoll implemented LIFO on its own. You
should check the code. :)

>> > For example, userspace might be using a pipe as a signal-safe lock, or
>> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
>> >
>> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
>> > pointless thundering herd. It doesn't mean unfairness is ok.
>>
>> The users should not make any assumption about the waking up sequence,
>> neither LIFO nor FIFO.
>
> Correct, but they should be able to assume non-starvation (eventual
> progress) for all waiters.
>
> It's one of those subtle things, possibly a unixy thing: Non-RT tasks
> should always make progress when the competition is just other non-RT
> tasks, even if the progress is slow.
>
> Starvation can spread out beyond the starved process, to cause
> priority inversions in other tasks that are waiting on a resource
> locked by the starved process. Among other things, that can cause
> higher priority tasks, and RT priority tasks, to block permanently.
> Very unpleasant.
>
>> > The LIFO idea _might_ make sense for interchangeable worker-thread
>> > situations - including userspace. It would make sense for pipe
>> > waiters, socket waiters (especially accept), etc.
>>
>> Yea, and my following patches are for socket waiters.
>
> Occasionally unix socketpairs are occasionally used in the above ways too.
>
> I'm not against your patch, but I worry that starvation is a new
> semantic, and it may have a significant effect on something - either
> in the kernel, or in userspace which is harder to check.

Thanks for your reminding.

>
> I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
> prevent starvation while getting some of the locality benefit.
> Something like add-LIFO and increment a small counter in the next wait
> entry, but never add in front of an entry whose counter has reached
> MAX_LIFO_WAITERS? :-)
>

It is a little complex, and I'll keep it simple and improve it when necessary.

--
Regards，
Changli Gao(xia...@gmail.com)

Davide Libenzi

unread,

Apr 28, 2010, 2:57:14 PM4/28/10

to Changli Gao, David Howells, Yong Zhang, Xiaotian Feng, Ingo Molnar, Alexander Viro, Andrew Morton, Eric W. Biederman, Roland Dreier, Stefan Richter, Peter Zijlstra, David S. Miller, Eric Dumazet, Christoph Lameter, Andreas Herrmann, Thomas Gleixner, Takashi Iwai, linux-...@vger.kernel.org, Linux Kernel Mailing List

On Wed, 28 Apr 2010, Changli Gao wrote:

> On Wed, Apr 28, 2010 at 5:29 PM, David Howells <dhow...@redhat.com> wrote:
> > Changli Gao <xia...@gmail.com> wrote:
> >
> >> If there isn't enough work to be done, we'd better not disrupt them
> >> and leave them sleeping forever to keep the scheduler happier. Do we
> >> have reason to keep fair to all the workers? Does it have benefit?
> >
> > You've made one important assumption: the processes on the wait queue are
> > sleeping waiting to service things... but what if the wait queue governs
> > access to a resource, and all the processes on that wait queue need access to
> > that resource to do things? Some of the processes waiting for it may never
> > get a go, and so necessary work may be left undone.
> >
>
> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);

I'm not sure one user deserves a new function, but like it has been
noticed, the patch for that should eventually be totally isolated from
other bits.

- Davide