If the exclusive wait queue is also a LIFO queue like the normal wait queue, the
process that went to sleep most recently will be woken up first. As its memory is
more likely to still be in the cache, we will get better performance. And when there
are many processes waiting on an exclusive wait queue, some of them may never be
woken up if the others can handle the workload, which reduces the load on
the scheduler.
Note: before applying this patch, you need to apply my previous patch first.
https://patchwork.kernel.org/patch/95600/
Signed-off-by: Changli Gao <xia...@gmail.com>
----
fs/eventpoll.c | 3 +--
include/linux/wait.h | 17 +++++++----------
kernel/sched.c | 8 ++++----
kernel/wait.c | 9 +++------
4 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index bd056a5..e9b3ebe 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1140,8 +1140,7 @@ retry:
* ep_poll_callback() when events will become available.
*/
init_waitqueue_entry(&wait, current);
- wait.flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue(&ep->wq, &wait);
+ __add_wait_queue_ex(&ep->wq, &wait);
for (;;) {
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index a48e16b..95c127d 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -30,8 +30,6 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v
int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key);
struct __wait_queue {
- unsigned int flags;
-#define WQ_FLAG_EXCLUSIVE 0x01
void *private;
wait_queue_func_t func;
struct list_head task_list;
@@ -50,6 +48,7 @@ struct wait_bit_queue {
struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list;
+ struct list_head task_list_ex;
};
typedef struct __wait_queue_head wait_queue_head_t;
@@ -69,7 +68,8 @@ struct task_struct;
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
- .task_list = { &(name).task_list, &(name).task_list } }
+ .task_list = { &(name).task_list, &(name).task_list }, \
+ .task_list_ex = { &(name).task_list_ex, &(name).task_list_ex } }
#define DECLARE_WAIT_QUEUE_HEAD(name) \
wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@ -97,7 +97,6 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *)
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
- q->flags = 0;
q->private = p;
q->func = default_wake_function;
}
@@ -105,14 +104,13 @@ static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
static inline void init_waitqueue_func_entry(wait_queue_t *q,
wait_queue_func_t func)
{
- q->flags = 0;
q->private = NULL;
q->func = func;
}
static inline int waitqueue_active(wait_queue_head_t *q)
{
- return !list_empty(&q->task_list);
+ return !list_empty(&q->task_list) || !list_empty(&q->task_list_ex);
}
extern void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait);
@@ -127,10 +125,10 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
/*
* Used for wake-one threads:
*/
-static inline void __add_wait_queue_tail(wait_queue_head_t *head,
+static inline void __add_wait_queue_ex(wait_queue_head_t *head,
wait_queue_t *new)
{
- list_add_tail(&new->task_list, &head->task_list);
+ list_add(&new->task_list, &head->task_list_ex);
}
static inline void __remove_wait_queue(wait_queue_head_t *head,
@@ -409,8 +407,7 @@ do { \
static inline void add_wait_queue_exclusive_locked(wait_queue_head_t *q,
wait_queue_t * wait)
{
- wait->flags |= WQ_FLAG_EXCLUSIVE;
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
}
/*
diff --git a/kernel/sched.c b/kernel/sched.c
index be5ab70..59b1534 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3903,11 +3903,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
{
wait_queue_t *curr, *next;
- list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
- unsigned flags = curr->flags;
+ list_for_each_entry_safe(curr, next, &q->task_list, task_list)
+ curr->func(curr, mode, wake_flags, key);
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ list_for_each_entry_safe(curr, next, &q->task_list_ex, task_list) {
+ if (curr->func(curr, mode, wake_flags, key) && !--nr_exclusive)
break;
}
}
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d8..a0559df 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -15,6 +15,7 @@ void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
spin_lock_init(&q->lock);
lockdep_set_class(&q->lock, key);
INIT_LIST_HEAD(&q->task_list);
+ INIT_LIST_HEAD(&q->task_list_ex);
}
EXPORT_SYMBOL(__init_waitqueue_head);
@@ -23,7 +24,6 @@ void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
@@ -34,9 +34,8 @@ void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
@@ -69,7 +68,6 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
- wait->flags &= ~WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
__add_wait_queue(q, wait);
@@ -83,10 +81,9 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
unsigned long flags;
- wait->flags |= WQ_FLAG_EXCLUSIVE;
spin_lock_irqsave(&q->lock, flags);
if (list_empty(&wait->task_list))
- __add_wait_queue_tail(q, wait);
+ __add_wait_queue_ex(q, wait);
set_current_state(state);
spin_unlock_irqrestore(&q->lock, flags);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
I am sorry. the later task_list should be task_list_ex. The updated
patch is attached. And I will replace task_list list with hlist for
saving space.
--
Regards,
Changli Gao(xia...@gmail.com)
Starve some processes for performance?
Starve? Oh, No. If we don't need these processes, and we can do better
without them, why we wake them up?
--
Regards,
Changli Gao(xia...@gmail.com)
This version replaces list with hlist.
--
Regards,
Changli Gao(xia...@gmail.com)
What do you mean "we don't need these processes"?
> without them, why we wake them up?
So some processes (at the tail of the exclusive list) will be treated abnormally
and may sleep for a long time; is this reasonable?
If there is less work than there are workers, we don't need the workers at the
tail of the exclusive list.
>
>> without them, why we wake them up?
>
> So some processs(at the tail of exclusive list)will be treated abnormally
> and it will sleep for a long time, is this reasonable?
>
If there isn't enough work to be done, we'd better not disrupt them
and leave them sleeping forever to keep the scheduler happier. Do we
have reason to keep fair to all the workers? Does it have benefit?
Have you checked how exclusive waitqueues are even used?
> > So some processs(at the tail of exclusive list)will be treated abnormally
> > and it will sleep for a long time, is this reasonable?
> >
>
> If there isn't enough work to be done, we'd better not disrupt them
> and leave them sleeping forever to keep the scheduler happier. Do we
> have reason to keep fair to all the workers? Does it have benefit?
How about starving lock contenders? See wait_on_bit_lock() and grep
for the users e.g.
You are right. I made the wrong assumption. But we indeed need some
primitive to add wait_queue at the head of the wait_queue_head, and I
know epoll needs it, at least.
fs/eventpoll.c: 1443.
wait.flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(&ep->wq, &wait);
The same thing about assumptions applies here. The userspace process
may be waiting for an epoll condition to get access to a resource,
rather than being a worker thread interchangeable with others.
For example, userspace might be using a pipe as a signal-safe lock, or
signal-safe multi-token semaphore, and epoll to wait for that pipe.
WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
pointless thundering herd. It doesn't mean unfairness is ok.
The LIFO idea _might_ make sense for interchangeable worker-thread
situations - including userspace. It would make sense for pipe
waiters, socket waiters (especially accept), etc.
Do you have any measurements which showing the LIFO mode performing
better than FIFO, and by how much?
-- Jamie
David
Oh, the lines above are the current ones. So the assumptions applies
and works here.
>
> For example, userspace might be using a pipe as a signal-safe lock, or
> signal-safe multi-token semaphore, and epoll to wait for that pipe.
>
> WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> pointless thundering herd. It doesn't mean unfairness is ok.
The users should not make any assumption about the waking up sequence,
neither LIFO nor FIFO.
>
> The LIFO idea _might_ make sense for interchangeable worker-thread
> situations - including userspace. It would make sense for pipe
> waiters, socket waiters (especially accept), etc.
Yea, and my following patches are for socket waiters.
>
> Do you have any measurements which showing the LIFO mode performing
> better than FIFO, and by how much?
>
I didn't do any test yet. But some work done by LSE project years ago
showed that it is better.
http://lse.sourceforge.net/io/aionotes.txt
" Also in view of
better cache utilization the wake queue mechanism is LIFO by default.
(A new exclusive LIFO wakeup option has been introduced for this purpose)"
--
Regards,
Changli Gao(xia...@gmail.com)
I don't know how to do that, as maybe there are non-exclusive and
exclusive wait queues in the same wait queue head. If we want to
enqueue exclusive wait queues at the head of exclusive queues, we have
to know where the head is, otherwise, we have to loop to find the head
when enqueuing.
--
Regards,
Changli Gao(xia...@gmail.com)
> I don't know how to do that, as maybe there are non-exclusive and
> exclusive wait queues in the same wait queue head. If we want to
> enqueue exclusive wait queues at the head of exclusive queues, we have
> to know where the head is, otherwise, we have to loop to find the head
> when enqueuing.
I suspect you really want to have the semantics defined per-queue. _Either_ a
queue is FIFO (such as processes waiting for a resource so they can do
something with it) _or_ it is LIFO (such as a pool of processes waiting to be
given work).
How often do the two actually mix? And if they do, is that really an error?
David
The sock.sk_sleep is used by exclusive and non-exclusive wait queues.
exclusive and non-exclusive is identified by wait queues, not wait
queue heads. Maybe there is a historical reason. It is much like a
hack.
--
Regards,
Changli Gao(xia...@gmail.com)
> > How often do the two actually mix? And if they do, is that really an
> > error?
>
> The sock.sk_sleep is used by exclusive and non-exclusive wait queues.
> exclusive and non-exclusive is identified by wait queues, not wait
> queue heads. Maybe there is a historical reason. It is much like a
> hack.
But is it actually used in both modes at the same time?
No, because WQ_FLAG_EXCLUSIVE doesn't have your LIFO semantic at the moment.
Your patch changes the behaviour of epoll, though I don't know if it
matters. Perhaps all programs which have multiple tasks waiting on
the same epoll fd are "interchangeable worker thread" types anyway :-)
> > For example, userspace might be using a pipe as a signal-safe lock, or
> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
> >
> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
> > pointless thundering herd. It doesn't mean unfairness is ok.
>
> The users should not make any assumption about the waking up sequence,
> neither LIFO nor FIFO.
Correct, but they should be able to assume non-starvation (eventual
progress) for all waiters.
It's one of those subtle things, possibly a unixy thing: Non-RT tasks
should always make progress when the competition is just other non-RT
tasks, even if the progress is slow.
Starvation can spread out beyond the starved process, to cause
priority inversions in other tasks that are waiting on a resource
locked by the starved process. Among other things, that can cause
higher priority tasks, and RT priority tasks, to block permanently.
Very unpleasant.
> > The LIFO idea _might_ make sense for interchangeable worker-thread
> > situations - including userspace. It would make sense for pipe
> > waiters, socket waiters (especially accept), etc.
>
> Yea, and my following patches are for socket waiters.
Occasionally unix socketpairs are used in the above ways too.
I'm not against your patch, but I worry that starvation is a new
semantic, and it may have a significant effect on something - either
in the kernel, or in userspace which is harder to check.
> > Do you have any measurements which showing the LIFO mode performing
> > better than FIFO, and by how much?
>
> I didn't do any test yet. But some work done by LSE project years ago
> showed that it is better.
>
> http://lse.sourceforge.net/io/aionotes.txt
>
> " Also in view of
> better cache utilization the wake queue mechanism is LIFO by default.
> (A new exclusive LIFO wakeup option has been introduced for this purpose)"
I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
prevent starvation while getting some of the locality benefit.
Something like add-LIFO and increment a small counter in the next wait
entry, but never add in front of an entry whose counter has reached
MAX_LIFO_WAITERS? :-)
-- Jamie
It should be seldom in good applications. So I only need to add a
function, like this:
add_wait_queue_head_exclusive(wq, wait)
first_exclusive = find_the_first_exclusive(wq); // it is fast, as
there should no or few non-exclusive wait queues
list_add(wait, first_exclusive);
--
Regards,
Changli Gao(xia...@gmail.com)
No. You are wrong. I meant epoll implemented LIFO on its own. You
should check the code. :)
>> > For example, userspace might be using a pipe as a signal-safe lock, or
>> > signal-safe multi-token semaphore, and epoll to wait for that pipe.
>> >
>> > WQ_FLAG_EXCLUSIVE means there is no point waking all tasks, to avoid a
>> > pointless thundering herd. It doesn't mean unfairness is ok.
>>
>> The users should not make any assumption about the waking up sequence,
>> neither LIFO nor FIFO.
>
> Correct, but they should be able to assume non-starvation (eventual
> progress) for all waiters.
>
> It's one of those subtle things, possibly a unixy thing: Non-RT tasks
> should always make progress when the competition is just other non-RT
> tasks, even if the progress is slow.
>
> Starvation can spread out beyond the starved process, to cause
> priority inversions in other tasks that are waiting on a resource
> locked by the starved process. Among other things, that can cause
> higher priority tasks, and RT priority tasks, to block permanently.
> Very unpleasant.
>
>> > The LIFO idea _might_ make sense for interchangeable worker-thread
>> > situations - including userspace. It would make sense for pipe
>> > waiters, socket waiters (especially accept), etc.
>>
>> Yea, and my following patches are for socket waiters.
>
> Occasionally unix socketpairs are occasionally used in the above ways too.
>
> I'm not against your patch, but I worry that starvation is a new
> semantic, and it may have a significant effect on something - either
> in the kernel, or in userspace which is harder to check.
Thanks for your reminding.
>
> I suspect it's possible to combine LIFO-ish and FIFO-ish queuing to
> prevent starvation while getting some of the locality benefit.
> Something like add-LIFO and increment a small counter in the next wait
> entry, but never add in front of an entry whose counter has reached
> MAX_LIFO_WAITERS? :-)
>
It is a little complex, and I'll keep it simple and improve it when necessary.
--
Regards,
Changli Gao(xia...@gmail.com)
> On Wed, Apr 28, 2010 at 5:29 PM, David Howells <dhow...@redhat.com> wrote:
> > Changli Gao <xia...@gmail.com> wrote:
> >
> >> If there isn't enough work to be done, we'd better not disrupt them
> >> and leave them sleeping forever to keep the scheduler happier. Do we
> >> have reason to keep fair to all the workers? Does it have benefit?
> >
> > You've made one important assumption: the processes on the wait queue are
> > sleeping waiting to service things... but what if the wait queue governs
> > access to a resource, and all the processes on that wait queue need access to
> > that resource to do things? Some of the processes waiting for it may never
> > get a go, and so necessary work may be left undone.
> >
>
> You are right. I made the wrong assumption. But we indeed need some
> primitive to add wait_queue at the head of the wait_queue_head, and I
> know epoll needs it, at least.
>
> fs/eventpoll.c: 1443.
> wait.flags |= WQ_FLAG_EXCLUSIVE;
> __add_wait_queue(&ep->wq, &wait);
I'm not sure one user deserves a new function, but like it has been
noticed, the patch for that should eventually be totally isolated from
other bits.
- Davide