[PATCH 1 of 4] Introduce per_call

Zach Brown

unread,

Jan 30, 2007, 4:40:26 PM1/30/07

to linux-...@vger.kernel.org

There are members of task_struct which are only used by a given call chain to
pass arguments up and down the chain itself. They are logically thread-local
storage.

The patches later in the series want to have multiple calls pending for a given
task, though only one will be executing at a given time. By putting these
thread-local members of task_struct in a separate storage structure we're able
to trivially swap them in and out as their calls are swapped in and out.

per_call_chain() doesn't have a terribly great name. It was chosen in the
spirit of per_cpu().

The storage was left inline in task_struct to avoid introducing indirection for
the vast majority of uses which will never have multiple calls executing in a
task.

I chose a few members of task_struct to migrate under per_call_chain() along
with the introduction as an example of what it looks like. These would be
separate patches in a patch series that was suitable for merging.

diff -r b1128b48dc99 -r 26e278468209 fs/jbd/journal.c
--- a/fs/jbd/journal.c Fri Jan 12 20:00:03 2007 +0000
+++ b/fs/jbd/journal.c Mon Jan 29 15:36:13 2007 -0800
@@ -471,7 +471,7 @@ int journal_force_commit_nested(journal_
tid_t tid;

spin_lock(&journal->j_state_lock);
- if (journal->j_running_transaction && !current->journal_info) {
+ if (journal->j_running_transaction && !per_call_chain(journal_info)) {
transaction = journal->j_running_transaction;
__log_start_commit(journal, transaction->t_tid);
} else if (journal->j_committing_transaction)
diff -r b1128b48dc99 -r 26e278468209 fs/jbd/transaction.c
--- a/fs/jbd/transaction.c Fri Jan 12 20:00:03 2007 +0000
+++ b/fs/jbd/transaction.c Mon Jan 29 15:36:13 2007 -0800
@@ -279,12 +279,12 @@ handle_t *journal_start(journal_t *journ
if (!handle)
return ERR_PTR(-ENOMEM);

- current->journal_info = handle;
+ per_call_chain(journal_info) = handle;

err = start_this_handle(journal, handle);
if (err < 0) {
jbd_free_handle(handle);
- current->journal_info = NULL;
+ per_call_chain(journal_info) = NULL;
handle = ERR_PTR(err);
}
return handle;
@@ -1368,7 +1368,7 @@ int journal_stop(handle_t *handle)
} while (old_handle_count != transaction->t_handle_count);
}

- current->journal_info = NULL;
+ per_call_chain(journal_info) = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
transaction->t_outstanding_credits -= handle->h_buffer_credits;
diff -r b1128b48dc99 -r 26e278468209 fs/namei.c
--- a/fs/namei.c Fri Jan 12 20:00:03 2007 +0000
+++ b/fs/namei.c Mon Jan 29 15:36:13 2007 -0800
@@ -628,20 +628,20 @@ static inline int do_follow_link(struct
static inline int do_follow_link(struct path *path, struct nameidata *nd)
{
int err = -ELOOP;
- if (current->link_count >= MAX_NESTED_LINKS)
+ if (per_call_chain(link_count) >= MAX_NESTED_LINKS)
goto loop;
- if (current->total_link_count >= 40)
+ if (per_call_chain(total_link_count) >= 40)
goto loop;
BUG_ON(nd->depth >= MAX_NESTED_LINKS);
cond_resched();
err = security_inode_follow_link(path->dentry, nd);
if (err)
goto loop;
- current->link_count++;
- current->total_link_count++;
+ per_call_chain(link_count)++;
+ per_call_chain(total_link_count)++;
nd->depth++;
err = __do_follow_link(path, nd);
- current->link_count--;
+ per_call_chain(link_count)--;
nd->depth--;
return err;
loop:
@@ -1025,7 +1025,7 @@ int fastcall link_path_walk(const char *

int fastcall path_walk(const char * name, struct nameidata *nd)
{
- current->total_link_count = 0;
+ per_call_chain(total_link_count) = 0;
return link_path_walk(name, nd);
}

@@ -1153,7 +1153,7 @@ static int fastcall do_path_lookup(int d

fput_light(file, fput_needed);
}
- current->total_link_count = 0;
+ per_call_chain(total_link_count) = 0;
retval = link_path_walk(name, nd);
out:
if (likely(retval == 0)) {
diff -r b1128b48dc99 -r 26e278468209 include/linux/init_task.h
--- a/include/linux/init_task.h Fri Jan 12 20:00:03 2007 +0000
+++ b/include/linux/init_task.h Mon Jan 29 15:36:13 2007 -0800
@@ -88,6 +88,11 @@ extern struct nsproxy init_nsproxy;

extern struct group_info init_groups;

+#define INIT_PER_CALL_CHAIN(tsk) \
+{ \
+ .journal_info = NULL, \
+}
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -124,6 +129,7 @@ extern struct group_info init_groups;
.keep_capabilities = 0, \
.user = INIT_USER, \
.comm = "swapper", \
+ .per_call = INIT_PER_CALL_CHAIN(tsk), \
.thread = INIT_THREAD, \
.fs = &init_fs, \
.files = &init_files, \
@@ -135,7 +141,6 @@ extern struct group_info init_groups;
.signal = {{0}}}, \
.blocked = {{0}}, \
.alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \
- .journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
.pi_lock = SPIN_LOCK_UNLOCKED, \
diff -r b1128b48dc99 -r 26e278468209 include/linux/jbd.h
--- a/include/linux/jbd.h Fri Jan 12 20:00:03 2007 +0000
+++ b/include/linux/jbd.h Mon Jan 29 15:36:13 2007 -0800
@@ -883,7 +883,7 @@ extern void __wait_on_journal (journal_

static inline handle_t *journal_current_handle(void)
{
- return current->journal_info;
+ return per_call_chain(journal_info);
}

/* The journaling code user interface:
diff -r b1128b48dc99 -r 26e278468209 include/linux/sched.h
--- a/include/linux/sched.h Fri Jan 12 20:00:03 2007 +0000
+++ b/include/linux/sched.h Mon Jan 29 15:36:13 2007 -0800
@@ -784,6 +784,20 @@ static inline void prefetch_stack(struct
static inline void prefetch_stack(struct task_struct *t) { }
#endif

+/*
+ * Members of this structure are used to pass arguments down call chains
+ * without specific arguments. Historically they lived on task_struct,
+ * putting them in one place gives us some flexibility. They're accessed
+ * with per_call_chain(name).
+ */
+struct per_call_chain_storage {
+ int link_count; /* number of links in one symlink */
+ int total_link_count; /* total links followed in a lookup */
+ void *journal_info; /* journalling filesystem info */
+};
+
+#define per_call_chain(foo) current->per_call.foo
+
struct audit_context; /* See audit.c */
struct mempolicy;
struct pipe_inode_info;
@@ -920,7 +934,7 @@ struct task_struct {
it with task_lock())
- initialized normally by flush_old_exec */
/* file system info */
- int link_count, total_link_count;
+ struct per_call_chain_storage per_call;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
@@ -993,9 +1007,6 @@ struct task_struct {
struct held_lock held_locks[MAX_LOCK_DEPTH];
unsigned int lockdep_recursion;
#endif
-
-/* journalling filesystem info */
- void *journal_info;

/* VM state */
struct reclaim_state *reclaim_state;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Zach Brown

unread,

Jan 30, 2007, 4:40:31 PM1/30/07

to linux-...@vger.kernel.org

This patch introduces the notion of a 'fibril'. It's meant to be a lighter
kernel thread. There can be multiple of them in the process of executing for a
given task_struct, but only one can ever be actively running at a time. Think
of it as a stack and some metadata for scheduling them inside the task_struct.

This implementation is wildly architecture-specific but isn't put in the right
places. Since these are not code paths that I have extensive experience with,
I focused more on getting it going and representative of the concept than on
making it right on the first try. I'm actively interested in feedback from
people who know more about the places this touches.

The fibril struct itself is left stand-alone for clarity. There is a 1:1
relationship between fibrils and struct thread_info, though, so it might make
more sense to embed the two somehow.

The use of list_head for the run queue is simplistic. As long as we're not
removing specific fibrils from the list, which seems unlikely, we could be more
clever. Maybe no more clever than a singly-linked list, though.

Fibril management is under the runqueue lock because that ends up working well
for the wake-up path as well. In the current patch, though, it makes for some
pretty sloppy code for unlocking the runqueue lock (and re-enabling interrupts
and pre-emption) on the other side of the switch.

The actual mechanics of switching from one stack to another at the end of
schedule_fibril() makes me nervous. I'm not convinced that blindly copying the
contents of thread_info from the previous to the next stack is safe, even if
done with interrupts disabled. (NMIs?) The juggling of current->thread_info
might be racy, etc.

diff -r 26e278468209 -r df7bc026d50e arch/i386/kernel/process.c
--- a/arch/i386/kernel/process.c Mon Jan 29 15:36:13 2007 -0800
+++ b/arch/i386/kernel/process.c Mon Jan 29 15:36:16 2007 -0800
@@ -698,6 +698,28 @@ struct task_struct fastcall * __switch_t
return prev_p;
}

+/*
+ * We've just switched the stack and instruction pointer to point to a new
+ * fibril. We were called from schedule() -> schedule_fibril() with the
+ * runqueue lock held _irq and with preemption disabled.
+ *
+ * We let finish_fibril_switch() unwind the state that was built up by
+ * our callers. We do that here so that we don't need to ask fibrils to
+ * first execute something analogous to schedule_tail(). Maybe that's
+ * wrong.
+ *
+ * We'd also have to reacquire the kernel lock here. For now we know the
+ * BUG_ON(lock_depth) prevents us from having to worry about it.
+ */
+void fastcall __switch_to_fibril(struct thread_info *ti)
+{
+ finish_fibril_switch();
+
+ /* free the ti if schedule_fibril() told us that it's done */
+ if (ti->status & TS_FREE_AFTER_SWITCH)
+ free_thread_info(ti);
+}
+
asmlinkage int sys_fork(struct pt_regs regs)
{
return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
diff -r 26e278468209 -r df7bc026d50e include/asm-i386/system.h
--- a/include/asm-i386/system.h Mon Jan 29 15:36:13 2007 -0800
+++ b/include/asm-i386/system.h Mon Jan 29 15:36:16 2007 -0800
@@ -31,6 +31,31 @@ extern struct task_struct * FASTCALL(__s
"=a" (last),"=S" (esi),"=D" (edi) \
:"m" (next->thread.esp),"m" (next->thread.eip), \
"2" (prev), "d" (next)); \
+} while (0)
+
+struct thread_info;
+void fastcall __switch_to_fibril(struct thread_info *ti);
+
+/*
+ * This is called with the run queue lock held _irq and with preemption
+ * disabled. __switch_to_fibril drops those.
+ */
+#define switch_to_fibril(prev, next, ti) do { \
+ unsigned long esi,edi; \
+ asm volatile("pushfl\n\t" /* Save flags */ \
+ "pushl %%ebp\n\t" \
+ "movl %%esp,%0\n\t" /* save ESP */ \
+ "movl %4,%%esp\n\t" /* restore ESP */ \
+ "movl $1f,%1\n\t" /* save EIP */ \
+ "pushl %5\n\t" /* restore EIP */ \
+ "jmp __switch_to_fibril\n" \
+ "1:\t" \
+ "popl %%ebp\n\t" \
+ "popfl" \
+ :"=m" (prev->esp),"=m" (prev->eip), \
+ "=S" (esi),"=D" (edi) \
+ :"m" (next->esp),"m" (next->eip), \
+ "d" (prev), "a" (ti)); \
} while (0)

#define _set_base(addr,base) do { unsigned long __pr; \
diff -r 26e278468209 -r df7bc026d50e include/asm-i386/thread_info.h
--- a/include/asm-i386/thread_info.h Mon Jan 29 15:36:13 2007 -0800
+++ b/include/asm-i386/thread_info.h Mon Jan 29 15:36:16 2007 -0800
@@ -91,6 +91,12 @@ static inline struct thread_info *curren
static inline struct thread_info *current_thread_info(void)
{
return (struct thread_info *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
+/* XXX perhaps should be integrated with task_pt_regs(task) */
+static inline struct pt_regs *thread_info_pt_regs(struct thread_info *info)
+{
+ return (struct pt_regs *)(KSTK_TOP(info)-8) - 1;
}

/* thread information allocation */
@@ -169,6 +175,7 @@ static inline struct thread_info *curren
*/
#define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */
#define TS_POLLING 0x0002 /* True if in idle loop and not sleeping */
+#define TS_FREE_AFTER_SWITCH 0x0004 /* free ti in __switch_to_fibril() */

#define tsk_is_polling(t) ((t)->thread_info->status & TS_POLLING)

diff -r 26e278468209 -r df7bc026d50e include/linux/init_task.h
--- a/include/linux/init_task.h Mon Jan 29 15:36:13 2007 -0800
+++ b/include/linux/init_task.h Mon Jan 29 15:36:16 2007 -0800
@@ -111,6 +111,8 @@ extern struct group_info init_groups;
.cpus_allowed = CPU_MASK_ALL, \
.mm = NULL, \
.active_mm = &init_mm, \
+ .fibril = NULL, \
+ .runnable_fibrils = LIST_HEAD_INIT(tsk.runnable_fibrils), \
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.ioprio = 0, \
.time_slice = HZ, \
diff -r 26e278468209 -r df7bc026d50e include/linux/sched.h
--- a/include/linux/sched.h Mon Jan 29 15:36:13 2007 -0800
+++ b/include/linux/sched.h Mon Jan 29 15:36:16 2007 -0800
@@ -812,6 +812,38 @@ enum sleep_type {

struct prio_array;

+/*
+ * A 'fibril' is a very small fiber. It's used here to mean a small thread.
+ *
+ * (Choosing a weird new name avoided yet more overloading of 'task', 'call',
+ * 'thread', 'stack', 'fib{er,re}', etc).
+ *
+ * This structure is used by the scheduler to track multiple executing stacks
+ * inside a task_struct.
+ *
+ * Only one fibril executes for a given task_struct at a time. When it
+ * blocks, however, another fibril has the chance to execute while it sleeps.
+ * This means that call chains executing in fibrils can see concurrent
+ * current-> accesses at blocking points. "per_call_chain()" members are
+ * switched along with the fibril, so they remain local. Preemption *will not*
+ * trigger a fibril switch.
+ *
+ * XXX
+ * - arch specific
+ */
+struct fibril {
+ struct list_head run_list;
+ /* -1 unrunnable, 0 runnable, >0 stopped */
+ long state;
+ unsigned long eip;
+ unsigned long esp;
+ struct thread_info *ti;
+ struct per_call_chain_storage per_call;
+};
+
+void sched_new_runnable_fibril(struct fibril *fibril);
+void finish_fibril_switch(void);
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
@@ -857,6 +889,20 @@ struct task_struct {
struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;
+
+ /*
+ * The scheduler uses this to determine if the current call is a
+ * stand-alone task or a fibril. If it's a fibril then wake-ups
+ * will target the fibril and a schedule() might result in swapping
+ * in another runnable fibril. So to start executing fibrils at all
+ * one allocates a fibril to represent the running task and then
+ * puts initialized runnable fibrils in the run list.
+ *
+ * The state members of the fibril and runnable_fibrils list are
+ * managed under the task's run queue lock.
+ */
+ struct fibril *fibril;
+ struct list_head runnable_fibrils;

/* task state */
struct linux_binfmt *binfmt;
diff -r 26e278468209 -r df7bc026d50e kernel/exit.c
--- a/kernel/exit.c Mon Jan 29 15:36:13 2007 -0800
+++ b/kernel/exit.c Mon Jan 29 15:36:16 2007 -0800
@@ -854,6 +854,13 @@ fastcall NORET_TYPE void do_exit(long co
{
struct task_struct *tsk = current;
int group_dead;
+
+ /*
+ * XXX this is just a debug helper, this should be waiting for all
+ * fibrils to return. Possibly after sending them lots of -KILL
+ * signals?
+ */
+ BUG_ON(!list_empty(&current->runnable_fibrils));

profile_task_exit(tsk);

diff -r 26e278468209 -r df7bc026d50e kernel/fork.c
--- a/kernel/fork.c Mon Jan 29 15:36:13 2007 -0800
+++ b/kernel/fork.c Mon Jan 29 15:36:16 2007 -0800
@@ -1179,6 +1179,9 @@ static struct task_struct *copy_process(

/* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
p->ioprio = current->ioprio;
+
+ p->fibril = NULL;
+ INIT_LIST_HEAD(&p->runnable_fibrils);

/*
* The task hasn't been attached yet, so its cpus_allowed mask will
diff -r 26e278468209 -r df7bc026d50e kernel/sched.c
--- a/kernel/sched.c Mon Jan 29 15:36:13 2007 -0800
+++ b/kernel/sched.c Mon Jan 29 15:36:16 2007 -0800
@@ -3407,6 +3407,111 @@ static inline int interactive_sleep(enum
}

/*
+ * This unwinds the state that was built up by schedule -> schedule_fibril().
+ * The arch-specific switch_to_fibril() path calls here once the new fibril
+ * is executing.
+ */
+void finish_fibril_switch(void)
+{
+ spin_unlock_irq(&this_rq()->lock);
+ preempt_enable_no_resched();
+}
+
+/*
+ * Add a new fibril to the runnable list. It'll be switched to next time
+ * the caller comes through schedule().
+ */
+void sched_new_runnable_fibril(struct fibril *fibril)
+{
+ struct task_struct *tsk = current;
+ unsigned long flags;
+ struct rq *rq = task_rq_lock(tsk, &flags);
+
+ fibril->state = TASK_RUNNING;
+ BUG_ON(!list_empty(&fibril->run_list));
+ list_add_tail(&fibril->run_list, &tsk->runnable_fibrils);
+
+ task_rq_unlock(rq, &flags);
+}
+
+/*
+ * This is called from schedule() when we're not being preempted and there is a
+ * fibril waiting in current->runnable_fibrils.
+ *
+ * This is called under the run queue lock to serialize fibril->state and the
+ * runnable_fibrils list with wake-up. We drop it before switching and the
+ * return path takes that into account.
+ *
+ * We always switch so that a caller can specifically make a single pass
+ * through the runnable fibrils by marking itself _RUNNING and calling
+ * schedule().
+ */
+void schedule_fibril(struct task_struct *tsk)
+{
+ struct thread_info *ti = task_thread_info(tsk);
+ struct fibril *prev;
+ struct fibril *next;
+ struct fibril dummy;
+
+ /*
+ * XXX We don't deal with the kernel lock yet. It'd need to be audited
+ * and lock_depth moved under per_call_chain().
+ */
+ BUG_ON(tsk->lock_depth >= 0);
+
+ next = list_entry(current->runnable_fibrils.next, struct fibril,
+ run_list);
+ list_del_init(&next->run_list);
+ BUG_ON(next->state != TASK_RUNNING);
+
+ prev = tsk->fibril;
+ if (prev) {
+ prev->state = tsk->state;
+ prev->per_call = current->per_call;
+ /*
+ * This catches the case where the caller wants to make a pass
+ * through runnable fibrils by marking itself _RUNNING and
+ * calling schedule(). A fibril should not be able to be on
+ * both tsk->fibril and the runnable_list.
+ */
+ if (prev->state == TASK_RUNNING) {
+ BUG_ON(!list_empty(&prev->run_list));
+ list_add_tail(&prev->run_list,
+ &current->runnable_fibrils);
+ }
+ } else {
+ /*
+ * To free a fibril the calling path can free the structure
+ * itself, set current->fibril to NULL, and call schedule().
+ * That causes us to tell __switch_to_fibril() to free the ti
+ * associated with the fibril once we've switched away from it.
+ * The dummy is just used to give switch_to_fibril() something
+ * to save state in to.
+ */
+ prev = &dummy;
+ }
+
+ /*
+ * XXX The idea is to copy all but the actual call stack. Obviously
+ * this is wildly arch-specific and belongs abstracted out.
+ */
+ *next->ti = *ti;
+ *thread_info_pt_regs(next->ti) = *thread_info_pt_regs(ti);
+
+ current->thread_info = next->ti;
+ current->thread.esp0 = (unsigned long)(thread_info_pt_regs(next->ti) + 1);
+ current->fibril = next;
+ current->state = next->state;
+ current->per_call = next->per_call;
+
+ if (prev == &dummy)
+ ti->status |= TS_FREE_AFTER_SWITCH;
+
+ /* __switch_to_fibril() drops the runqueue lock and enables preempt */
+ switch_to_fibril(prev, next, ti);
+}
+
+/*
* schedule() is the main scheduler function.
*/
asmlinkage void __sched schedule(void)
@@ -3468,6 +3573,22 @@ need_resched_nonpreemptible:
run_time /= (CURRENT_BONUS(prev) ? : 1);

spin_lock_irq(&rq->lock);
+
+ /* always switch to a runnable fibril if we aren't being preempted */
+ if (unlikely(!(preempt_count() & PREEMPT_ACTIVE) &&
+ !list_empty(&prev->runnable_fibrils))) {
+ schedule_fibril(prev);
+ /*
+ * finish_fibril_switch() drops the rq lock and enables
+ * preemption, but the popfl disables interrupts again. Watch
+ * me learn how context switch locking works before your very
+ * eyes! XXX This will need to be fixed up by throwing
+ * together something like the prepare_lock_switch() path the
+ * scheduler does. Guidance appreciated!
+ */
+ local_irq_enable();
+ return;
+ }

switch_count = &prev->nivcsw;
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

Zach Brown

unread,

Jan 30, 2007, 4:41:23 PM1/30/07

to linux-...@vger.kernel.org

The addition of multiple sleeping fibrils under a task_struct means that we
can't simply wake a task_struct to be able to wake a specific sleeping code
path.

This patch introduces task_wake_target() as a way to refer to a code path that
is about to sleep and will be woken in the future. Sleepers that used to wake
a current task_struct reference with wake_up_process() now use this helper to
get a wake target cookie and wake it with wake_up_target().

Some paths know that waking a task will be sufficient. Paths working with
kernel threads that never use fibrils fall into this category. They're changed
to use wake_up_task() instead of wake_up_process().

This is not an exhaustive patch. It isn't yet clear how signals are going to
interact with fibrils. Once that is decided callers of wake_up_state() are
going to need to reflect the desired behaviour. I add __deprecated to it to
highlight this detail.

The actual act of performing the wake-up is hidden under try_to_wake_up() and
is serialized with the scheduler under the runqueue lock. This is very
fiddly stuff. I'm sure I've missed some details. I've tried to comment
the intent above try_to_wake_up_fibril().

diff -r df7bc026d50e -r 4ea674e8825e arch/i386/kernel/ptrace.c
--- a/arch/i386/kernel/ptrace.c Mon Jan 29 15:36:16 2007 -0800
+++ b/arch/i386/kernel/ptrace.c Mon Jan 29 15:46:47 2007 -0800
@@ -492,7 +492,7 @@ long arch_ptrace(struct task_struct *chi
child->exit_code = data;
/* make sure the single step bit is not set. */
clear_singlestep(child);
- wake_up_process(child);
+ wake_up_task(child);
ret = 0;
break;

@@ -508,7 +508,7 @@ long arch_ptrace(struct task_struct *chi
child->exit_code = SIGKILL;
/* make sure the single step bit is not set. */
clear_singlestep(child);
- wake_up_process(child);
+ wake_up_task(child);
break;

case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
@@ -526,7 +526,7 @@ long arch_ptrace(struct task_struct *chi
set_singlestep(child);
child->exit_code = data;
/* give it a chance to run. */
- wake_up_process(child);
+ wake_up_task(child);
ret = 0;
break;

diff -r df7bc026d50e -r 4ea674e8825e drivers/block/loop.c
--- a/drivers/block/loop.c Mon Jan 29 15:36:16 2007 -0800
+++ b/drivers/block/loop.c Mon Jan 29 15:46:47 2007 -0800
@@ -824,7 +824,7 @@ static int loop_set_fd(struct loop_devic
goto out_clr;
}
lo->lo_state = Lo_bound;
- wake_up_process(lo->lo_thread);
+ wake_up_task(lo->lo_thread);
return 0;

out_clr:
diff -r df7bc026d50e -r 4ea674e8825e drivers/md/dm-io.c
--- a/drivers/md/dm-io.c Mon Jan 29 15:36:16 2007 -0800
+++ b/drivers/md/dm-io.c Mon Jan 29 15:46:47 2007 -0800
@@ -18,7 +18,7 @@ struct io {
struct io {
unsigned long error;
atomic_t count;
- struct task_struct *sleeper;
+ void *wake_target;
io_notify_fn callback;
void *context;
};
@@ -110,8 +110,8 @@ static void dec_count(struct io *io, uns
set_bit(region, &io->error);

if (atomic_dec_and_test(&io->count)) {
- if (io->sleeper)
- wake_up_process(io->sleeper);
+ if (io->wake_target)
+ wake_up_task(io->wake_target);

else {
int r = io->error;
@@ -323,7 +323,7 @@ static int sync_io(unsigned int num_regi

io.error = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
- io.sleeper = current;
+ io.wake_target = task_wake_target(current);

dispatch_io(rw, num_regions, where, dp, &io, 1);

@@ -358,7 +358,7 @@ static int async_io(unsigned int num_reg
io = mempool_alloc(_io_pool, GFP_NOIO);
io->error = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = NULL;
+ io->wake_target = NULL;
io->callback = fn;
io->context = context;

diff -r df7bc026d50e -r 4ea674e8825e drivers/scsi/qla2xxx/qla_os.c
--- a/drivers/scsi/qla2xxx/qla_os.c Mon Jan 29 15:36:16 2007 -0800
+++ b/drivers/scsi/qla2xxx/qla_os.c Mon Jan 29 15:46:47 2007 -0800
@@ -2403,7 +2403,7 @@ qla2xxx_wake_dpc(scsi_qla_host_t *ha)
qla2xxx_wake_dpc(scsi_qla_host_t *ha)
{
if (ha->dpc_thread)
- wake_up_process(ha->dpc_thread);
+ wake_up_task(ha->dpc_thread);
}

/*
diff -r df7bc026d50e -r 4ea674e8825e drivers/scsi/scsi_error.c
--- a/drivers/scsi/scsi_error.c Mon Jan 29 15:36:16 2007 -0800
+++ b/drivers/scsi/scsi_error.c Mon Jan 29 15:46:47 2007 -0800
@@ -51,7 +51,7 @@ void scsi_eh_wakeup(struct Scsi_Host *sh
void scsi_eh_wakeup(struct Scsi_Host *shost)
{
if (shost->host_busy == shost->host_failed) {
- wake_up_process(shost->ehandler);
+ wake_up_task(shost->ehandler);
SCSI_LOG_ERROR_RECOVERY(5,
printk("Waking error handler thread\n"));
}
diff -r df7bc026d50e -r 4ea674e8825e fs/aio.c
--- a/fs/aio.c Mon Jan 29 15:36:16 2007 -0800
+++ b/fs/aio.c Mon Jan 29 15:46:47 2007 -0800
@@ -907,7 +907,7 @@ void fastcall kick_iocb(struct kiocb *io
* single context. */
if (is_sync_kiocb(iocb)) {
kiocbSetKicked(iocb);
- wake_up_process(iocb->ki_obj.tsk);
+ wake_up_target(iocb->ki_obj.wake_target);
return;
}

@@ -941,7 +941,7 @@ int fastcall aio_complete(struct kiocb *
BUG_ON(iocb->ki_users != 1);
iocb->ki_user_data = res;
iocb->ki_users = 0;
- wake_up_process(iocb->ki_obj.tsk);
+ wake_up_target(iocb->ki_obj.wake_target);
return 1;
}

@@ -1053,7 +1053,7 @@ struct aio_timeout {
struct aio_timeout {
struct timer_list timer;
int timed_out;
- struct task_struct *p;
+ void *wake_target;
};

static void timeout_func(unsigned long data)
@@ -1061,7 +1061,7 @@ static void timeout_func(unsigned long d
struct aio_timeout *to = (struct aio_timeout *)data;

to->timed_out = 1;
- wake_up_process(to->p);
+ wake_up_target(to->wake_target);
}

static inline void init_timeout(struct aio_timeout *to)
@@ -1070,7 +1070,7 @@ static inline void init_timeout(struct a
to->timer.data = (unsigned long)to;
to->timer.function = timeout_func;
to->timed_out = 0;
- to->p = current;
+ to->wake_target = task_wake_target(current);
}

static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
diff -r df7bc026d50e -r 4ea674e8825e fs/direct-io.c
--- a/fs/direct-io.c Mon Jan 29 15:36:16 2007 -0800
+++ b/fs/direct-io.c Mon Jan 29 15:46:47 2007 -0800
@@ -124,7 +124,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
- struct task_struct *waiter; /* waiting task (NULL if none) */
+ void *wake_target; /* waiting initiator (NULL if none) */

/* AIO related stuff */
struct kiocb *iocb; /* kiocb */
@@ -278,8 +278,8 @@ static int dio_bio_end_aio(struct bio *b

spin_lock_irqsave(&dio->bio_lock, flags);
remaining = --dio->refcount;
- if (remaining == 1 && dio->waiter)
- wake_up_process(dio->waiter);
+ if (remaining == 1 && dio->wake_target)
+ wake_up_target(dio->wake_target);
spin_unlock_irqrestore(&dio->bio_lock, flags);

if (remaining == 0) {
@@ -309,8 +309,8 @@ static int dio_bio_end_io(struct bio *bi
spin_lock_irqsave(&dio->bio_lock, flags);
bio->bi_private = dio->bio_list;
dio->bio_list = bio;
- if (--dio->refcount == 1 && dio->waiter)
- wake_up_process(dio->waiter);
+ if (--dio->refcount == 1 && dio->wake_target)
+ wake_up_target(dio->wake_target);
spin_unlock_irqrestore(&dio->bio_lock, flags);
return 0;
}
@@ -393,12 +393,12 @@ static struct bio *dio_await_one(struct
*/
while (dio->refcount > 1 && dio->bio_list == NULL) {
__set_current_state(TASK_UNINTERRUPTIBLE);
- dio->waiter = current;
+ dio->wake_target = task_wake_target(current);
spin_unlock_irqrestore(&dio->bio_lock, flags);
io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
- dio->waiter = NULL;
+ dio->wake_target = NULL;
}
if (dio->bio_list) {
bio = dio->bio_list;
@@ -990,7 +990,7 @@ direct_io_worker(int rw, struct kiocb *i
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
dio->bio_list = NULL;
- dio->waiter = NULL;
+ dio->wake_target = NULL;

/*
* In case of non-aligned buffers, we may need 2 more
diff -r df7bc026d50e -r 4ea674e8825e fs/jbd/journal.c
--- a/fs/jbd/journal.c Mon Jan 29 15:36:16 2007 -0800
+++ b/fs/jbd/journal.c Mon Jan 29 15:46:47 2007 -0800
@@ -94,7 +94,7 @@ static void commit_timeout(unsigned long
{
struct task_struct * p = (struct task_struct *) __data;

- wake_up_process(p);
+ wake_up_task(p);
}

/*
diff -r df7bc026d50e -r 4ea674e8825e include/linux/aio.h
--- a/include/linux/aio.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/aio.h Mon Jan 29 15:46:47 2007 -0800
@@ -98,7 +98,7 @@ struct kiocb {

union {
void __user *user;
- struct task_struct *tsk;
+ void *wake_target;
} ki_obj;

__u64 ki_user_data; /* user's data for completion */
@@ -124,7 +124,6 @@ struct kiocb {
#define is_sync_kiocb(iocb) ((iocb)->ki_key == KIOCB_SYNC_KEY)
#define init_sync_kiocb(x, filp) \
do { \
- struct task_struct *tsk = current; \
(x)->ki_flags = 0; \
(x)->ki_users = 1; \
(x)->ki_key = KIOCB_SYNC_KEY; \
@@ -133,7 +132,7 @@ struct kiocb {
(x)->ki_cancel = NULL; \
(x)->ki_retry = NULL; \
(x)->ki_dtor = NULL; \
- (x)->ki_obj.tsk = tsk; \
+ (x)->ki_obj.wake_target = task_wake_target(current); \
(x)->ki_user_data = 0; \
init_wait((&(x)->ki_wait)); \
} while (0)
diff -r df7bc026d50e -r 4ea674e8825e include/linux/freezer.h
--- a/include/linux/freezer.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/freezer.h Mon Jan 29 15:46:47 2007 -0800
@@ -42,7 +42,7 @@ static inline int thaw_process(struct ta
{
if (frozen(p)) {
p->flags &= ~PF_FROZEN;
- wake_up_process(p);
+ wake_up_task(p);
return 1;
}
return 0;
diff -r df7bc026d50e -r 4ea674e8825e include/linux/hrtimer.h
--- a/include/linux/hrtimer.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/hrtimer.h Mon Jan 29 15:46:47 2007 -0800
@@ -65,7 +65,7 @@ struct hrtimer {
*/
struct hrtimer_sleeper {
struct hrtimer timer;
- struct task_struct *task;
+ void *wake_target;
};

/**
diff -r df7bc026d50e -r 4ea674e8825e include/linux/kthread.h
--- a/include/linux/kthread.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/kthread.h Mon Jan 29 15:46:47 2007 -0800
@@ -22,7 +22,7 @@ struct task_struct *kthread_create(int (
struct task_struct *__k \
= kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
if (!IS_ERR(__k)) \
- wake_up_process(__k); \
+ wake_up_task(__k); \
__k; \
})

diff -r df7bc026d50e -r 4ea674e8825e include/linux/module.h
--- a/include/linux/module.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/module.h Mon Jan 29 15:46:47 2007 -0800
@@ -334,7 +334,7 @@ struct module
struct list_head modules_which_use_me;

/* Who is waiting for us to be unloaded */
- struct task_struct *waiter;
+ void *wake_target;

/* Destruction function. */
void (*exit)(void);
diff -r df7bc026d50e -r 4ea674e8825e include/linux/mutex.h
--- a/include/linux/mutex.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/mutex.h Mon Jan 29 15:46:47 2007 -0800
@@ -65,7 +65,7 @@ struct mutex {
*/
struct mutex_waiter {
struct list_head list;
- struct task_struct *task;
+ void *wake_target;
#ifdef CONFIG_DEBUG_MUTEXES
struct mutex *lock;
void *magic;
diff -r df7bc026d50e -r 4ea674e8825e include/linux/posix-timers.h
--- a/include/linux/posix-timers.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/posix-timers.h Mon Jan 29 15:46:47 2007 -0800
@@ -48,6 +48,7 @@ struct k_itimer {
int it_sigev_signo; /* signo word of sigevent struct */
sigval_t it_sigev_value; /* value word of sigevent struct */
struct task_struct *it_process; /* process to send signal to */
+ void *it_wake_target; /* wake target for nanosleep case */
struct sigqueue *sigq; /* signal queue entry. */
union {
struct {
diff -r df7bc026d50e -r 4ea674e8825e include/linux/sched.h
--- a/include/linux/sched.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/sched.h Mon Jan 29 15:46:47 2007 -0800
@@ -1338,8 +1338,14 @@ extern void switch_uid(struct user_struc

extern void do_timer(unsigned long ticks);

-extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
-extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+/*
+ * XXX We need to figure out how signal delivery will wake the fibrils in
+ * a task. This is marked deprecated so that we get a compile-time warning
+ * to worry about it.
+ */
+extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)) __deprecated;
+extern int FASTCALL(wake_up_target(void *wake_target));
+extern int FASTCALL(wake_up_task(struct task_struct *task));
extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
unsigned long clone_flags));
#ifdef CONFIG_SMP
diff -r df7bc026d50e -r 4ea674e8825e include/linux/sem.h
--- a/include/linux/sem.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/sem.h Mon Jan 29 15:46:47 2007 -0800
@@ -104,7 +104,7 @@ struct sem_queue {
struct sem_queue {
struct sem_queue * next; /* next entry in the queue */
struct sem_queue ** prev; /* previous entry in the queue, *(q->prev) == q */
- struct task_struct* sleeper; /* this process */
+ void *wake_target;
struct sem_undo * undo; /* undo structure */
int pid; /* process id of requesting process */
int status; /* completion status of operation */
diff -r df7bc026d50e -r 4ea674e8825e include/linux/wait.h
--- a/include/linux/wait.h Mon Jan 29 15:36:16 2007 -0800
+++ b/include/linux/wait.h Mon Jan 29 15:46:47 2007 -0800
@@ -54,13 +54,16 @@ typedef struct __wait_queue_head wait_qu
typedef struct __wait_queue_head wait_queue_head_t;

struct task_struct;
+/* XXX sigh, wait.h <-> sched.h have some fun ordering */
+void *task_wake_target(struct task_struct *task);
+struct task_struct *wake_target_to_task(void *wake_target);

/*
* Macros for declaration and initialisaton of the datatypes
*/

#define __WAITQUEUE_INITIALIZER(name, tsk) { \
- .private = tsk, \
+ .private = task_wake_target(tsk), \
.func = default_wake_function, \
.task_list = { NULL, NULL } }

@@ -91,7 +94,7 @@ static inline void init_waitqueue_entry(
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
q->flags = 0;
- q->private = p;
+ q->private = task_wake_target(p);
q->func = default_wake_function;
}

@@ -389,7 +392,7 @@ int wake_bit_function(wait_queue_t *wait

#define DEFINE_WAIT(name) \
wait_queue_t name = { \
- .private = current, \
+ .private = task_wake_target(current), \
.func = autoremove_wake_function, \
.task_list = LIST_HEAD_INIT((name).task_list), \
}
@@ -398,7 +401,7 @@ int wake_bit_function(wait_queue_t *wait
struct wait_bit_queue name = { \
.key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
.wait = { \
- .private = current, \
+ .private = task_wake_target(current), \
.func = wake_bit_function, \
.task_list = \
LIST_HEAD_INIT((name).wait.task_list), \
@@ -407,7 +410,7 @@ int wake_bit_function(wait_queue_t *wait

#define init_wait(wait) \
do { \
- (wait)->private = current; \
+ (wait)->private = task_wake_target(current); \
(wait)->func = autoremove_wake_function; \
INIT_LIST_HEAD(&(wait)->task_list); \
} while (0)
diff -r df7bc026d50e -r 4ea674e8825e ipc/mqueue.c
--- a/ipc/mqueue.c Mon Jan 29 15:36:16 2007 -0800
+++ b/ipc/mqueue.c Mon Jan 29 15:46:47 2007 -0800
@@ -58,7 +58,7 @@

struct ext_wait_queue { /* queue of sleeping tasks */
- struct task_struct *task;
+ void *wake_target;
struct list_head list;
struct msg_msg *msg; /* ptr of loaded message */
int state; /* one of STATE_* values */
@@ -394,10 +394,11 @@ static void wq_add(struct mqueue_inode_i
{
struct ext_wait_queue *walk;

- ewp->task = current;
+ ewp->wake_target = task_wake_target(current);

list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
- if (walk->task->static_prio <= current->static_prio) {
+ if (wake_target_to_task(walk->wake_target)->static_prio
+ <= current->static_prio) {
list_add_tail(&ewp->list, &walk->list);
return;
}
@@ -785,7 +786,7 @@ static inline void pipelined_send(struct
receiver->msg = message;
list_del(&receiver->list);
receiver->state = STATE_PENDING;
- wake_up_process(receiver->task);
+ wake_up_target(receiver->wake_target);
smp_wmb();
receiver->state = STATE_READY;
}
@@ -804,7 +805,7 @@ static inline void pipelined_receive(str
msg_insert(sender->msg, info);
list_del(&sender->list);
sender->state = STATE_PENDING;
- wake_up_process(sender->task);
+ wake_up_target(sender->wake_target);
smp_wmb();
sender->state = STATE_READY;
}
@@ -869,7 +870,7 @@ asmlinkage long sys_mq_timedsend(mqd_t m
spin_unlock(&info->lock);
ret = timeout;
} else {
- wait.task = current;
+ wait.wake_target = task_wake_target(current);
wait.msg = (void *) msg_ptr;
wait.state = STATE_NONE;
ret = wq_sleep(info, SEND, timeout, &wait);
@@ -944,7 +945,7 @@ asmlinkage ssize_t sys_mq_timedreceive(m
ret = timeout;
msg_ptr = NULL;
} else {
- wait.task = current;
+ wait.wake_target = task_wake_target(current);
wait.state = STATE_NONE;
ret = wq_sleep(info, RECV, timeout, &wait);
msg_ptr = wait.msg;
diff -r df7bc026d50e -r 4ea674e8825e ipc/msg.c
--- a/ipc/msg.c Mon Jan 29 15:36:16 2007 -0800
+++ b/ipc/msg.c Mon Jan 29 15:46:47 2007 -0800
@@ -46,7 +46,7 @@
*/
struct msg_receiver {
struct list_head r_list;
- struct task_struct *r_tsk;
+ void *r_wake_target; /* opaque value from task_wake_target(); may be a tagged fibril pointer, so never dereference it as a task_struct */

int r_mode;
long r_msgtype;
@@ -58,7 +58,7 @@ struct msg_receiver {
/* one msg_sender for each sleeping sender */
struct msg_sender {
struct list_head list;
- struct task_struct *tsk;
+ void *wake_target;
};

#define SEARCH_ANY 1
@@ -180,7 +180,7 @@ static int newque (struct ipc_namespace

static inline void ss_add(struct msg_queue *msq, struct msg_sender *mss)
{
- mss->tsk = current;
+ mss->wake_target = task_wake_target(current);
current->state = TASK_INTERRUPTIBLE;
list_add_tail(&mss->list, &msq->q_senders);
}
@@ -203,7 +203,7 @@ static void ss_wakeup(struct list_head *
tmp = tmp->next;
if (kill)
mss->list.next = NULL;
- wake_up_process(mss->tsk);
+ wake_up_target(mss->wake_target);
}
}

@@ -218,7 +218,7 @@ static void expunge_all(struct msg_queue
msr = list_entry(tmp, struct msg_receiver, r_list);
tmp = tmp->next;
msr->r_msg = NULL;
- wake_up_process(msr->r_tsk);
+ wake_up_target(msr->r_wake_target);
smp_mb();
msr->r_msg = ERR_PTR(res);
}
@@ -602,20 +602,21 @@ static inline int pipelined_send(struct
msr = list_entry(tmp, struct msg_receiver, r_list);
tmp = tmp->next;
if (testmsg(msg, msr->r_msgtype, msr->r_mode) &&
- !security_msg_queue_msgrcv(msq, msg, msr->r_tsk,
- msr->r_msgtype, msr->r_mode)) {
+ !security_msg_queue_msgrcv(msq, msg,
+ wake_target_to_task(msr->r_wake_target),
+ msr->r_msgtype, msr->r_mode)) {

list_del(&msr->r_list);
if (msr->r_maxsize < msg->m_ts) {
msr->r_msg = NULL;
- wake_up_process(msr->r_tsk);
+ wake_up_target(msr->r_wake_target);
smp_mb();
msr->r_msg = ERR_PTR(-E2BIG);
} else {
msr->r_msg = NULL;
- msq->q_lrpid = msr->r_tsk->pid;
+ msq->q_lrpid = wake_target_to_task(msr->r_wake_target)->pid;
msq->q_rtime = get_seconds();
- wake_up_process(msr->r_tsk);
+ wake_up_target(msr->r_wake_target);
smp_mb();
msr->r_msg = msg;

@@ -826,7 +827,7 @@ long do_msgrcv(int msqid, long *pmtype,
goto out_unlock;
}
list_add_tail(&msr_d.r_list, &msq->q_receivers);
- msr_d.r_tsk = current;
+ msr_d.r_wake_target = task_wake_target(current);
msr_d.r_msgtype = msgtyp;
msr_d.r_mode = mode;
if (msgflg & MSG_NOERROR)
diff -r df7bc026d50e -r 4ea674e8825e ipc/sem.c
--- a/ipc/sem.c Mon Jan 29 15:36:16 2007 -0800
+++ b/ipc/sem.c Mon Jan 29 15:46:47 2007 -0800
@@ -411,7 +411,7 @@ static void update_queue (struct sem_arr
error = try_atomic_semop(sma, q->sops, q->nsops,
q->undo, q->pid);

- /* Does q->sleeper still need to sleep? */
+ /* Does q->wake_target still need to sleep? */
if (error <= 0) {
struct sem_queue *n;
remove_from_queue(sma,q);
@@ -431,7 +431,7 @@ static void update_queue (struct sem_arr
n = sma->sem_pending;
else
n = q->next;
- wake_up_process(q->sleeper);
+ wake_up_target(q->wake_target);
/* hands-off: q will disappear immediately after
* writing q->status.
*/
@@ -515,7 +515,7 @@ static void freeary (struct ipc_namespac
q->prev = NULL;
n = q->next;
q->status = IN_WAKEUP;
- wake_up_process(q->sleeper); /* doesn't sleep */
+ wake_up_target(q->wake_target); /* doesn't sleep */
smp_wmb();
q->status = -EIDRM; /* hands-off q */
q = n;
@@ -1223,7 +1223,7 @@ retry_undos:
prepend_to_queue(sma ,&queue);

queue.status = -EINTR;
- queue.sleeper = current;
+ queue.wake_target = task_wake_target(current);
current->state = TASK_INTERRUPTIBLE;
sem_unlock(sma);

diff -r df7bc026d50e -r 4ea674e8825e kernel/exit.c
--- a/kernel/exit.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/exit.c Mon Jan 29 15:46:47 2007 -0800
@@ -91,7 +91,7 @@ static void __exit_signal(struct task_st
* then notify it:
*/
if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) {
- wake_up_process(sig->group_exit_task);
+ wake_up_task(sig->group_exit_task);
sig->group_exit_task = NULL;
}
if (tsk == sig->curr_target)
diff -r df7bc026d50e -r 4ea674e8825e kernel/hrtimer.c
--- a/kernel/hrtimer.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/hrtimer.c Mon Jan 29 15:46:47 2007 -0800
@@ -660,11 +660,11 @@ static int hrtimer_wakeup(struct hrtimer
{
struct hrtimer_sleeper *t =
container_of(timer, struct hrtimer_sleeper, timer);
- struct task_struct *task = t->task;
-
- t->task = NULL;
- if (task)
- wake_up_process(task);
+ void *wake_target = t->wake_target;
+
+ t->wake_target = NULL;
+ if (wake_target)
+ wake_up_target(wake_target);

return HRTIMER_NORESTART;
}
@@ -672,7 +672,7 @@ void hrtimer_init_sleeper(struct hrtimer
void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
{
sl->timer.function = hrtimer_wakeup;
- sl->task = task;
+ sl->wake_target = task_wake_target(task);
}

static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -688,9 +688,9 @@ static int __sched do_nanosleep(struct h
hrtimer_cancel(&t->timer);
mode = HRTIMER_ABS;

- } while (t->task && !signal_pending(current));
-
- return t->task == NULL;
+ } while (t->wake_target && !signal_pending(current));
+
+ return t->wake_target == NULL;
}

long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
diff -r df7bc026d50e -r 4ea674e8825e kernel/kthread.c
--- a/kernel/kthread.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/kthread.c Mon Jan 29 15:46:47 2007 -0800
@@ -232,7 +232,7 @@ int kthread_stop(struct task_struct *k)

/* Now set kthread_should_stop() to true, and wake it up. */
kthread_stop_info.k = k;
- wake_up_process(k);
+ wake_up_task(k);
put_task_struct(k);

/* Once it dies, reset stop ptr, gather result and we're done. */
diff -r df7bc026d50e -r 4ea674e8825e kernel/module.c
--- a/kernel/module.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/module.c Mon Jan 29 15:46:47 2007 -0800
@@ -508,7 +508,7 @@ static void module_unload_init(struct mo
/* Hold reference count during initialization. */
local_set(&mod->ref[raw_smp_processor_id()].count, 1);
/* Backwards compatibility macros put refcount during init. */
- mod->waiter = current;
+ mod->wake_target = task_wake_target(current);
}

/* modules using other modules */
@@ -699,7 +699,7 @@ sys_delete_module(const char __user *nam
}

/* Set this up before setting mod->state */
- mod->waiter = current;
+ mod->wake_target = task_wake_target(current);

/* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
@@ -797,7 +797,7 @@ void module_put(struct module *module)
local_dec(&module->ref[cpu].count);
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
- wake_up_process(module->waiter);
+ wake_up_target(module->wake_target);
put_cpu();
}
}
diff -r df7bc026d50e -r 4ea674e8825e kernel/mutex-debug.c
--- a/kernel/mutex-debug.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/mutex-debug.c Mon Jan 29 15:46:47 2007 -0800
@@ -53,6 +53,7 @@ void debug_mutex_free_waiter(struct mute
memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
}

+#warning "this is going to need updating for fibrils"
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
struct thread_info *ti)
{
@@ -67,12 +68,12 @@ void mutex_remove_waiter(struct mutex *l
struct thread_info *ti)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
- DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
+ DEBUG_LOCKS_WARN_ON(waiter->wake_target != task_wake_target(ti->task));
DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
ti->task->blocked_on = NULL;

list_del_init(&waiter->list);
- waiter->task = NULL;
+ waiter->wake_target = NULL;
}

void debug_mutex_unlock(struct mutex *lock)
diff -r df7bc026d50e -r 4ea674e8825e kernel/mutex.c
--- a/kernel/mutex.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/mutex.c Mon Jan 29 15:46:47 2007 -0800
@@ -137,7 +137,7 @@ __mutex_lock_common(struct mutex *lock,

/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
- waiter.task = task;
+ waiter.wake_target = task_wake_target(task);

for (;;) {
/*
@@ -246,7 +246,7 @@ __mutex_unlock_common_slowpath(atomic_t

debug_mutex_wake_waiter(lock, waiter);

- wake_up_process(waiter->task);
+ wake_up_target(waiter->wake_target);
}

debug_mutex_clear_owner(lock);
diff -r df7bc026d50e -r 4ea674e8825e kernel/posix-cpu-timers.c
--- a/kernel/posix-cpu-timers.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/posix-cpu-timers.c Mon Jan 29 15:46:47 2007 -0800
@@ -673,7 +673,7 @@ static void cpu_timer_fire(struct k_itim
* This a special case for clock_nanosleep,
* not a normal timer from sys_timer_create.
*/
- wake_up_process(timer->it_process);
+ wake_up_target(timer->it_wake_target);
timer->it.cpu.expires.sched = 0;
} else if (timer->it.cpu.incr.sched == 0) {
/*
@@ -1423,6 +1423,12 @@ static int do_cpu_nanosleep(const clocki
timer.it_overrun = -1;
error = posix_cpu_timer_create(&timer);
timer.it_process = current;
+ /*
+ * XXX This isn't quite right, but the rest of the it_process users
+ * fall under the currently unresolved question of how signal delivery
+ * will behave.
+ */
+ timer.it_wake_target = task_wake_target(current);
if (!error) {
static struct itimerspec zero_it;

diff -r df7bc026d50e -r 4ea674e8825e kernel/ptrace.c
--- a/kernel/ptrace.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/ptrace.c Mon Jan 29 15:46:47 2007 -0800
@@ -221,7 +221,7 @@ static inline void __ptrace_detach(struc
__ptrace_unlink(child);
/* .. and wake it up. */
if (child->exit_state != EXIT_ZOMBIE)
- wake_up_process(child);
+ wake_up_task(child);
}

int ptrace_detach(struct task_struct *child, unsigned int data)
diff -r df7bc026d50e -r 4ea674e8825e kernel/rtmutex.c
--- a/kernel/rtmutex.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/rtmutex.c Mon Jan 29 15:46:47 2007 -0800
@@ -516,7 +516,8 @@ static void wakeup_next_waiter(struct rt
}
spin_unlock_irqrestore(&pendowner->pi_lock, flags);

- wake_up_process(pendowner);
+#warning "this looks like it needs expert attention"
+ wake_up_task(pendowner);
}

/*
@@ -640,7 +641,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
/* Signal pending? */
if (signal_pending(current))
ret = -EINTR;
- if (timeout && !timeout->task)
+ if (timeout && !timeout->wake_target)
ret = -ETIMEDOUT;
if (ret)
break;
diff -r df7bc026d50e -r 4ea674e8825e kernel/sched.c
--- a/kernel/sched.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/sched.c Mon Jan 29 15:46:47 2007 -0800
@@ -1381,10 +1381,52 @@ static inline int wake_idle(int cpu, str
}
#endif

+/*
+ * This path wakes a fibril.
+ *
+ * In the common case, a task will be sleeping with multiple pending
+ * sleeping fibrils. In that case we need to put the fibril on the task's
+ * runnable list and wake the task itself. We need it to go back through
+ * the scheduler to find the runnable fibril so we set TIF_NEED_RESCHED.
+ *
+ * A derivative of that case is when the fibril that we're waking is already
+ * current on the sleeping task. In that case we just need to wake the
+ * task itself, it will already be executing the fibril we're waking. We
+ * do not put it on the runnable list in that case.
+ *
+ * XXX Obviously, there are lots of very scary races here. We should get
+ * more confidence that they're taken care of.
+ */
+static int try_to_wake_up_fibril(struct task_struct *tsk, void *wake_target,
+ unsigned int state)
+{
+ struct fibril *fibril = (struct fibril *)
+ ((unsigned long)wake_target & ~1UL); /* mask off tag bit 0 to recover the fibril pointer */
+ long old_state = fibril->state;
+ int ret = 1; /* nonzero makes try_to_wake_up() jump to its 'out' label without waking the task */
+
+ if (!(old_state & state)) /* fibril is not in any of the states this wakeup targets */
+ goto out;
+
+ ret = 0; /* zero lets try_to_wake_up() carry on and wake the task itself */

+ fibril->state = TASK_RUNNING;
+

+ if (fibril->ti->task->fibril != fibril) { /* fibril is not the one currently installed on its task */

+ BUG_ON(!list_empty(&fibril->run_list)); /* it must not already be linked on a run list */
+ list_add_tail(&fibril->run_list, &tsk->runnable_fibrils);

+ if (!tsk->array) /* NOTE(review): presumably means the task is not queued on a runqueue - confirm */
+ set_ti_thread_flag(task_thread_info(tsk),
+ TIF_NEED_RESCHED);
+ }
+
+out:
+ return ret;
+}
+
/***
* try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
- * @state: the mask of task states that can be woken
+ * @wake_target: the to-be-woken-up sleeper, from task_wake_target()
+ * @state: the mask of states that can be woken
* @sync: do a synchronous wakeup?
*
* Put it on the run-queue if it's not already there. The "current"
@@ -1395,9 +1437,10 @@ static inline int wake_idle(int cpu, str
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
+static int try_to_wake_up(void *wake_target, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
+ struct task_struct *p = wake_target_to_task(wake_target);
unsigned long flags;
long old_state;
struct rq *rq;
@@ -1408,6 +1451,12 @@ static int try_to_wake_up(struct task_st
#endif

rq = task_rq_lock(p, &flags);
+
+ /* See if we're just putting a fibril on its task's runnable list */
+ if (unlikely(((unsigned long)wake_target & 1) &&
+ try_to_wake_up_fibril(p, wake_target, state)))
+ goto out;
+
old_state = p->state;
if (!(old_state & state))
goto out;
@@ -1555,16 +1604,27 @@ out:
return success;
}

-int fastcall wake_up_process(struct task_struct *p)
-{
- return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+int fastcall wake_up_task(struct task_struct *task)
+{
+ return try_to_wake_up((void *)task, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
-EXPORT_SYMBOL(wake_up_process);
-
+EXPORT_SYMBOL(wake_up_task);
+
+int fastcall wake_up_target(void *wake_target)
+{
+ return try_to_wake_up(wake_target, TASK_STOPPED | TASK_TRACED |
+ TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(wake_up_target);
+
+/*
+ * XXX We need to figure out how signal delivery will wake the fibrils in
+ * a task.
+ */
int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0);
+ return try_to_wake_up((void *)p, state, 0);
}

static void task_running_tick(struct rq *rq, struct task_struct *p);
@@ -2041,7 +2101,7 @@ static void sched_migrate_task(struct ta

get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(mt);
+ wake_up_task(mt);
put_task_struct(mt);
wait_for_completion(&req.done);

@@ -2673,7 +2733,7 @@ redo:
}
spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance)
- wake_up_process(busiest->migration_thread);
+ wake_up_task(busiest->migration_thread);

/*
* We've kicked active balancing, reset the failure
@@ -3781,6 +3841,33 @@ need_resched:

#endif /* CONFIG_PREEMPT */

+/*
+ * This is a void * so that it's harder for people to stash it in a small
+ * scalar without getting warnings.
+ */
+void *task_wake_target(struct task_struct *task)
+{
+ if (task->fibril) { /* task has a fibril installed: hand out the fibril as the target */
+ return (void *)((unsigned long)task->fibril | 1); /* set bit 0 to tag the pointer as a fibril */
+ } else {
+ BUG_ON((unsigned long)task & 1); /* an untagged task pointer must have bit 0 clear */
+ return task;
+ }
+}
+EXPORT_SYMBOL(task_wake_target);
+
+struct task_struct *wake_target_to_task(void *wake_target)
+{
+ if ((unsigned long)wake_target & 1) { /* bit 0 set: this is a tagged fibril pointer */
+ struct fibril *fibril;
+ fibril = (struct fibril *) ((unsigned long)wake_target ^ 1); /* bit 0 is known set here, so ^ 1 clears it */
+ return fibril->ti->task; /* the task reached through the fibril's ->ti */
+ } else
+ return (struct task_struct *)((unsigned long)wake_target); /* untagged: the target is the task_struct itself */
+}
+EXPORT_SYMBOL(wake_target_to_task);
+
+
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
void *key)
{
@@ -5140,7 +5227,7 @@ int set_cpus_allowed(struct task_struct
if (migrate_task(p, any_online_cpu(new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
- wake_up_process(rq->migration_thread);
+ wake_up_task(rq->migration_thread);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
return 0;
@@ -5462,7 +5549,7 @@ migration_call(struct notifier_block *nf

case CPU_ONLINE:
/* Strictly unneccessary, as first user will wake it. */
- wake_up_process(cpu_rq(cpu)->migration_thread);
+ wake_up_task(cpu_rq(cpu)->migration_thread);
break;

#ifdef CONFIG_HOTPLUG_CPU
diff -r df7bc026d50e -r 4ea674e8825e kernel/signal.c
--- a/kernel/signal.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/signal.c Mon Jan 29 15:46:47 2007 -0800
@@ -948,7 +948,7 @@ __group_complete_signal(int sig, struct
signal_wake_up(t, 0);
t = next_thread(t);
} while (t != p);
- wake_up_process(p->signal->group_exit_task);
+ wake_up_task(p->signal->group_exit_task);
return;
}

diff -r df7bc026d50e -r 4ea674e8825e kernel/softirq.c
--- a/kernel/softirq.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/softirq.c Mon Jan 29 15:46:47 2007 -0800
@@ -58,7 +58,7 @@ static inline void wakeup_softirqd(void)
struct task_struct *tsk = __get_cpu_var(ksoftirqd);

if (tsk && tsk->state != TASK_RUNNING)
- wake_up_process(tsk);
+ wake_up_task(tsk);
}

/*
@@ -583,7 +583,7 @@ static int __cpuinit cpu_callback(struct
per_cpu(ksoftirqd, hotcpu) = p;
break;
case CPU_ONLINE:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
+ wake_up_task(per_cpu(ksoftirqd, hotcpu));
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
diff -r df7bc026d50e -r 4ea674e8825e kernel/stop_machine.c
--- a/kernel/stop_machine.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/stop_machine.c Mon Jan 29 15:46:47 2007 -0800
@@ -185,7 +185,7 @@ struct task_struct *__stop_machine_run(i
p = kthread_create(do_stop, &smdata, "kstopmachine");
if (!IS_ERR(p)) {
kthread_bind(p, cpu);
- wake_up_process(p);
+ wake_up_task(p);
wait_for_completion(&smdata.done);
}
up(&stopmachine_mutex);
diff -r df7bc026d50e -r 4ea674e8825e kernel/timer.c
--- a/kernel/timer.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/timer.c Mon Jan 29 15:46:47 2007 -0800
@@ -1290,7 +1290,7 @@ asmlinkage long sys_getegid(void)

static void process_timeout(unsigned long __data)
{
- wake_up_process((struct task_struct *)__data);
+ wake_up_task((struct task_struct *)__data);
}

/**
diff -r df7bc026d50e -r 4ea674e8825e kernel/workqueue.c
--- a/kernel/workqueue.c Mon Jan 29 15:36:16 2007 -0800
+++ b/kernel/workqueue.c Mon Jan 29 15:46:47 2007 -0800
@@ -504,14 +504,14 @@ struct workqueue_struct *__create_workqu
if (!p)
destroy = 1;
else
- wake_up_process(p);
+ wake_up_task(p);
} else {
list_add(&wq->list, &workqueues);
for_each_online_cpu(cpu) {
p = create_workqueue_thread(wq, cpu, freezeable);
if (p) {
kthread_bind(p, cpu);
- wake_up_process(p);
+ wake_up_task(p);
} else
destroy = 1;
}
@@ -773,7 +773,7 @@ static int __devinit workqueue_cpu_callb

cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
kthread_bind(cwq->thread, hotcpu);
- wake_up_process(cwq->thread);
+ wake_up_task(cwq->thread);
}
mutex_unlock(&workqueue_mutex);
break;
diff -r df7bc026d50e -r 4ea674e8825e lib/rwsem.c
--- a/lib/rwsem.c Mon Jan 29 15:36:16 2007 -0800
+++ b/lib/rwsem.c Mon Jan 29 15:46:47 2007 -0800
@@ -30,7 +30,7 @@ EXPORT_SYMBOL(__init_rwsem);

struct rwsem_waiter {
struct list_head list;
- struct task_struct *task;
+ void *wake_target;
unsigned int flags;
#define RWSEM_WAITING_FOR_READ 0x00000001
#define RWSEM_WAITING_FOR_WRITE 0x00000002
@@ -50,7 +50,7 @@ __rwsem_do_wake(struct rw_semaphore *sem
__rwsem_do_wake(struct rw_semaphore *sem, int downgrading)
{
struct rwsem_waiter *waiter;
- struct task_struct *tsk;
+ void *wake_target;
struct list_head *next;
signed long oldcount, woken, loop;

@@ -75,16 +75,17 @@ __rwsem_do_wake(struct rw_semaphore *sem
if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE))
goto readers_only;

- /* We must be careful not to touch 'waiter' after we set ->task = NULL.
- * It is an allocated on the waiter's stack and may become invalid at
- * any time after that point (due to a wakeup from another source).
+ /* We must be careful not to touch 'waiter' after we set ->wake_target
+ * = NULL. It is allocated on the waiter's stack and may become
+ * invalid at any time after that point (due to a wakeup from another
+ * source).
*/
list_del(&waiter->list);
- tsk = waiter->task;
+ wake_target = waiter->wake_target;
smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
+ waiter->wake_target = NULL;
+ wake_up_target(wake_target);
+ put_task_struct(wake_target_to_task(wake_target));
goto out;

/* don't want to wake any writers */
@@ -123,11 +124,11 @@ __rwsem_do_wake(struct rw_semaphore *sem
for (; loop > 0; loop--) {
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
- tsk = waiter->task;
+ wake_target = waiter->wake_target;
smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
+ waiter->wake_target = NULL;
+ wake_up_target(wake_target);
+ put_task_struct(wake_target_to_task(wake_target));
}

sem->wait_list.next = next;
@@ -157,7 +158,7 @@ rwsem_down_failed_common(struct rw_semap

/* set up my own style of waitqueue */
spin_lock_irq(&sem->wait_lock);
- waiter->task = tsk;
+ waiter->wake_target = task_wake_target(tsk);
get_task_struct(tsk);

list_add_tail(&waiter->list, &sem->wait_list);
@@ -173,7 +174,7 @@ rwsem_down_failed_common(struct rw_semap

/* wait to be given the lock */
for (;;) {
- if (!waiter->task)
+ if (!waiter->wake_target)
break;
schedule();
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
diff -r df7bc026d50e -r 4ea674e8825e mm/pdflush.c
--- a/mm/pdflush.c Mon Jan 29 15:36:16 2007 -0800
+++ b/mm/pdflush.c Mon Jan 29 15:46:47 2007 -0800
@@ -217,7 +217,7 @@ int pdflush_operation(void (*fn)(unsigne
last_empty_jifs = jiffies;
pdf->fn = fn;
pdf->arg0 = arg0;
- wake_up_process(pdf->who);
+ wake_up_task(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
return ret;
diff -r df7bc026d50e -r 4ea674e8825e net/core/pktgen.c
--- a/net/core/pktgen.c Mon Jan 29 15:36:16 2007 -0800
+++ b/net/core/pktgen.c Mon Jan 29 15:46:47 2007 -0800
@@ -3505,7 +3505,7 @@ static int __init pktgen_create_thread(i
pe->proc_fops = &pktgen_thread_fops;
pe->data = t;

- wake_up_process(p);
+ wake_up_task(p);

return 0;

Ingo Molnar

unread,

Feb 1, 2007, 3:38:49 AM2/1/07

to Zach Brown

* Zach Brown <zach....@oracle.com> wrote:

> This patch introduces the notion of a 'fibril'. It's meant to be a

> lighter kernel thread. [...]

as per my other email, i don't really like this concept. This is the
killer:

> [...] There can be multiple of them in the process of executing for a

> given task_struct, but only one can ever be actively running at a

> time. [...]

there's almost no scheduling cost from being able to arbitrarily
schedule a kernel thread - but there are /huge/ benefits in it.

would it be hard to redo your AIO patches based on a pool of plain
simple kernel threads?

We could even extend the scheduling properties of kernel threads so that
they could also be 'companion threads' of any given user-space task.
(i.e. they'd always schedule on the same CPU as that user-space task)

I bet most of the real benefit would come from co-scheduling them on the
same CPU. But this should be a performance property, not a basic design
property. (And i also think that having a limited per-CPU pool of AIO
threads works better than having a per-user-thread pool - but again this
is a detail that can be easily changed, not a fundamental design
property.)

Ingo

Ingo Molnar

unread,

Feb 1, 2007, 8:05:04 AM2/1/07

to Zach Brown

* Ingo Molnar <mi...@elte.hu> wrote:

> * Zach Brown <zach....@oracle.com> wrote:
>
> > This patch introduces the notion of a 'fibril'. It's meant to be a
> > lighter kernel thread. [...]
>
> as per my other email, i don't really like this concept. This is the
> killer:

let me clarify this: i very much like your AIO patchset in general, in
the sense that it 'completes' the AIO implementation: finally everything
can be done via it, greatly increasing its utility and hopefully its
penetration. This is the most important step, by far.

what i dont really like /the particular/ concept above - the
introduction of 'fibrils' as a hard distinction of kernel threads. They
are /almost/ kernel threads, but still by being different they create
alot of duplication and miss out on a good deal of features that kernel
threads have naturally.

It kind of hurts to say this because i'm usually quite concept-happy -
one can easily get addicted to the introduction of new core kernel
concepts :-) But i really, really think we dont want to do fibrils but
we want to do kernel threads, and i havent really seen a discussion
about why they shouldnt be done via kernel threads.

Nor have i seen a discussion that whatever threading concept we use for
AIO within the kernel, it is really a fallback thing, not the primary
goal of "native" KAIO design. The primary goal of KAIO design is to
arrive at a state machine - and for one of the most important IO
disciplines, networking, that is reality already. (For filesystem events
i doubt we will ever be able to build an IO state machine - but there
are lots of crazy folks out there so it's not fundamentally impossible,
just very, very hard.)

so my suggestions center around the notion of extending kernel threads
to support the features you find important in fibrils:

> would it be hard to redo your AIO patches based on a pool of plain
> simple kernel threads?
>
> We could even extend the scheduling properties of kernel threads so
> that they could also be 'companion threads' of any given user-space
> task. (i.e. they'd always schedule on the same CPu as that user-space
> task)
>
> I bet most of the real benefit would come from co-scheduling them on
> the same CPU. But this should be a performance property, not a basic
> design property. (And i also think that having a limited per-CPU pool
> of AIO threads works better than having a per-user-thread pool - but
> again this is a detail that can be easily changed, not a fundamental
> design property.)

but i'm willing to be convinced of the opposite as well, as always. (I'm
real good at quickly changing my mind, especially when i'm embarrassingly
wrong about something. So please fire away and dont hold back.)

Christoph Hellwig

unread,

Feb 1, 2007, 8:19:43 AM2/1/07

to Ingo Molnar

On Thu, Feb 01, 2007 at 02:02:34PM +0100, Ingo Molnar wrote:
> what i dont really like /the particular/ concept above - the
> introduction of 'fibrils' as a hard distinction of kernel threads. They
> are /almost/ kernel threads, but still by being different they create
> alot of duplication and miss out on a good deal of features that kernel
> threads have naturally.
>
> It kind of hurts to say this because i'm usually quite concept-happy -
> one can easily get addicted to the introduction of new core kernel
> concepts :-) But i really, really think we dont want to do fibrils but
> we want to do kernel threads, and i havent really seen a discussion
> about why they shouldnt be done via kernel threads.

I tend to agree. Note that there is one thing we should be doing one
day (not only if we want to use it for aio) is to make kernel threads
more lightweight. There is a lot of baggage we keep around in task_struct
and co that only makes sense for threads that have a user space part and
aren't or shouldn't be needed for a purely kernel-resident thread.

Ingo Molnar

unread,

Feb 1, 2007, 8:55:32 AM2/1/07

to Christoph Hellwig, Zach Brown, linux-...@vger.kernel.org, linu...@kvack.org, Suparna Bhattacharya, Benjamin LaHaise, Linus Torvalds

* Christoph Hellwig <h...@infradead.org> wrote:

> I tend to agree. Note that there is one thing we should be doing one
> one day (not only if we want to use it for aio) is to make kernel

> threads more lightweight. There is a lot of baggage we keep around in

> task_struct and co that only makes sense for threads that have a user
> space part and aren't or shouldn't be needed for a purely
> kernel-resistant thread.

yeah. I'm totally open to such efforts. I'd also be most happy if this
was primarily driven via the KAIO effort: i.e. to implement it via
kernel threads and then to benchmark the hell out of it. I volunteer to
fix whatever fat kernel thread handling has left.

and if people agree with me that 'native' state-machine driven KAIO is
where we want to ultimately achieve (it is certainly the best performing
implementation) then i dont see the point in fibrils as an interim
mechanism anyway. Lets just hide AIO complexities from userspace via
kernel threads, and optimize this via two methods: by making kernel
threads faster, and by simultaneously and gradually converting as much
KAIO code to a native state machine - which would not need any kind of
kernel thread help anyway.

(plus as i mentioned previously, co-scheduling kernel threads with
related user space threads on the same CPU might be something useful too
- not just for KAIO, and we could add that too.)

also, we context-switch kernel threads in 350 nsecs on current hardware
and the -rt kernel is certainly happy with that and runs all hardirqs
and softirqs in separate kernel thread contexts. There's not /that/ much
fat left to cut off - and if there's something more to optimize there
then there are a good number of projects interested in that, not just
the KAIO effort :)

Ingo

Mark Lord

unread,

Feb 1, 2007, 12:14:37 PM2/1/07

to Ingo Molnar

Ingo Molnar wrote:
>
> also, we context-switch kernel threads in 350 nsecs on current hardware
> and the -rt kernel is certainly happy with that and runs all hardirqs

Ingo, how relevant is that "350 nsecs on current hardware" claim?

I don't mean that in a bad way, but my own experience suggests that
most people doing real hard RT (or tight soft RT) are not doing it
on x86 architectures. But rather on lowly 1GHz (or less) ARM based
processors and the like.

For RT issues, those are the platforms I care more about,
as those are the ones that get embedded into real-time devices.

??

Cheers

Ingo Molnar

unread,

Feb 1, 2007, 1:05:46 PM2/1/07

to Mark Lord

* Mark Lord <lk...@rtr.ca> wrote:

> >also, we context-switch kernel threads in 350 nsecs on current
> >hardware and the -rt kernel is certainly happy with that and runs all
> >hardirqs
>
> Ingo, how relevant is that "350 nsecs on current hardware" claim?
>
> I don't mean that in a bad way, but my own experience suggests that
> most people doing real hard RT (or tight soft RT) are not doing it on
> x86 architectures. But rather on lowly 1GHz (or less) ARM based
> processors and the like.

it's not relevant to those embedded boards, but it's relevant to the AIO
discussion, which centers around performance.

> For RT issues, those are the platforms I care more about, as those are
> the ones that get embedded into real-time devices.

yeah. Nevertheless if you want to use -rt on your desktop (under Fedora
4/5/6) you can track an rpmized+distroized full kernel package quite
easily, via 3 easy commands:

cd /etc/yum.repos.d
wget http://people.redhat.com/~mingo/realtime-preempt/rt.repo

yum install kernel-rt.x86_64 # on x86_64
yum install kernel-rt # on i686

which is closely tracking latest upstream -git. (for example, the
current kernel-rt-2.6.20-rc7.1.rt3.0109.i686.rpm is based on
2.6.20-rc7-git1, so if you want to run a kernel rpm that has all of
Linus' latest commits from yesterday, this might be for you.)

it's rumored to be a quite smooth kernel ;-) So in this sense, because
this also runs on all my testboxes by default, it matters on modern
hardware too, at least to me. Today's commodity hardware is tomorrow's
embedded hardware. If a kernel is good on today's colorful desktop
hardware then it will be perfect for tomorrow's embedded hardware.

Ingo

Linus Torvalds

unread,

Feb 1, 2007, 3:08:46 PM2/1/07

to Ingo Molnar

On Thu, 1 Feb 2007, Ingo Molnar wrote:
>
> there's almost no scheduling cost from being able to arbitrarily
> schedule a kernel thread - but there are /huge/ benefits in it.

That's a singularly *stupid* argument.

Of course scheduling is fast. That's the whole *point* of fibrils. They
still schedule. Nobody claimed anything else.

Bringing up RT kernels and scheduling latency is idiotic. It's like saying
"we should do this because the sky is blue". Sure, that's true, but what
the *hell* does Rayleigh scattering have to do with anything?

The cost has _never_ been scheduling. That was never the point. Why do you
even bring it up? Only to make an argument that makes no sense?

The cost of AIO is

- maintenance. It's a separate code-path, and it's one that simply doesn't
fit into anything else AT ALL. It works (mostly) for simple things, ie
reads and writes, but even there, it's really adding a lot of crud that
we could do without.

- setup and teardown costs: both in CPU and in memory. These are the big
costs. It's especially true since a lot of AIO actually ends up cached.
The user program just wants the data - 99% of the time it's likely to
be there, and the whole point of AIO is to get at it cheaply, but not
block if it's not there.

So your scheduling arguments are inane. They totally miss the point. They
have nothing to do with *anything*.

Ingo: everybody *agrees* that scheduling is cheap. Scheduling isn't the
issue. Scheduling isn't even needed in the perfect path where the AIO
didn't need to do any real IO (and that _is_ the path we actually would
like to optimize most).

So instead of talking about totally irrelevant things, please keep your
eyes on the ball.

So I claim that the ball is here:

- cached data (and that is *especially* true of some of the more
interesting things we can do with a more generic AIO thing: path
lookup, inode filling (stat/fstat) etc usually has hit-rates in the 99%
range, but missing even just 1% of the time can be deadly, if the miss
costs you a hundred msec of not doing anything else!)

Do the math. A "stat()" system call generally takes on the order of a
couple of microseconds. But if it misses even just 1% of the time (and
takes 100 msec when it does that, because there is other IO also
competing for the disk arm), ON AVERAGE it takes 1ms.

So what you should aim for is improving that number. The cached case
should hopefully still be in the microseconds, and the uncached case
should be nonblocking for the caller.

- setup/teardown costs. Both memory and CPU. This is where the current
threads simply don't work. The setup cost of doing a clone/exit is
actually much higher than the cost of doing the whole operation, most
of the time. Remember: caches still work.

- maintenance. Clearly AIO will always have some special code, but if we
can move the special code *away* from filesystems and networking and
all the thousands of device drivers, and into core kernel code, we've
done something good. And if we can extend it from just pure read/write
into just about *anything*, then people will be happy.

So stop blathering about scheduling costs, RT kernels and interrupts.
Interrupts generally happen a few thousand times a second. This is
something you want to do a *million* times a second, without any IO
happening at all except for when it has to.

Linus

Zach Brown

unread,

Feb 1, 2007, 4:53:25 PM2/1/07

to Ingo Molnar

> let me clarify this: i very much like your AIO patchset in general, in
> the sense that it 'completes' the AIO implementation: finally
> everything
> can be done via it, greatly increasing its utility and hopefully its
> penetration. This is the most important step, by far.

We violently agree on this :).

> what i dont really like /the particular/ concept above - the
> introduction of 'fibrils' as a hard distinction of kernel threads.
> They
> are /almost/ kernel threads, but still by being different they create
> alot of duplication and miss out on a good deal of features that
> kernel
> threads have naturally.

I might quibble with some of the details, but I understand your
fundamental concern. I do. I don't get up each morning *thrilled*
by the idea of having to update lockdep, sysrq-t, etc, to understand
these fibril things :). The current fibril switch isn't nearly as
clever as the lock-free task scheduling switch. It'd be nice if we
didn't have to do that work to optimize the hell out of it, sure.

> It kind of hurts to say this because i'm usually quite concept-happy -
> one can easily get addicted to the introduction of new core kernel
> concepts :-)

:)

> so my suggestions center around the notion of extending kernel threads
> to support the features you find important in fibrils:
>
>> would it be hard to redo your AIO patches based on a pool of plain
>> simple kernel threads?

It'd certainly be doable to throw together a credible attempt to
service "asys" system call submission with full-on kernel threads.
That seems like reasonable due diligence to me. If full-on threads
are almost as cheap, great. If fibrils are so much cheaper that they
seem to warrant investing in, great.

I am concerned about the change in behaviour if we fall back to full
kernel threads, though. I really, really, want aio syscalls to
behave just like sync ones.

Would your strategy be to update the syscall implementations to share
data in task_struct so that there isn't as significant a change in
behaviour? (sharing current->ioprio, instead of just inheriting it,
for example.). We'd be betting that there would be few of these and
that they'd be pretty reasonable to share?

- z

Benjamin LaHaise

unread,

Feb 1, 2007, 5:24:38 PM2/1/07

to Zach Brown

On Thu, Feb 01, 2007 at 01:52:13PM -0800, Zach Brown wrote:
> >let me clarify this: i very much like your AIO patchset in general, in
> >the sense that it 'completes' the AIO implementation: finally
> >everything
> >can be done via it, greatly increasing its utility and hopefully its
> >penetration. This is the most important step, by far.
>
> We violently agree on this :).

There is also the old kernel_thread based method that should probably be
compared, especially if pre-created threads are thrown into the mix. Also,
since the old days, a lot of thread scaling issues have been fixed that
could even make userland threads more viable.

> Would your strategy be to update the syscall implementations to share
> data in task_struct so that there isn't as significant a change in
> behaviour? (sharing current->ioprio, instead if just inheriting it,
> for example.). We'd be betting that there would be few of these and
> that they'd be pretty reasonable to share?

Priorities cannot be shared, as they have to adapt to the per-request
priority when we get down to the nitty gritty of POSIX AIO, as otherwise
realtime issues like keepalive transmits will be handled incorrectly.

-ben
--
"Time is of no importance, Mr. President, only life is important."
Don't Email: <do...@kvack.org>.

Zach Brown

unread,

Feb 1, 2007, 5:39:08 PM2/1/07

to Benjamin LaHaise

> Priorities cannot be shared, as they have to adapt to the per-request
> priority when we get down to the nitty gitty of POSIX AIO, as
> otherwise
> realtime issues like keepalive transmits will be handled incorrectly.

Well, maybe not *blind* sharing. But something more than the
disconnect threads currently have with current->ioprio.

Today an existing kernel thread would most certainly ignore a
sys_ioprio_set() in the submitter and then handle an aio syscall with
an old current->ioprio.

Something more smart than that is all I'm on about.

- z

Ingo Molnar

unread,

Feb 2, 2007, 5:51:18 AM2/2/07

to Linus Torvalds

* Linus Torvalds <torv...@linux-foundation.org> wrote:

> So stop blathering about scheduling costs, RT kernels and interrupts.
> Interrupts generally happen a few thousand times a second. This is
> soemthing you want to do a *million* times a second, without any IO
> happening at all except for when it has to.

we might be talking past each other.

i never suggested every aio op should create/destroy a kernel thread!

My only suggestion was to have a couple of transparent kernel threads
(not fibrils) attached to a user context that does asynchronous
syscalls! Those kernel threads would be 'switched in' if the current
user-space thread blocks - so instead of having to 'create' any of them
- the fast path would be to /switch/ them to under the current
user-space, so that user-space processing can continue under that other
thread!

That means that in the 'switch kernel context' fastpath it simply needs
to copy the blocked threads' user-space ptregs (~64 bytes) to its own
kernel stack, and then it can do a return-from-syscall without
user-space noticing the switch! Never would we really see the cost of
kernel thread creation. We would never see that cost in the fully cached
case (no other thread is needed then), nor would we see it in the
blocking-IO case, due to pooling. (there are some other details related
to things like the FPU context, but you get the idea.)

Let me quote Zach's reply to my suggestions:

| It'd certainly be doable to throw together a credible attempt to
| service "asys" system call submission with full-on kernel threads.
| That seems like reasonable due diligence to me. If full-on threads

| are almost as cheap, great. If fibrils are so much cheaper that they
| seem to warrant investing in, great.

that's all i wanted to see being considered!

Please ignore my points about scheduling costs - i only talked about
them at length because the only fundamental difference between kernel
threads and fibrils is their /scheduling/ properties. /Not/ the
setup/teardown costs - those are not relevant /precisely/ because they
can be pooled and because they happen relatively rarely, compared to the
cached case. The 'switch to the blocked thread's ptregs' operation also
involves a context-switch under this design. That's why i was talking
about scheduling so much: the /only/ true difference between fibrils and
kernel threads is their /scheduling/.

I believe this is the point where your argument fails:

> - setup/teardown costs. Both memory and CPU. This is where the current
> threads simply don't work. The setup cost of doing a clone/exit is
> actually much higher than the cost of doing the whole operation,
> most of the time.

you are comparing apples to oranges - i never said we should
create/destroy a kernel thread for every async op. That would be insane!

what we need to support asynchronous system-calls is the ability to pick
up an /already created/ kernel thread from a pool of per-task kernel
threads and to switch it to under the current user-space and return to
the user-space stack with that new kernel thread running. (The other,
blocked kernel thread stays blocked and is returned into the pool of
'pending' AIO kernel threads.) And this only needs to happen in the
'cachemiss' case anyway. In the 'cached' case no other kernel thread
would be involved at all, the current one just falls straight through
the system-call.

my argument is that the whole notion of cutting this at the kernel stack
and thread info level and making fibrils in essence a separate
scheduling entity is wrong, wrong, wrong. Why not use plain kernel
threads for this?

[ finally, i think you totally ignored my main argument, state machines.
The networking stack is a full and very nice state machine. It's
kicked from user-space, and zillions of small contexts (sockets) are
living on without any of the originating tasks having to be involved.
So i'm still holding to the fundamental notion that within the kernel
this form of AIO is a nice but /secondary/ mechanism. If a subsystem
is able to pull it off, it can implement asynchronity via a state
machine - and it will outperform any thread based AIO. Or not. We'll
see. For something like the VFS i doubt we'll see (and i doubt we
/want/ to see) a 'native' state-machine implementation.

this is btw. quite close to the Tux model of doing asynchronous block
IO and asynchronous VFS events such as asynchronous open(). Tux uses a
pool of kernel threads to pass blocking work to, while not holding up
the 'main' thread. But the main Tux speedup comes from having a native
state machine for all the networking IO. ]

Ingo

Andi Kleen

unread,

Feb 2, 2007, 7:22:26 AM2/2/07

to Ingo Molnar

Ingo Molnar <mi...@elte.hu> writes:

> and for one of the most important IO
> disciplines, networking, that is reality already.

Not 100% -- a few things in TCP/IP at least are blocking still.
Mostly relatively obscure things though.

Also the sockets model is currently incompatible with direct zero-copy RX/TX,
which needs fixing.

-Andi

Andi Kleen

unread,

Feb 2, 2007, 7:23:59 AM2/2/07

to Christoph Hellwig

Christoph Hellwig <h...@infradead.org> writes:
>
> I tend to agree. Note that there is one thing we should be doing one
> one day (not only if we want to use it for aio) is to make kernel threads
> more lightweight. There is a lot of baggage we keep around in task_struct
> and co that only makes sense for threads that have a user space part and
> aren't or shouldn't be needed for a purely kernel-resistant thread.

I suspect you will get a lot of this for free from the current namespace
efforts.

-Andi

Linus Torvalds

unread,

Feb 2, 2007, 10:58:10 AM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Ingo Molnar wrote:
>
> My only suggestion was to have a couple of transparent kernel threads
> (not fibrils) attached to a user context that does asynchronous
> syscalls! Those kernel threads would be 'switched in' if the current
> user-space thread blocks - so instead of having to 'create' any of them
> - the fast path would be to /switch/ them to under the current
> user-space, so that user-space processing can continue under that other
> thread!

But in that case, you really do end up with "fibrils" anyway.

Because those fibrils are what would be the state for the blocked system
calls when they aren't scheduled.

We may have a few hundred thousand system calls a second (maybe that's not
actually reasonable, but it should be what we *aim* for), and 99% of them
will hopefully hit the cache and never need any separate IO, but even if
it's just 1%, we're talking about thousands of threads.

I do _not_ think that it's reasonable to have thousands of threads state
around just "in case". Especially if all those threadlets are then
involved in signals etc - something that they are totally uninterested in.

I think it's a lot more reasonable to have just the kernel stack page for
"this was where I was when I blocked". IOW, a fibril-like thing. You need
some data structure to set up the state *before* you start doing any
threads at all, because hopefully the operation will be totally
synchronous, and no secondary thread is ever really needed!

What I like about fibrils is that they should be able to handle the cached
case well: the case where no "real" scheduling (just the fibril stack
switches) takes place.

Now, most traditional DB loads would tend to use AIO only when they "know"
that real IO will take place (the AIO call itself will probably be
O_DIRECT most of the time). So I suspect that a lot of those users will
never really have the cached case, but one of my hopes is to be able to do
exactly the things that we have *not* done well: asynchronous file opens
and pathname lookups, which is very common in a file server.

If done *really* right, a perfectly normal app could do things like
asynchronous stat() calls to fill in the readdir results. In other words,
what *I* would like to see is the ability to have something *really*
simple like "ls" use this, without it actually being a performance hit
for the common case where everything is cached.

Have you done "ls -l" on a big uncached directory where the inodes
are all over the disk lately? You can hear the disk whirr. THAT is the
kind of "normal user" thing I'd like to be able to fix, and the db case is
actually secondary. The DB case is much much more limited (ok, so somebody
pointed out that they want slightly more than just read/write, but
still.. We're talking "special code".)

> [ finally, i think you totally ignored my main argument, state machines.

I ignored your argument, because it's not really relevant. The fact that
networking (and TCP in particular) has state machines is because it is a
packetized environment. Nothing else is. Think pathname lookup etc. They
are all *fundamentally* environments with a call stack.

So the state machine argument is totally bogus - it results in a
programming model that simply doesn't match the *normal* setup. You want
the kernel programming model to appear "linear" even when it isn't,
because it's too damn hard to think nonlinearly.

Yes, we could do pathname lookup with that kind of insane setup too. But
it would be HORRID!

Linus

Alan

unread,

Feb 2, 2007, 2:48:44 PM2/2/07

to Linus Torvalds

This one got shelved while I sorted other things out as it warranted a
longer look. Some comments follow, but firstly can we please bury this
"fibril" name. The constructs Zach is using appear to be identical to
co-routines, and they've been called that in computer science literature
for fifty years. They are one of the great and somehow forgotten ideas.
(and I admit I've used them extensively in past things where its
wonderful for multi-player gaming so I'm a convert already).

The stuff however isn't as free as you make out. Current kernel logic
knows about various things being "safe" but with fibrils you have to
address additional questions such as "What happens if I issue an I/O and
change priority". You also have an 800lb gorilla hiding behind a tree
waiting for you in privilege and permission checking.

Right now current->*u/gid is safe across a syscall start to end, with an
asynchronous setuid all hell breaks loose. I'm not saying we shouldn't do
this, in fact we'd be able to do some of the utterly moronic poxix thread
uid handling in kernel space if we did, just that it isn't free. We have
locking rules defined by the magic serializing construct called
"the syscall" and you break those.

I'd expect the odd other gorilla waiting to mug you as well and the ones
nobody has thought of will be the worst 8)

The number of co-routines and stacks can be dealt with two ways - you use
small stacks allocated when you create a fibril, or you grab a page, use
separate IRQ stacks and either fail creation with -ENOBUFS etc which
drops work on user space, or block (for which cases ??) which also means
an overhead on co-routine exits. That can be tunable, for embedded easily
tuned right down.

Traditional co-routines have clear notions of being able to create a
co-routine, stack them and fire up specific ones. In part this is done
because many things expressed in this way know what to fire up next. It's
also a very clean way to express driver problems with a lot of state.

Essentially as a co-routine is simply making "%esp" roughly the same as
the C++ world's "self".

You get some other funny things from co-routines which are very powerful,
very dangerous, or plain insane depending upon your view of life. One big
one is the ability for real men (and women) to do stuff like this,
because you don't need to keep the context attached to the same task.

send_reset_command(dev);
wait_for_irq_event(dev->irq);
/* co-routine continues in IRQ context here */
clean_up_reset_command(dev);
exit_irq_event();
/* co-routine continues out of IRQ context here */
send_identify_command(dev);

Notice we just dealt with all the IRQ stack problems the moment an IRQ is
a co-routine transfer 8)

Ditto with timers, although for the kernel that might not be smart as we
have a lot of timers.

Less insanely you can create a context, start doing stuff in it and then
pass it to someone else local variables, state and all. This one is
actually rather useful for avoiding a lot of the 'D' state crap in the
kernel.

For example we have driver code that sleeps uninterruptibly because its
too hard to undo the mess and get out of the current state if it is
interrupted. In the world of sending other people co-routines you just do
this

coroutine_set(MUST_COMPLETE);

and in exit

foreach(coroutine)
if(coroutine->flags & MUST_COMPLETE)
inherit_coroutine(init, coroutine);

and obviously you don't pass any over that will then not do the right
thing before accessing user space (well unless implementing
'read_for_someone_else()' or other strange syscalls - like ptrace...)

Other questions really relate to the scheduling - Zach do you intend
schedule_fibrils() to be a call code would make or just from schedule() ?

Linus will now tell me I'm out of my tree...

Alan (who used to use Co-routines in real languages on 36bit
computers with 9bit bytes before learning C)

Linus Torvalds

unread,

Feb 2, 2007, 3:15:16 PM2/2/07

to Alan

On Fri, 2 Feb 2007, Alan wrote:
>
> This one got shelved while I sorted other things out as it warranted a
> longer look. Some comments follow, but firstly can we please bury this
> "fibril" name. The constructs Zach is using appear to be identical to
> co-routines, and they've been called that in computer science literature
> for fifty years. They are one of the great and somehow forgotten ideas.
> (and I admit I've used them extensively in past things where its
> wonderful for multi-player gaming so I'm a convert already).

Well, they are indeed coroutines, but they are coroutines in the same
sense any "CPU scheduler" ends up being a coroutine.

They are NOT the generic co-routine that some languages support natively.
So I think trying to call them coroutines would be even more misleading
than calling them fibrils.

In other words the whole *point* of the fibril is that you can do
"coroutine-like stuff" while using a "normal functional linear programming
paradigm".

Wouldn't you agree?

(I love the concept of coroutines, but I absolutely detest what the code
ends up looking like. There's a good reason why people program mostly in
linear flow: that's how people think consciously - even if it's obviously
not how the brain actually works).

And we *definitely* don't want to have a coroutine programming interface
in the kernel. Not in C.

> The stuff however isn't as free as you make out. Current kernel logic
> knows about various things being "safe" but with fibrils you have to
> address additional questions such as "What happens if I issue an I/O and
> change priority". You also have an 800lb gorilla hiding behind a tree
> waiting for you in priviledge and permission checking.

This is why I think it should be 100% clear that things happen in process
context. That just answers everything. If you want to synchronize with
async events and change IO priority, you should do exactly that:

wait_for_async();
ioprio(newprority);

and that "solves" that problem. Leave it to user space.

> Right now current->*u/gid is safe across a syscall start to end, with an
> asynchronous setuid all hell breaks loose. I'm not saying we shouldn't do
> this, in fact we'd be able to do some of the utterly moronic poxix thread
> uid handling in kernel space if we did, just that it isn't free. We have
> locking rules defined by the magic serializing construct called
> "the syscall" and you break those.

I agree. As mentioned, we probably will have fallout.

> The number of co-routines and stacks can be dealt with two ways - you use
> small stacks allocated when you create a fibril, or you grab a page, use
> separate IRQ stacks and either fail creation with -ENOBUFS etc which
> drops work on user space, or block (for which cases ??) which also means
> an overhead on co-routine exits. That can be tunable, for embedded easily
> tuned right down.

Right. It should be possible to just say "use a max parallelism factor of
5", and if somebody submits a hundred AIO calls and they all block, when
it hits #6, it will just do it synchronously.

Basically, what I'm hoping can come out of this (and this is a simplistic
example, but perhaps exactly *because* of that it hopefully also shows
that we can actually make *simple* interfaces for complex asynchronous
things):

struct one_entry *prev = NULL;
struct dirent *de;

while ((de = readdir(dir)) != NULL) {
struct one_entry *entry = malloc(..);

/* Add it to the list, fill in the name */
entry->next = prev;
prev = entry;
strcpy(entry->name, de->d_name);

/* Do the stat lookup async */
async_stat(de->d_name, &entry->stat_buf);
}
wait_for_async();
.. Ta-daa! All done ..

and it *should* allow us to do all the stat lookup asynchronously.

Done right, this should basically be no slower than doing it with a real
stat() if everything was cached. That would kind of be the holy grail
here.

> You get some other funny things from co-routines which are very powerful,
> very dangerous, or plain insane

You forgot "very hard to think about".

We DO NOT want coroutines in general. It's clever, but it's
(a) impossible to do without language support that C doesn't have, or
some really really horrid macro constructs that really only work for
very specific and simple cases.
(b) very non-intuitive unless you've worked with coroutines a lot (and
almost nobody has)

> Linus will now tell me I'm out of my tree...

I don't think you're wrong in theory, I just think that in practice,
within the confines of (a) existing code, (b) existing languages, and (c)
existing developers, we really REALLY don't want to expose coroutines as
such.

But if you wanted to point out that what we want to do is get the
ADVANTAGES of coroutines, without actually having to program them as such,
then yes, I agree 100%. But we shouldn't call them coroutines, because the
whole point is that as far as the user interface is concerned, they don't
look like that. In the kernel, they just look like normal linear
programming.

Linus

Davide Libenzi

unread,

Feb 2, 2007, 4:06:53 PM2/2/07

to Linus Torvalds

On Fri, 2 Feb 2007, Linus Torvalds wrote:

> > You get some other funny things from co-routines which are very powerful,
> > very dangerous, or plain insane
>
> You forgot "very hard to think about".
>
> We DO NOT want coroutines in general. It's clever, but it's
> (a) impossible to do without language support that C doesn't have, or
> some really really horrid macro constructs that really only work for
> very specific and simple cases.
> (b) very non-intuitive unless you've worked with coroutines a lot (and
> almost nobody has)

Actually, coroutines are not too bad to program once you have a
total-coverage async scheduler to run them. The attached (very sketchy)
example uses libpcl ( http://www.xmailserver.org/libpcl.html ) and epoll
as scheduler (but here you can really use anything). You can implement
coroutines in many ways, from C preprocessor macros up to anything, but in
the libpcl case they are simply switched stacks. Like fibrils are supposed
to be. The problem is that in order to make a real-life example of
coroutine-based application work, you need everything that can put you at
sleep (syscalls or any external library call you have no control on)
implemented in an async way. And what I ended up doing is exactly what Zab
did inside the kernel. In my case a dynamic pool of (userspace) threads
servicing any non-native potentially pre-emptive call, and signaling the
result to a pollable fd (pipe in my case) that is integrated in the epoll
(poll/select whatever) scheduler.
I personally find Zab idea a really good one, since it allows for generic
kernel async implementation, w/out the burden of dirtying kernel code
paths with AIO knowledge. Being it fibrils or real kthreads, it is IMO
definitely worth a very close look.

- Davide

cotest.c

Linus Torvalds

unread,

Feb 2, 2007, 4:10:47 PM2/2/07

to Davide Libenzi

On Fri, 2 Feb 2007, Davide Libenzi wrote:
>
> Actually, coroutines are not too bad to program once you have a
> total-coverage async scheduler to run them.

No, no, I don't disagree at all. In fact, I agree emphatically.

It's just that you need the scheduler to run them, in order to not "see"
them as coroutines. Then, you can program everything *as*if* it was just a
regular declarative linear language with multiple threads.

And that gets us the same programming interface as we always have, and
people can forget about the fact that in a very real sense, they are using
coroutines with the scheduler just keeping track of it all for them.

After all, that's what we do between processes *anyway*. You can
technically see the kernel as one big program that uses coroutines and the
scheduler just keeping track of every coroutine instance. It's just that I
doubt that any kernel programmer really thinks in those terms. You *think*
in terms of "threads".

Alan

unread,

Feb 2, 2007, 4:18:46 PM2/2/07

to Linus Torvalds

> They are NOT the generic co-routine that some languages support natively.
> So I think trying to call them coroutines would be even more misleading
> than calling them fibrils.

It's actually pretty damned close to the Honeywell B co-routine package, with
a kernel twist to be honest.

> ends up looking like. There's a good reason why people program mostly in
> linear flow: that's how people think consciously - even if it's obviously
> not how the brain actually works).

The IRQ example below is an example of how it linearizes - so it cuts
both ways like most tools, admittedly one of the blades is at the handle
end in this case ...

> Basically, what I'm hoping can come out of this (and this is a simplistic
> example, but perhaps exactly *because* of that it hopefully also shows
> that we can actually make *simple* interfaces for complex asynchronous
> things):
>
> struct one_entry *prev = NULL;
> struct dirent *de;
>
> while ((de = readdir(dir)) != NULL) {
> struct one_entry *entry = malloc(..);
>
> /* Add it to the list, fill in the name */
> entry->next = prev;
> prev = entry;
> strcpy(entry->name, de->d_name);
>
> /* Do the stat lookup async */
> async_stat(de->d_name, &entry->stat_buf);
> }
> wait_for_async();

The brown and sticky will hit the rotating air impeller pretty hard if you
are not very careful about how that ends up scheduled. Its one thing to
exploit the ability to pull all the easy lookups out in advance, and
another having created all the parallelism to turn it into sane disk
scheduling and wakeups without scaling hit. But you do at least have the
opportunity to exploit it I guess.

> > You get some other funny things from co-routines which are very powerful,
> > very dangerous, or plain insane
>
> You forgot "very hard to think about".

I'm not sure handing a fibril off to another task is that hard to think
about. It's not easy to turn it around as an async_exit() keeping the
other fibrils around because of the mass of rules and behaviours tied to
process exit but its perhaps not impossible.

Other minor evil. If we use fibrils we need to be careful we
know in advance how many fibrils an operation needs so we don't deadlock
on them in critical places like writeout paths when we either hit the per
task limit or we have no page for another stack.

Alan

Linus Torvalds

unread,

Feb 2, 2007, 4:30:55 PM2/2/07

to Alan

On Fri, 2 Feb 2007, Alan wrote:
>

> The brown and sticky will hit the rotating air impeller pretty hard if you
> are not very careful about how that ends up scheduled

Why do you think that?

With cooperative scheduling (like the example Zach posted), there is
absolutely no "brown and sticky" wrt any CPU usage. Which is why
cooperative scheduling is a *good* thing. If you want to blow up your
1024-node CPU cluster, you'd do it with "real threads".

Also, with sane default limits of fibrils per process (say, in the 5-10),
it also ends up being good for IO. No "insane" IO bombs, but an easy way
for users to just get a reasonable amount of IO parallelism without
having to use threading (which is hard).

So, best of both worlds.

Yes, *of*course* you want to have limits on outstanding work. And yes, a
database server would set those limits much higher ("Only a thousand
outstanding IO requests? Can we raise that to ten thousand, please?") than
a regular process ("default: 5, and the super-user can raise it for you if
you're good").

But there really shouldn't be any downsides.

(Of course, there will be downsides. I'm sure there will be. But I don't
see any really serious and obvious ones).

> Other minor evil. If we use fibrils we need to be careful we
> know in advance how many fibrils an operation needs so we don't deadlock
> on them in critical places like writeout paths when we either hit the per
> task limit or we have no page for another stack.

Since we'd only create fibrils on a system call entry level, and system
calls are independent, how would you do that anyway?

Once a fibril has been created, it will *never* depend on any other fibril
resources ever again. At least not in any way that any normal non-fibril
call wouldn't already do as far as I can see.

Linus

Ingo Molnar

unread,

Feb 2, 2007, 5:28:23 PM2/2/07

to Linus Torvalds

* Linus Torvalds <torv...@linux-foundation.org> wrote:

> On Fri, 2 Feb 2007, Ingo Molnar wrote:
> >
> > My only suggestion was to have a couple of transparent kernel threads
> > (not fibrils) attached to a user context that does asynchronous
> > syscalls! Those kernel threads would be 'switched in' if the current
> > user-space thread blocks - so instead of having to 'create' any of them
> > - the fast path would be to /switch/ them to under the current
> > user-space, so that user-space processing can continue under that other
> > thread!
>
> But in that case, you really do end up with "fibrils" anyway.
>
> Because those fibrils are what would be the state for the blocked
> system calls when they aren't scheduled.
>
> We may have a few hundred thousand system calls a second (maybe that's
> not actually reasonable, but it should be what we *aim* for), and 99%
> of them will hopefully hit the cache and never need any separate IO,
> but even if it's just 1%, we're talking about thousands of threads.
>
> I do _not_ think that it's reasonable to have thousands of threads
> state around just "in case". Especially if all those threadlets are
> then involved in signals etc - something that they are totally
> uninterested in.
>
> I think it's a lot more reasonable to have just the kernel stack page
> for "this was where I was when I blocked". IOW, a fibril-like thing.

ok, i think i noticed another misunderstanding. The kernel thread based
scheme i'm suggesting would /not/ 'switch' to another kernel thread in
the cached case, by default. It would just execute in the original
context (as if it were a synchronous syscall), and the switch to a
kernel thread from the pool would only occur /if/ the context is about
to block. (this 'switch' thing would be done by the scheduler)
User-space gets back an -EAIO error code immediately and transparently -
but already running under the new kernel thread.

i.e. in the fully cached case there would be no scheduling at all - in
fact no thread pool is needed at all.

regarding cost:

the biggest memory resource cost of a kernel thread (assuming it has no
real user-space context) /is/ its kernel stack page, which is 4K or 8K.
The task struct takes ~1.5K. Once we have a ready kernel thread around,
it's quite cheap to 'flip' it to under any arbitrary user-space context:
change its thread_info->task pointer to the user-space context's task
struct, copy the mm pointer, the fs pointer to the "worker thread",
switch the thread_info, update ptregs - done. Hm?

Note: such a 'flip' would only occur when the original context blocks,
/not/ on every async syscall.

regarding CPU resource costs, i dont think there should be significant
signal overhead, because the original task is still only one instance,
and the kernel thread that is now running with the blocked kernel stack
is not part of the signal set. (Although it might make sense to make
such async syscalls interruptible, just like any syscall.)

The 'pool' of kernel threads doesnt even have to be per-task, it can be
a natural per-CPU thing - and its size will grow/shrink [with a low
update frequency] depending on how much AIO parallelism there is in the
workload. (But it can also be strictly per-user-context - to make sure
that a proper ->mm ->fs, etc. is set up and that when the async system
calls execute they have all the right context info.)

and note the immediate scheduling benefits: if an app (say like
OpenOffice) is single-threaded but has certain common ops coded as async
syscalls, then if any of those syscalls blocks then it could utilize
/more than one/ CPU. I.e. we could 'spread' a single-threaded app's
processing to multiple cores/hardware-threads /without/ having to
multi-thread the app in an intrusive way. I.e. this would be a
finegrained threading of syscalls, executed as coroutines in essence.
With fibrils all sorts of scheduling limitations occur and no
parallelism is possible.

in fact an app could also /trigger/ the execution of a syscall in a
different context - to create parallelism artificially - without any
blocking event. So we could do:

cookie1 = sys_async(sys_read, params);
cookie2 = sys_async(sys_write, params);

[ ... calculation loop ... ]

wait_on_async_syscall(cookie1);
wait_on_async_syscall(cookie2);

or something like that. Without user-space having to create threads
itself, etc. So basically, we'd make kernel threads more useful, and
we'd make threading safer - by only letting syscalls thread.

> What I like about fibrils is that they should be able to handle the
> cached case well: the case where no "real" scheduling (just the fibril
> stack switches) takes place.

the cached case (when a system call would not block at all) would not
necessitate any switch to another kernel thread at all - the task just
executes its system call as if it were synchronous!

that's the nice thing: we can do this switch-to-another-kernel-thread
magic thing right in the scheduler when we block - and the switched-to
thread will magically return to user-space (with a -EAIO return code) as
if nothing happened (while the original task blocks). I.e. under this
scheme i'm suggesting we have /zero/ setup cost in the cached case. The
optimistic case just falls through and switches to nothing else. Any
switching cost only occurs in the slowpath - and even that cost is very
low.

once a kernel thread that ran off with the original stack finishes the
async syscall and wants to return the return code, this can be gathered
via a special return-code ringbuffer that notifies finished syscalls. (A
magic cookie is associated to every async syscall.)

> So the state machine argument is totally bogus - it results in a
> programming model that simply doesn't match the *normal* setup. You
> want the kernel programming model to appear "linear" even when it
> isn't, because it's too damn hard to think nonlinearly.
>
> Yes, we could do pathname lookup with that kind of insane setup too.
> But it would be HORRID!

yeah, but i guess not nearly as horrid as writing a new OS from scratch
;-)

seriously, i very much think and agree that programming state machines
is hard and not desired in most of the kernel. But it can be done, and
sometimes (definitely not in the common case) it's /cleaner/ than
functional programming. I've programmed an HTTP and an FTP in-kernel
server via a state machine and it worked better than i initially
expected. It needs different thinking but there /are/ people around with
that kind of thinking, so we just cannot exclude the possibility. [ It's
just that such people usually dedicate their brain to mental
fantasies^H^H^Hexercises called 'Higher Mathematics' :-) ]

> [...] The fact that networking (and TCP in particular) has state

> machines is because it is a packetized environment.

rough ballpark figures: for things like webserving or fileserving (or
mailserving), networking sockets are the reason for context-blocking
events in 90% of the cases (mostly due to networking latency). 9% of the
blocking happens due to plain block IO, and 1% happens due to VFS
metadata (inode, directory, etc.) blocking.

( in Tux i had to handle /all/ of these sources of blocking because even
1% kills your performance if you do a hundred thousand requests per
second - but in terms of design weight, networking is pretty damn
important. )

and interestingly, modern IO frameworks tend to gravitate towards a
packetized environment as well. I.e. i dont think state machines are
/that/ unimportant.

Ingo

Alan

unread,

Feb 2, 2007, 5:37:24 PM2/2/07

to Linus Torvalds

> > The brown and sticky will hit the rotating air impeller pretty hard if you
> > are not very careful about how that ends up scheduled
>
> Why do you think that?
>
> With cooperative scheduling (like the example Zach posted), there is
> absolutely no "brown and sticky" wrt any CPU usage. Which is why
> cooperative scheduling is a *good* thing. If you want to blow up your
> 1024-node CPU cluster, you'd do it with "real threads".

You end up with a lot more things running asynchronously. In the current
world we see a series of requests for attributes and hopefully we do
readahead and all is neatly ordered. If fibrils are not ordered the same
way then we could make it worse as we might not pick the right readahead
for example.

> Since we'd only create fibrils on a system call entry level, and system
> calls are independent, how would you do that anyway?

If we stick to that limit it ought to be ok. We've been busy slapping
people who call sys_*, except for internal magic like kernel_thread

Ingo Molnar

unread,

Feb 2, 2007, 5:50:14 PM2/2/07

to Linus Torvalds

* Linus Torvalds <torv...@linux-foundation.org> wrote:

> With cooperative scheduling (like the example Zach posted), there is
> absolutely no "brown and sticky" wrt any CPU usage. Which is why
> cooperative scheduling is a *good* thing. If you want to blow up your
> 1024-node CPU cluster, you'd do it with "real threads".

i'm not worried about the 1024-node cluster case.

i also fully agree that in some cases /not/ going parallel and having a
cooperative relationship between execution contexts can be good.

but if the application /has/ identified fundamental parallelism, we
/must not/ shut that parallelism off by /designing/ this interface to
use the fibril thing which is a limited cooperative, single-CPU entity.
I cannot over-emphasise it how wrong that feels to me. Cooperativeness
isnt bad, but it should be an /optional/ thing, not hardcoded into the
design!

If the application tells us: "gee, you can execute this syscall in
parallel!" (which AIO /is/ about after all), and if we have idle
cores/hardware-threads nearby, it would be the worst thing to not
execute that in parallel if the syscall blocks or if the app asks for
that syscall to be executed in parallel right away, even in the cached
case.

if we were in the 1.2 days i might agree that fibrils are perhaps easier
on the kernel, but today the Linux kernel doesnt even use this
cooperativeness anywhere. We have all the hard work done already. The
full kernel is threaded. We can execute arbitrary number of kernel
contexts off a single user context, we can execute parallel syscalls and
we scale very well doing so.

all that is needed is this new facility and some scheduler hacking to
enable "transparent, kernel-side threading". That enables AIO,
coroutines and more. It brings threading to a whole new level, because
it makes it readily and gradually accessible to single-threaded apps
too.

[ and if we are worried about the 1024 CPU cluster (or about memory use)
then we could limit such threads to only overlap in a limited number,
etc. Just like we'd have to do with fibrils anyway. But with fibrils
we /force/ single-threadedness, which, i'm quite sure, is just about
the worst thing we can do. ]

Ingo

Linus Torvalds

unread,

Feb 2, 2007, 5:50:21 PM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Ingo Molnar wrote:
>
> Note: such a 'flip' would only occur when the original context blocks,
> /not/ on every async syscall.

Right.

So can you take a look at Zach's fibril idea again? Because that's exactly
what it does. It basically sets a flag, saying "flip to this when you
block or yield". Of course, it's a bit bigger than just a flag, since it
needs to describe what to flip to, but that's the basic idea.

Now, if you want to make fibrils *also* then actually use a separate
thread, that's an extension. But you were arguing as if they should use
threads to begin with, and that sounds stupid. Now you seem to retract it,
since you say "only if you need to block".

THAT'S THE POINT. That's what makes fibrils cooperative. The "only if you
block" is really what makes a fibril be something else than a regular
thread.

Linus

Linus Torvalds

unread,

Feb 2, 2007, 6:02:33 PM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Ingo Molnar wrote:
>

> but if the application /has/ identified fundamental parallelism, we
> /must not/ shut that parallelism off by /designing/ this interface to
> use the fibril thing which is a limited cooperative, single-CPU entity.

Right. We should for example encourage people to use some kind of
parallelizing construct.

I know! We could even call them "threads", so to give people the idea that
they are independent smaller entities in a thicker "rope", and we could
call that bigger entity a "task" or "process", since it "processes" data.

Or is that just too far out?

Linus

Linus Torvalds

unread,

Feb 2, 2007, 6:19:36 PM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Linus Torvalds wrote:
> On Fri, 2 Feb 2007, Ingo Molnar wrote:
> >
> > but if the application /has/ identified fundamental parallelism, we
> > /must not/ shut that parallelism off by /designing/ this interface to
> > use the fibril thing which is a limited cooperative, single-CPU entity.
>
> Right. We should for example encourage people to use some kind of
> parallelizing construct.
>
> I know! We could even call them "threads", so to give people the idea that
> they are independent smaller entities in a thicker "rope", and we could
> call that bigger entity a "task" or "process", since it "processes" data.
>
> Or is that just too far out?

So the above was obviously tongue-in-cheek, but you should really think
about the context here.

We're discussing doing *single* system calls. There is absolutely zero
point to try to parallelize the work over multiple CPU's or threads. We're
literally talking about doing things where the actual CPU cost is in the
hundreds of nanoseconds, and where traditionally a rather noticeable part
of the cost is not the code itself, but the high cost of taking a system
call trap, and saving all the register state.

When parallelising "real work", I absolutely agree with you: we should use
threads. But you need to look at what it is we parallelize here, and ask
yourself why we're doing what we're doing, and why people aren't *already*
just using a separate thread for it.

Davide Libenzi

unread,

Feb 2, 2007, 6:37:42 PM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Ingo Molnar wrote:

> in fact an app could also /trigger/ the execution of a syscall in a
> different context - to create parallelism artificially - without any
> blocking event. So we could do:
>
> cookie1 = sys_async(sys_read, params);
> cookie2 = sys_async(sys_write, params);
>
> [ ... calculation loop ... ]
>
> wait_on_async_syscall(cookie1);
> wait_on_async_syscall(cookie2);
>
> or something like that. Without user-space having to create threads
> itself, etc. So basically, we'd make kernel threads more useful, and
> we'd make threading safer - by only letting syscalls thread.

Since I still think that the many-thousands potential async operations
coming from network sockets are better handled with a classical event
mechanism [1], and since smooth integration of new async syscall into the
standard POSIX infrastructure is IMO a huge win, I think we need to have a
"bridge" to allow async completions being detectable through a pollable
(by the mean of select/poll/epoll whatever) device.
In that way you can handle async operations with the best mechanism that
is fit for them, and gather them in a single async scheduler.

[1] Unless you really want to have thousands of kthreads/fibrils lingering
on the system.

- Davide

Alan

unread,

Feb 2, 2007, 6:53:03 PM2/2/07

to Linus Torvalds

> When parallelising "real work", I absolutely agree with you: we should use
> threads. But you need to look at what it is we parallelize here, and ask
> yourself why we're doing what we're doing, and why people aren't *already*
> just using a separate thread for it.

Because its a pain in the arse and because its very hard to self tune. If
you've got async_anything then the thread/fibril/synchronous/whatever
decision can be made kernel side based upon expected cost and other
tradeoffs, even if its as dumb as per syscall or per syscall/filp type
guessing.

Alan

Davide Libenzi

unread,

Feb 2, 2007, 7:02:40 PM2/2/07

to Ingo Molnar

On Fri, 2 Feb 2007, Davide Libenzi wrote:

> On Fri, 2 Feb 2007, Ingo Molnar wrote:
>
> > in fact an app could also /trigger/ the execution of a syscall in a
> > different context - to create parallelism artificially - without any
> > blocking event. So we could do:
> >
> > cookie1 = sys_async(sys_read, params);
> > cookie2 = sys_async(sys_write, params);
> >
> > [ ... calculation loop ... ]
> >
> > wait_on_async_syscall(cookie1);
> > wait_on_async_syscall(cookie2);
> >
> > or something like that. Without user-space having to create threads
> > itself, etc. So basically, we'd make kernel threads more useful, and
> > we'd make threading safer - by only letting syscalls thread.
>
> Since I still think that the many-thousands potential async operations
> coming from network sockets are better handled with a classical event
> mechanism [1], and since smooth integration of new async syscall into the
> standard POSIX infrastructure is IMO a huge win, I think we need to have a
> "bridge" to allow async completions being detectable through a pollable
> (by the mean of select/poll/epoll whatever) device.
> In that way you can handle async operations with the best mechanism that
> is fit for them, and gather them in a single async scheduler.

To clarify further, below are the API and the use case of my userspace
implementation. The guasi_fd() gives you back a pollable (POLLIN) fd to be
integrated in your preferred event retrieval interface. Once the fd is
signaled, you can fetch your completed requests using guasi_fetch() and
schedule work based on that.
The GUASI implementation uses pthreads, but it is clear that an in-kernel
async syscall implementation can take wiser decisions, and optimize the
heck out of it (locks, queues, ...).

- Davide

/*
* Example of async pread using GUASI
*/
/*
 * Call wrapper with the guasi_syscall_t signature: unpacks the four
 * long-encoded arguments in params[0..3] (fd, buffer, size, offset),
 * issues the pread() and hands its return value back as a long.
 * priv is not used by this wrapper.
 */
static long guasi_wrap__pread(void *priv, long const *params) {
	int fd = (int) params[0];
	void *buf = (void *) params[1];
	size_t count = (size_t) params[2];
	off_t offset = (off_t) params[3];

	return (long) pread(fd, buf, count, offset);
}

/*
 * Submit a pread() as an async request on hctx.
 *
 * hctx, priv, asid and prio are forwarded unchanged to guasi_submit();
 * fd, buf, size and off are each cast to long and passed as the call's
 * parameters, to be unpacked again by guasi_wrap__pread().
 * Returns whatever guasi_submit() returns.
 */
guasi_req_t guasi__pread(guasi_t hctx, void *priv, void *asid, int prio,
			 int fd, void *buf, size_t size, off_t off) {
	/* Number of long parameters handed to guasi_wrap__pread. */
	enum { PREAD_NPARAMS = 4 };
	guasi_req_t req;

	req = guasi_submit(hctx, priv, asid, prio, guasi_wrap__pread,
			   PREAD_NPARAMS,
			   (long) fd, (long) buf, (long) size, (long) off);
	return req;
}

---
/*
* guasi by Davide Libenzi (generic userspace async syscall implementation)
* Copyright (C) 2003 Davide Libenzi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Davide Libenzi <dav...@xmailserver.org>
*
*/

/* Public declarations for GUASI; guarded against multiple inclusion. */
#if !defined(_GUASI_H)
#define _GUASI_H

/* Presumably the upper bound on "nparams" for guasi_submit -- TODO confirm
 * against the implementation, which is not shown here. */
#define GUASI_MAX_PARAMS 16

/* Values reported in guasi_reqinfo.status. */
#define GUASI_STATUS_PENDING 1
#define GUASI_STATUS_ACTIVE 2
#define GUASI_STATUS_COMPLETE 3

/* Signature of the "proc" call wrapper given to guasi_submit: it receives a
 * void pointer and an array of long parameters, and returns a long
 * (see guasi_wrap__pread in the example for a matching wrapper). */
typedef long (*guasi_syscall_t)(void *, long const *);

/* Handle types for a GUASI context and for a single submitted request.
 * NOTE(review): a struct body with no members is a GNU extension rather than
 * ISO C; confirm this header is only built with compilers that accept it. */
typedef struct s_guasi { } *guasi_t;
typedef struct s_guasi_req { } *guasi_req_t;

/* Snapshot of a request, filled in through guasi_req_info(). */
struct guasi_reqinfo {
void *priv; /* Call private data. Passed to guasi_submit */
void *asid; /* Async request ID. Passed to guasi_submit */
long result; /* Return code of "proc" passed to guasi_submit */
long error; /* errno */
int status; /* GUASI_STATUS_* */
};

/* Create a context. Presumably min_threads/max_threads bound the worker
 * thread pool and max_priority bounds "prio" -- TODO confirm. */
guasi_t guasi_create(int min_threads, int max_threads, int max_priority);
/* Release a context obtained from guasi_create. */
void guasi_free(guasi_t hctx);
/* Per the accompanying description: returns a pollable (POLLIN) fd that is
 * signaled when requests have completed. */
int guasi_fd(guasi_t hctx);
/* Queue "proc" for asynchronous execution with "nparams" trailing arguments;
 * the example passes each of them as a long. priv and asid are the values
 * later reported in struct guasi_reqinfo. */
guasi_req_t guasi_submit(guasi_t hctx, void *priv, void *asid, int prio,
guasi_syscall_t proc, int nparams, ...);
/* Per the accompanying description: fetch completed requests into reqs
 * (capacity nreqs). Return value presumably the number stored -- TODO confirm. */
int guasi_fetch(guasi_t hctx, guasi_req_t *reqs, int nreqs);
/* Fill *rinf with the details of request hreq. */
int guasi_req_info(guasi_req_t hreq, struct guasi_reqinfo *rinf);
/* Release a request handle. */
void guasi_req_free(guasi_req_t hreq);

#endif /* _GUASI_H */

Ingo Molnar

unread,

Feb 2, 2007, 7:04:01 PM2/2/07

to Linus Torvalds

* Linus Torvalds <torv...@linux-foundation.org> wrote:

> THAT'S THE POINT. That's what makes fibrils cooperative. The "only if
> you block" is really what makes a fibril be something else than a
> regular thread.

Well, in my picture, 'only if you block' is a pure thread utilization
decision: bounce a piece of work to another thread if this thread cannot
complete it. (if the kernel is lucky enough that the user context told
it "it's fine to do that".)

it is 'incidental parallelism' instead of 'intentional parallelism', but
the random and unpredictable nature of it doesnt change anything about
the fundamental fact: we start a new thread of execution in essence.

Typically it will be rare in a workload as it will be driven by
cachemisses, but for example in DB workloads the 'cachemiss' will be the
/common case/ - because the DB manages the cache itself.

And how to run a thread of execution is a fundamental /scheduling/
decision: it is the acceptance of and the adaptation to the cost of work
migration - if no forced wait happens then often it's cheaper to execute
all work locally and serially.

[ in fact, such a mechanism doesnt even always have to be driven from
the scheduler itself: such a 'bounce current work to another thread'
event could occur when we detect that a pagecache page is missing and
that we have to do a ->readpage, etc. Tux does that since 1999: the
cutoff for 'bounce work' was when a soft cache (the pagecache or the
dentry cache) was missed - not when we went into the IO path. This has
the advantage that the Tux cachemiss threads could do /all/ the IO
preparation and IO completion on the same CPU and in one go - while
the user context was able to continue executing. ]

But this is also a function of hardware: for example on a Transputer i'd
bounce off all such work immediately (even if it's a sys_time()
syscall), all the time, even if fully cached, no exceptions, because the
hardware is such that another CPU can pick it up in the next cycle.

while we definitely dont want to bounce short-lived cached syscalls to
another thread, for longer ones or ones which we /expect/ to block we
might want to do it like that straight away. [Especially on a multi-core
CPU that has a shared L2 cache (and doubly so on a HT/SMT CPU that has a
shared L1 cache).]

i dont see anything here that mandates (or even strongly supports) the
notion of cooperative scheduling. The moment a context sees a 'cache
miss', it is totally fair to potentially distribute it to other CPUs. It
wont run for a long time and it will be totally cache-cold when the 'IO
done' event occurs - hence we should schedule it where the IO event
occurred. Which might easily be the same CPU where the user context is
running right now (we prefer last-run CPUs on wakeups), but not
necessarily - it's a general scheduling decision.

> > Note: such a 'flip' would only occur when the original context
> > blocks, /not/ on every async syscall.
>
> Right.
>
> So can you take a look at Zach's fibril idea again? Because that's
> exactly what it does. It basically sets a flag, saying "flip to this
> when you block or yield". Of course, it's a bit bigger than just a
> flag, since it needs to describe what to flip to, but that's the basic
> idea.

i know Zach's code ... i really do. Even if i didnt look at the code
(which i did), Jonathan Corbet did a very nice writeup about fibrils on
LWN.net two days ago, which i've read as well:

http://lwn.net/Articles/219954/

So there's no misunderstanding on my side i think.

> Now, if you want to make fibrils *also* then actually use a separate
> thread, that's an extension.

oh please, Linus. I /did/ suggest this as an extension to Zach's idea!
Look at the Subject line - i'm reacting to the specific fibril code of
Zach. I wrote this:

| as per my other email, i dont really like this concept. This is the
| killer:
|
| > [...] There can be multiple of them in the process of executing for
| > a given task_struct, but only one can every be actively running at a
| > time. [...]

|
| there's almost no scheduling cost from being able to arbitrarily
| schedule a kernel thread - but there are /huge/ benefits in it.
|

| would it be hard to redo your AIO patches based on a pool of plain
| simple kernel threads?

see http://lkml.org/lkml/2007/2/1/40.

Ingo

bert hubert

unread,

Feb 2, 2007, 7:24:54 PM2/2/07

to Linus Torvalds

On Fri, Feb 02, 2007 at 03:17:57PM -0800, Linus Torvalds wrote:

> threads. But you need to look at what it is we parallelize here, and ask
> yourself why we're doing what we're doing, and why people aren't *already*
> just using a separate thread for it.

Partially this is for the bad reason that creating "i/o threads" (or even
processes) has a bad stigma to it, and additionally has always felt crummy.

On the first reason, the 'pain' of creating threads is actually rather
minor, so this feeling may have been wrong. The main thing is that you don't
wantonly create a thousand i/o threads, whereas you conceivably might want
to have a thousand outstanding i/o requests. At least I know I want to have
that ability.

Secondly, the actual mechanics of i/o processes isn't trivial, and feels
wasteful with lots of additional copying, or in the case of threads,
queueing and posting.

Bert

--
http://www.PowerDNS.com Open source, database driven DNS Software
http://netherlabs.nl Open and Closed source services

Linus Torvalds

unread,

Feb 2, 2007, 7:57:20 PM2/2/07

to Ingo Molnar

On Sat, 3 Feb 2007, Ingo Molnar wrote:
>
> Well, in my picture, 'only if you block' is a pure thread utilization
> decision: bounce a piece of work to another thread if this thread cannot
> complete it. (if the kernel is lucky enough that the user context told
> it "it's fine to do that".)

Sure, you can do it that way too. But at that point, your argument that we
shouldn't do it with fibrils is wrong: you'd still need basically the
exact same setup that Zach does in his fibril stuff, and the exact same
hook in the scheduler, testing the exact same value ("do we have a pending
queue of work").

So at that point, you really are arguing about a rather small detail in
the implementation, I think.

Which is fair enough.

But I actually think the *bigger* argument and problems are elsewhere,
namely in the interface details. Notably, I think the *real* issues end up
how we handle synchronization, and how we handle signalling. Those are in
many ways (I think) more important than whether we actually can schedule
these trivial things on multiple CPU's concurrently or not.

For example, I think serialization is potentially a much more expensive
issue. Could we, for example, allow users to serialize with these things
*without* having to go through the expense of doing a system call? Again,
I'm thinking of the case of no IO happening, in which case there also
won't be any actual threading taking place, in which case it's a total
waste of time to do a system call at all.

And trying to do that actually has implications for the interfaces (like
possibly returning a zero cookie for the async() system call if it was
doable totally synchronously?)

Signal handling is similar: I actually think that a "async()" system call
should be interruptible within the context of the caller, since we would
want to *try* to execute it synchronously. That automatically means that
we have semantic meaning for fibrils and signal handling.

Finally, can we actually get POSIX aio semantics with this? Can we
implement the current aio_xyzzy() system calls using this same feature?
And most importantly - does it perform well enough that we really can do
that?

THOSE are to me bigger questions than what happens inside the kernel, and
whether we actually end up using another thread if we end up doing it
non-synchronously.

Linus

Suparna Bhattacharya

unread,

Feb 3, 2007, 2:10:58 AM2/3/07

to Linus Torvalds

This would be useful - the application wouldn't have to set up state
to remember for handling completions for operations that complete synchronously.
I know Samba folks would like that.

The laio_syscall implementation (Lazy asynchronous IO) seems to have
experimented with such an interface
http://www.usenix.org/events/usenix04/tech/general/elmeleegy.html

Regards
Suparna

>
> Signal handling is similar: I actually think that a "async()" system call
> should be interruptible within the context of the caller, since we would
> want to *try* to execute it synchronously. That automatically means that
> we have semantic meaning for fibrils and signal handling.
>
> Finally, can we actually get POSIX aio semantics with this? Can we
> implement the current aio_xyzzy() system calls using this same feature?
> And most importantly - does it perform well enough that we really can do
> that?
>
> THOSE are to me bigger questions than what happens inside the kernel, and
> whether we actually end up using another thread if we end up doing it
> non-synchronously.
>
> Linus
>

> --
> To unsubscribe, send a message with 'unsubscribe linux-aio' in
> the body to majo...@kvack.org. For more info on Linux AIO,
> see: http://www.kvack.org/aio/
> Don't email: <a href=mailto:"aa...@kvack.org">aa...@kvack.org</a>

--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Lab, India

Ingo Molnar

unread,

Feb 3, 2007, 3:38:29 AM2/3/07

to Linus Torvalds

* Linus Torvalds <torv...@linux-foundation.org> wrote:

> On Sat, 3 Feb 2007, Ingo Molnar wrote:
> >
> > Well, in my picture, 'only if you block' is a pure thread
> > utilization decision: bounce a piece of work to another thread if
> > this thread cannot complete it. (if the kernel is lucky enough that
> > the user context told it "it's fine to do that".)
>
> Sure, you can do it that way too. But at that point, your argument
> that we shouldn't do it with fibrils is wrong: you'd still need
> basically the exact same setup that Zach does in his fibril stuff, and
> the exact same hook in the scheduler, testing the exact same value
> ("do we have a pending queue of work").

did i ever lose a single word of complaint about those bits? Those are
not an issue to me. They can be applied to kernel threads just as much.

As i babbled in the very first email about this topic:

| 1) improve our basic #1 design gradually. If something is a
| bottleneck, if the scheduler has grown too fat, cut some slack. If
| micro-threads or fibrils offer anything nice for our basic thread
| model: integrate it into the kernel.

i should have said explicitly that to flip user-space from one kernel
thread to another one (upon blocking or per request) is a nice thing and
we should integrate that into the kernel's thread model.

But really, being a scheduler guy i was much more concerned about the
duplication and problems caused by the fibril concept itself - which
duplication and complexity makes up 80% of Zach's submitted patchset.
For example this bit:

[PATCH 3 of 4] Teach paths to wake a specific void * target

would totally go away if we used kernel threads for this. In the fibril
approach this is where the mess starts. Either a 'normal' wakeup has to
wake up all fibrils, or we have to make damn sure that a wakeup that in
reality goes to a fibril is never woken via wake_up/wake_up_process.

( Furthermore, i tried to include user-space micro-threads in the
argument as well, which Evgeniy Polyakov raised not so long ago related
to the kevent patchset. All these micro-thread things are of a similar
genre. )

i totally agree that the API /should/ be the main focus - but i didnt
pick the topic and most of the patchset's current size is due to the IMO
avoidable fibril concept.

regarding the API, i dont really agree with the current form and design
of Zach's interface.

fundamentally, the basic entity of this thing should be a /system call/,
not the artificial fibril thing:

+struct asys_call {
+ struct asys_result *result;
+ struct fibril fibril;
+};

i.e. the basic entity should be something that represents a system call,
with its up to 6 arguments, the later return code, state, flags and two
list entries:

/*
 * One asynchronous system call: up to 6 arguments, the later return code,
 * state, flags and two list entries.
 */
struct async_syscall {
unsigned long nr; /* NOTE(review): presumably the system call number - confirm */
unsigned long args[6]; /* up to 6 system call arguments */
long err; /* return code, available once the call has run */
unsigned long state;
unsigned long flags; /* e.g. marks the call as 'autocomplete' */
struct list_head list; /* NOTE(review): presumably the submitted/pending/completed list entry */
struct list_head wait_list; /* for waiting on a specific list of syscalls */
unsigned long __pad[2]; /* brings the entry to 64 bytes (32-bit) / 128 bytes (64-bit) */
};

(64 bytes on 32-bit, 128 bytes on 64-bit)

furthermore, i think this API should be fundamentally vectored and
fundamentally async, and hence could solve another issue as well:
submitting many little pieces of work of different IO domains in one go.

[ detail: there should be no traditional signals used at all (Zach's
stuff doesnt use them, and correctly so), only if the async syscall
that is performed generates a signal. ]

The normal and most optimal workflow should be a user-space ring-buffer
of these constant-size struct async_syscall entries:

struct async_syscall ringbuffer[1024];

LIST_HEAD(submitted);
LIST_HEAD(pending);
LIST_HEAD(completed);

the 3 list heads are both known to the kernel and to user-space, and are
actively managed by both. The kernel drives the execution of the async
system calls based on the 'submitted' list head (until it empties it)
and moves them over to the 'pending' list. User-space can complete async
syscalls based on the 'completed' list. (but a syscall can optionally be
marked as 'autocomplete' as well via the 'flags' field, in that case
it's not moved to the 'completed' list but simply removed from the
'pending' list. This can be useful for system calls that have some
implicit notification effect.)

( Note: optionally, a helper kernel-thread, when it finishes processing
a syscall, could also asynchronously check the 'submitted' list and
pick up new work. That would allow the submission of new syscalls
without any entry into the kernel. So for example on an SMT system,
this could in essence result in one CPU running in pure user-space
submitting async syscalls via the ringbuffer, while another CPU would
in essence be running pure kernel-space, executing those entries. )

another crucial bit is the waiting on pending work. But because every
pending syscall entity is either already completed or has a real kernel
thread associated with it, that bit is mostly trivial: user-space can
wait on 'any' pending syscall to complete, or it could wait for a
specific list of syscalls to complete (using the ->wait_list). It could
also wait on 'a minimum number of N syscalls to complete' - to create
batching of execution. And of course it can periodically check the
'completed' list head if it has a constant and highly parallel flow of
workload - that way the 'waiting' does not actually have to happen most
of the time.

Looks like we can hit many birds with this single stone: AIO, vectored
syscalls, finegrained system-call parallelism. Hm?

Ingo

Matt Mackall

unread,

Feb 3, 2007, 4:39:37 AM2/3/07

to Ingo Molnar

On Sat, Feb 03, 2007 at 09:23:08AM +0100, Ingo Molnar wrote:
> The normal and most optimal workflow should be a user-space ring-buffer
> of these constant-size struct async_syscall entries:
>
> struct async_syscall ringbuffer[1024];
>
> LIST_HEAD(submitted);
> LIST_HEAD(pending);
> LIST_HEAD(completed);

It's wrong to call this a ring buffer as things won't be completed in
any particular order. So you'll need a fourth list head for which
buffer elements are free. At which point, you might as well leave it
entirely up to the application to manage the allocation of
async_syscall structs. It may know it only needs two, or ten thousand,
or five per client...

--
Mathematics is the supreme nostalgia of our time.

Ingo Molnar

unread,

Feb 3, 2007, 5:19:50 AM2/3/07

to Matt Mackall

* Matt Mackall <m...@selenic.com> wrote:

> On Sat, Feb 03, 2007 at 09:23:08AM +0100, Ingo Molnar wrote:
> > The normal and most optimal workflow should be a user-space ring-buffer
> > of these constant-size struct async_syscall entries:
> >
> > struct async_syscall ringbuffer[1024];
> >
> > LIST_HEAD(submitted);
> > LIST_HEAD(pending);
> > LIST_HEAD(completed);
>
> It's wrong to call this a ring buffer as things won't be completed in

> any particular order. [...]

yeah, i realized this when i sent the mail. I wanted to say 'array of
elements' - and it's clear from these list heads that it's fully out of
order. (it should be an array so that the pages of those entries can be
pinned and that completion can be manipulated from any context,
anytime.)

(the queueing i described closely resembles Tux's "Tux syscall request"
handling scheme.)

> [...] So you'll need a fourth list head for which buffer elements are

> free. At which point, you might as well leave it entirely up to the
> application to manage the allocation of async_syscall structs. It may
> know it only needs two, or ten thousand, or five per client...

sure - it should be variable but still the array should be compact, and
should be registered with the kernel. That way security checks can be
done once, the pages can be pinned, accessed anytime, etc.

Ingo

li...@horizon.com

unread,

Feb 3, 2007, 9:12:32 AM2/3/07

to linux-...@vger.kernel.org

First of all, may I say, this is a wonderful piece of work.
It absolutely reeks of The Right Thing. Well done!

However, while I need to study it in a lot more detail, I think Ingo's
implementation ideas make a lot more immediate sense. It's the same
idea that I thought up.

Let me make it concrete. When you start an async system call:
- Preallocate a second kernel stack, but don't do anything
with it. There should probably be a per-CPU pool of
preallocated threads to avoid too much allocation and
deallocation.
- Also at this point, do any resource limiting.
- Set the (normally NULL) "thread blocked" hook pointer to
point to a handler, as explained below.
- Start down the regular system call path.
- In the fast-path case, the system call completes without blocking and
we set up the completion structure and return to user space.
We may want to return a special value to user space to tell it that
there's no need to call asys_await_completion. I think of it as the
Amiga's IOF_QUICK.
- Also, when returning, check and clear the thread-blocked hook.

Note that we use one (cache-hot) stack for everything and do as little
setup as possible on the fast path.

However, if something blocks, it hits the slow path:
- If something would block the thread, the scheduler invokes the
thread-blocked hook before scheduling a new thread.
- The hook copies the necessary state to a new (preallocated) kernel
stack, which takes over the original caller's identity, so it can return
immediately to user space with an "operation in progress" indicator.
- The scheduler hook is also cleared.
- The original thread is blocked.
- The new thread returns to user space and execution continues.

- The original thread completes the system call. It may block again,
but as its block hook is now clear, no more scheduler magic happens.

- When the operation completes and returns to sys_sys_submit(), it
notices that its scheduler hook is no longer set. Thus, this is a
kernel-only worker thread, and it fills in the completion structure,
places itself back in the available pool, and commits suicide.

Now, there is no chance that we will ever implement kernel state machines
for every little ioctl. However, there may be some "async fast paths"
state machines that we can use. If we're in a situation where we can
complete the operation without a kernel thread at all, then we can
detect the "would block" case (probably in-line, but you could
use a different scheduler hook function) and set up the state machine
structure. Then return "operation in progress" and let the I/O
complete in its own good time.

Note that you don't need to implement all of a system call as an explicit
state machine; only its completion. So, for example, you could do
indirect block lookups via an implicit (stack-based) state machine,
but the final I/O via an explicit one. And you could do this only for
normal block devices and not NFS. You only need to convert the hot
paths to the explicit state machine form; the bulk of the kernel code
can use separate kernel threads to do async system calls.

I'm also in the "why do we need fibrils?" camp. I'm studying the code,
and looking for a reason, but using the existing thread abstraction
seems best. If you encountered some fundamental reason why kernel threads
were Really Really Hard, then maybe it's worth it, but it's a new entity,
and entia non sunt multiplicanda praeter necessitatem.

One thing you can do for real-time tasks is, in addition to the
non-blocking flag (return EAGAIN from asys_submit rather than blocking),
you could have an "atomic" flag that would avoid blocking to preallocate
the additional kernel thread! Then you'd really be guaranteed no
long delays, ever.

Davide Libenzi

unread,

Feb 4, 2007, 12:13:30 AM2/4/07

to Zach Brown

On Tue, 30 Jan 2007, Zach Brown wrote:

> + /*
> + * XXX The idea is to copy all but the actual call stack. Obviously
> + * this is wildly arch-specific and belongs abstracted out.
> + */
> + *next->ti = *ti;
> + *thread_info_pt_regs(next->ti) = *thread_info_pt_regs(ti);

arch copy_thread_info()?

> + current->per_call = next->per_call;

Pointer instead of structure copy? percall_clone()/percall_free()?

> + /* always switch to a runnable fibril if we aren't being preempted */
> + if (unlikely(!(preempt_count() & PREEMPT_ACTIVE) &&
> + !list_empty(&prev->runnable_fibrils))) {
> + schedule_fibril(prev);
> + /*
> + * finish_fibril_switch() drops the rq lock and enables
> + * premption, but the popfl disables interrupts again. Watch
> + * me learn how context switch locking works before your very
> + * eyes! XXX This will need to be fixed up by throwing
> + * together something like the prepare_lock_switch() path the
> + * scheduler does. Guidance appreciated!
> + */
> + local_irq_enable();
> + return;
> + }

Yes, please (prepare/finish) ... ;)

- Davide

Zach Brown

unread,

Feb 5, 2007, 11:45:44 AM2/5/07

to Alan

> Other questions really relate to the scheduling - Zach do you intend
> schedule_fibrils() to be a call code would make or just from
> schedule() ?

I'd much rather keep the current sleeping API in as much as is
possible. So, yeah, if we can get schedule() to notice and behave
accordingly I'd prefer that. In the current code it's keyed off
finding a stack allocation hanging off of current->. If the caller
didn't care about guaranteeing non-blocking submission then we
wouldn't need that.. we could use a thread_info flag bit, or
something. Avoiding that allocation in the cached case would be nice.

> Alan (who used to use Co-routines in real languages on 36bit
> computers with 9bit bytes before learning C)

Yes, don't despair, I'm not co-routine ignorant. In fact, I'm almost
positive it was you who introduced them to me at some point in the
previous millennium ;).

- z

Zach Brown

unread,

Feb 5, 2007, 12:03:17 PM2/5/07

to Ingo Molnar

> ok, i think i noticed another misunderstanding. The kernel thread
> based
> scheme i'm suggesting would /not/ 'switch' to another kernel thread in
> the cached case, by default. It would just execute in the original
> context (as if it were a synchronous syscall), and the switch to a
> kernel thread from the pool would only occur /if/ the context is about
> to block. (this 'switch' thing would be done by the scheduler)

Yeah, this is what I imagined when you described doing this with
threads instead of these 'fibril' things.

It sounds like you're suggesting that we keep the 1:1 relationship
between task_struct and thread_info. That would avoid the risks that
the current fibril approach brings. It insists that all of
task_struct is shared between concurrent fibrils (even if only
between blocking points). As I understand what Ingo is suggesting,
we'd instead only explicitly share the fields that we migrate (copy
or get a reference) as we move the stack from the submitting
task_struct to a waiting_task struct as the submission blocks.

We trade initial effort to make things safe in the presence of
universal sharing for effort to introduce sharing as people notice
deficient behaviour. If that's the way we prefer to go, I'm cool
with that. I might have gone slightly nuts in preferring *identical*
sync and async behaviour.

The fast path would look almost identical to the existing fibril
switch. We'd just have a few more fields to sync up between the two
task_structs.

Ingo, am I getting this right? This sounds pretty straight forward
to prototype from the current patches. I can certainly give it a try.

> it's quite cheap to 'flip' it to under any arbitrary user-space
> context:
> change its thread_info->task pointer to the user-space context's task
> struct, copy the mm pointer, the fs pointer to the "worker thread",
> switch the thread_info, update ptregs - done. Hm?

Or maybe you're talking about having concurrent executing
thread_info's pointing to the user-space submitting task_struct?
That really does sound like the current fibril approach, with even
more sharing of thread_info's that might be executing on other cpus?

Either way, I want to give it a try. If we can measure it performing
reasonably in the cached case then I think everyone's happy?

> is not part of the signal set. (Although it might make sense to make
> such async syscalls interruptible, just like any syscall.)

I think we all agree that they have to be interruptible by now,
right? If for no other reason than to interrupt pending poll with no
timeout, say, as the task exits..

> The 'pool' of kernel threads doesnt even have to be per-task, it
> can be
> a natural per-CPU thing

Yeah, absolutely.

- z

Zach Brown

unread,

Feb 5, 2007, 12:13:58 PM2/5/07

to Davide Libenzi

> Since I still think that the many-thousands potential async operations
> coming from network sockets are better handled with a classical event
> machanism [1], and since smooth integration of new async syscall
> into the
> standard POSIX infrastructure is IMO a huge win, I think we need to
> have a
> "bridge" to allow async completions being detectable through a
> pollable
> (by the mean of select/poll/epoll whatever) device.

Ugh, I'd rather not if we don't have to.

It seems like you could get this behaviour from issuing a poll/select
(really?)/epoll as one of the async calls to complete. (And you
mention this in a later email? :))

Part of my thinking on this is that we might want it to be really
trivial to create and wait on groups of ops.. maybe as a context.
One of the things posix AIO wants is the notion of submitting and
waiting on a group of ops as a "list". That sounds like we might be
able to implement it by issuing ops against a context, created as
part of the submission, and then waiting for it to drain.

Being able to wait on that with file->poll() obviously requires
juggling file-> associations which sounds like more weight than we
might want. Or it'd be optional and we'd get more moving parts and
divergent paths to test.

So, sure, it's possible and not terribly difficult, but I'd rather
avoid it if people can be convinced to get the same behaviour by
issuing an async instance of their favourite readiness syscall.

- z

Zach Brown

unread,

Feb 5, 2007, 12:45:15 PM2/5/07

to Ingo Molnar

> But really, being a scheduler guy i was much more concerned about the
> duplication and problems caused by the fibril concept itself - which
> duplication and complexity makes up 80% of Zach's submitted patchset.
> For example this bit:
>
> [PATCH 3 of 4] Teach paths to wake a specific void * target
>
> would totally go away if we used kernel threads for this.

Uh, would it? Are you talking about handing off the *task_struct*
that it was submitted under to each worker thread that inherits the
stack?

I guess I hadn't considered going that far. I had somehow
constructed a block in my mind that we couldn't release the
task_struct from the submitting task. But maybe we can be clever
enough with the task_struct updating that userspace wouldn't notice a
significant change.

Hmm.

> i totally agree that the API /should/ be the main focus - but i didnt
> pick the topic and most of the patchset's current size is due to
> the IMO
> avoidable fibril concept.

I, too, totally agree. I didn't even approach the subject for
exactly the reason you allude to -- I wanted to get the hard parts of
the kernel side right first.

> regarding the API, i dont really agree with the current form and
> design
> of Zach's interface.

Haha, well, yes, of course. You couldn't have thought that the dirt-
stupid sys_asys_wait_for_completion() was anything more than simple
scaffolding to test the kernel bits.

> fundamentally, the basic entity of this thing should be a /system
> call/,
> not the artificial fibril thing:
>
> +struct asys_call {
> + struct asys_result *result;
> + struct fibril fibril;
> +};

You picked a weird struct to highlight here. struct asys_input seems
more related to the stuff you go on to discuss below. This asys_call
struct is a relatively irrelevant internal detail of how
asys_teardown_stack() gets from a fibril to the pre-allocated
completion state once the call has returned.

> The normal and most optimal workflow should be a user-space ring-
> buffer
> of these constant-size struct async_syscall entries:
>
> struct async_syscall ringbuffer[1024];
>
> LIST_HEAD(submitted);
> LIST_HEAD(pending);
> LIST_HEAD(completed);

I strongly disagree here, and I'm hoping you're not as keen on this
now -- your reply to Matt gives me hope.

As mentioned, that they complete out-of-order leads, at least, to
having separate submission and completion rings. I'm not sure a
submission ring makes any sense given the goal of processing the
calls in submission and only creating threads if it blocks. A simple
copy of an array of these input structs sounds fine to me.

When I think about the completion side I tend to hope we can end up
with something like what VJ talked about in his net channels work.
producer/consumer rings with head and tail pointers in different
cache lines. AFAIK the kevent work has headed in that direction, but
I haven't kept up. Uli has certainly mentioned it in his 'ec' (event
channels) proposals.

The posix AIO list completion and, sadly, signals on completion need
to be considered, too.

Honestly, though, I'm not worried about this part. We'll easily come
to an agreement. I'm just not going to distract myself with it until
we're happy with the scheduler side.

> Looks like we can hit many birds with this single stone: AIO, vectored
> syscalls, finegrained system-call parallelism. Hm?

Hmm, indeed. Some flags could let userspace tell the kernel not to
bother with all this threading/concurrency/aio nonsense and just
issue them serially. It'll sound nuts in these days of cheap
syscalls and vsyscall helpers, but some Oracle folks might love this
for issuing a gettimeofday() pair around syscalls they want to profile.

I hadn't considered that as a potential property of this interface.

- z

Zach Brown

unread,

Feb 5, 2007, 12:55:18 PM2/5/07

to Davide Libenzi

>
>
>> + current->per_call = next->per_call;
>
> Pointer instead of structure copy?

Sure, there are lots of trade-offs there, but the story changes if we
keep the 1:1 relationship between task_struct and thread_info.

- z

Davide Libenzi

unread,

Feb 5, 2007, 1:37:07 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:

> > Since I still think that the many-thousands potential async operations
> > coming from network sockets are better handled with a classical event
> > mechanism [1], and since smooth integration of new async syscall into the
> > standard POSIX infrastructure is IMO a huge win, I think we need to have a
> > "bridge" to allow async completions being detectable through a pollable
> > (by the mean of select/poll/epoll whatever) device.
>
> Ugh, I'd rather not if we don't have to.
>
> It seems like you could get this behaviour from issuing a
> poll/select(really?)/epoll as one of the async calls to complete. (And you
> mention this in a later email? :))

Yes, no need for the above. We can just host a poll/epoll in an async()
operation, and demultiplex once that gets ready.

- Davide

Davide Libenzi

unread,

Feb 5, 2007, 1:53:13 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:

> > The 'pool' of kernel threads doesnt even have to be per-task, it can be
> > a natural per-CPU thing
>
> Yeah, absolutely.

Hmmm, so we issue an async sys_read(); what will a get_file(fd) return for
a per-CPU kthread executing such a syscall? Unless we teach context_switch()
to do a inherit-trick for "files" (even in that case, it won't work if
we switch from another context). And, is it all for it?
IMO it's got to be either a per-process thread pool or a fibril approach.
Or we need some sort of enter_context()/leave_context() (adopt mm, files,
..) to have a per-CPU kthread to be able to execute the syscall from the
async() caller context. Hmmm?

- Davide

Zach Brown

unread,

Feb 5, 2007, 2:22:19 PM2/5/07

to Davide Libenzi

> Or we need some sort of enter_context()/leave_context() (adopt mm, files,
> ...) to have a per-CPU kthread to be able to execute the syscall from the
> async() caller context.

I believe that's what Ingo is hoping for, yes.

- z

Davide Libenzi

unread,

Feb 5, 2007, 2:26:52 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:

> > The normal and most optimal workflow should be a user-space ring-buffer
> > of these constant-size struct async_syscall entries:
> >
> > struct async_syscall ringbuffer[1024];
> >
> > LIST_HEAD(submitted);
> > LIST_HEAD(pending);
> > LIST_HEAD(completed);
>
> I strongly disagree here, and I'm hoping you're not as keen on this now --
> your reply to Matt gives me hope.
>
> As mentioned, that they complete out-of-order leads, at least, to having
> separate submission and completion rings. I'm not sure a submission ring
> makes any sense given the goal of processing the calls in submission and only
> creating threads if it blocks. A simple copy of an array of these input
> structs sounds fine to me.

The "result" of one async operation is basically a cookie and a result
code. Eight or sixteen bytes at most. IMO, before going wacko designing
complex shared userspace-kernel result buffers, I think it'd be better
measuring the worth-value of the thing ;)

- Davide

Davide Libenzi

unread,

Feb 5, 2007, 2:38:35 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:

> > Or we need some sort of enter_context()/leave_context() (adopt mm, files,
> > ...) to have a per-CPU kthread to be able to execute the syscall from the
> > async() caller context.
>
> I believe that's what Ingo is hoping for, yes.

Ok, but then we should ask ourselves if it's really worth to have a
per-CPU pool (that will require quite a few changes to the current way
of doing things), or a per-process pool (that would basically work as is).
What advantage gives us a per-CPU pool?
Setup cost? Not really IMO. Thread creation is pretty cheap, and a typical
process using async will have a pretty huge lifespan (compared to the pool
creation cost).
Configurability scores for a per-process pool, because it may allow each
process (eventually) to size his own.
What's the real point in favour of a per-CPU pool, that justifies all the
changes that will have to be done in order to adopt such concept?

- Davide

Zach Brown

unread,

Feb 5, 2007, 2:42:38 PM2/5/07

to Davide Libenzi

> The "result" of one async operation is basically a cookie and a result
> code. Eight or sixteen bytes at most.

s/basically/minimally/

Well, yeah. The patches I sent had:

struct asys_completion {
long return_code;
unsigned long cookie;
};

That's as stupid as it gets.

> IMO, before going wacko designing
> complex shared userspace-kernel result buffers, I think it'd be better
> measuring the worth-value of the thing ;)

Obviously, yes.

The potential win is to be able to have one place to wait for
collection from multiple sources. Some of them might want more data
per event. They can always indirect out via a cookie pointer, sure,
but at insanely high message rates (10gige small messages) one might
not want that.

Davide Libenzi

unread,

Feb 5, 2007, 3:10:43 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:

> > The "result" of one async operation is basically a cookie and a result
> > code. Eight or sixteen bytes at most.
>
> s/basically/minimally/
>
> Well, yeah. The patches I sent had:
>
> struct asys_completion {
> long return_code;
> unsigned long cookie;
> };
>
> That's as stupid as it gets.

No, that's *really* it ;)
The cookie you pass, and the return code of the syscall.
Is there other data transferred? Sure, but that data is transferred during the
syscall processing, and handled by the syscall (filling up a sys_read
buffer just for example).

> > IMO, before going wacko designing
> > complex shared userspace-kernel result buffers, I think it'd be better
> > measuring the worth-value of the thing ;)
>
> Obviously, yes.
>
> The potential win is to be able to have one place to wait for collection from
> multiple sources. Some of them might want more data per event. They can
> always indirect out via a cookie pointer, sure, but at insanely high message
> rates (10gige small messages) one might not want that.

Did I miss something? The async() syscall will allow (with few
restrictions) to execute whatever syscall in an async fashion. A syscall
returns a result code (long). Plus, you need to pass back the
userspace-provided cookie of course. A cookie is very likely a direct
pointer to the userspace session the async syscall applies to, so a
"(my_session *) results[i].cookie" will bring you directly on topic.
Collection of multiple sources? What do you mean? What's wrong with:

int async_wait(struct asys_completion *results, int nresults);

Is saving an 8/16 bytes double copy worth going wacko in designing shared
userspace/kernel buffers, when the syscall that lays behind an
asys_completion is prolly touching KBs of RAM during its execution?

- Davide

Zach Brown

unread,

Feb 5, 2007, 3:22:50 PM2/5/07

to Davide Libenzi

> No, that's *really* it ;)

For syscalls, sure.

The kevent work incorporates Uli's desire to have more data per
event. Have you read his OLS stuff? It's been a while since I did
so I've lost the details of why he cares to have more.

Let me say it again, maybe a little louder this time: I'm not
interested in worrying about this aspect of the API until the
scheduler mechanics are more solidified.

- z

Linus Torvalds

unread,

Feb 5, 2007, 3:40:11 PM2/5/07

to Davide Libenzi

On Mon, 5 Feb 2007, Davide Libenzi wrote:
>
> No, that's *really* it ;)
>
> The cookie you pass, and the return code of the syscall.
> If there other data transfered? Sure, but that data transfered during the
> syscall processing, and handled by the syscall (filling up a sys_read
> buffer just for example).

Indeed. One word is *exactly* what a normal system call returns too.

That said, normally we have a user-space library layer to turn that into
the "errno + return value" thing, and in the case of async() calls we
very basically wouldn't have that. So either:

- we'd need to do it in the kernel (which is actually nasty, since
different system calls have slightly different semantics - some don't
return any error value at all, and negative numbers are real numbers)

- we'd have to teach user space about the "negative errno" mechanism, in
which case one word really is always enough.

Quite frankly, I much prefer the second alternative. The "negative errno"
thing has not only worked really really well inside the kernel, it's so
obviously 100% superior to the standard UNIX "-1 + errno" approach that
it's not even funny.

To see why "negative errno" is better, just look at any threaded program,
or look at any program that does multiple calls and needs to save the
errno not from the last one, but from some earlier one (eg, it does a
"close()" in between returning *its* error, and the real operation that
we care about).

> Did I miss something? The async() syscall will allow (with few
> restrictions) to execute whatever syscall in an async fashion. An syscall
> returns a result code (long). Plus, you need to pass back the
> userspace-provided cookie of course.

HOWEVER, they get returned differently. The cookie gets returned
immediately, the system call result gets returned in-memory only after the
async thing has actually completed.

I would actually argue that it's not the kernel that should generate any
cookie, but that user-space should *pass*in* the cookie it wants to, and
the kernel should consider it a pointer to a 64-bit entity which is the
return code.

In other words, the only "cookie" we need is literally the pointer to the
results. And that's obviously something that the user space has to set up
anyway.

So how about making the async interface be:

// returns negative for error
// zero for "synchronous"
// positive kernel "wait for me" cookie for success
long sys_async_submit(
unsigned long flags,
long *user_result_ptr,
long syscall,
unsigned long *args);

and the "args" thing would literally just fill up the registers.

The real downside here is that it's very architecture-specific this way,
and that means that x86-64 (and other 64-bit ones) would need to have
emulation layers for the 32-bit ones, but they likely need to do that
*anyway*, so it's probably not a huge downside. The alternative is to:

- make a new architecture-independent system call enumeration for the
async interface

- make everything use 64-bit values.

Now, making an architecture-independent system call enumeration may
actually make sense regardless, because it would allow sys_async() to have
its own system call table and put the limitations and rules for those
system calls there, instead of depending on the per-architecture system
call table that tends to have some really architecture-specific details.

Hmm?

Linus

Linus Torvalds

unread,

Feb 5, 2007, 3:43:16 PM2/5/07

to Zach Brown

On Mon, 5 Feb 2007, Zach Brown wrote:
>
> For syscalls, sure.
>
> The kevent work incorporates Uli's desire to have more data per event. Have
> you read his OLS stuff? It's been a while since I did so I've lost the
> details of why he cares to have more.

You'd still do that as _arguments_ to the system call, not as the return
value.

Also, quite frankly, I tend to find Uli over-designs things. The whole
disease of "make things general" is a CS disease that some people take to
extreme.

The good thing about generic code is not that it solves some generic
problem. The good thing about generics is that they mean that you can
_avoid_ solving other problems AND HAVE LESS CODE. But some people seem to
think that "generic" means that you have to have tons of code to handle
all the possible cases, and that *completely* misses the point.

We want less code. The whole (and really, the _only_) point of the
fibrils, at least as far as I'm concerned, is to *not* have special code
for aio_read/write/whatever.

Linus

Davide Libenzi

unread,

Feb 5, 2007, 4:09:48 PM2/5/07

to Linus Torvalds

On Mon, 5 Feb 2007, Linus Torvalds wrote:

> Indeed. One word is *exactly* what a normal system call returns too.
>
> That said, normally we have a user-space library layer to turn that into
> the "errno + return value" thing, and in the case of async() calls we
> very basically wouldn't have that. So either:
>
> - we'd need to do it in the kernel (which is actually nasty, since
> different system calls have slightly different semantics - some don't
> return any error value at all, and negative numbers are real numbers)
>
> - we'd have to teach user space about the "negative errno" mechanism, in
> which case one word really is alwats enough.
>
> Quite frankly, I much prefer the second alternative. The "negative errno"
> thing has not only worked really really well inside the kernel, it's so
> obviously 100% superior to the standard UNIX "-1 + errno" approach that
> it's not even funny.

Currently it's in the syscall wrapper. Couldn't we have it in the
asys_teardown_stack() stub?

> HOWEVER, they get returned differently. The cookie gets returned
> immediately, the system call result gets returned in-memory only after the
> async thing has actually completed.
>
> I would actually argue that it's not the kernel that should generate any
> cookie, but that user-space should *pass*in* the cookie it wants to, and
> the kernel should consider it a pointer to a 64-bit entity which is the
> return code.

Yes. Let's have the userspace to "mark" the async operation. IMO the
cookie should be something transparent to the kernel.
Like you said though, that'd require compat-code (unless we fix the size).

- Davide

Zach Brown

unread,

Feb 5, 2007, 4:22:53 PM2/5/07

to Linus Torvalds

> - we'd need to do it in the kernel (which is actually nasty, since
> different system calls have slightly different semantics - some
> don't
> return any error value at all, and negative numbers are real
> numbers)
>
> - we'd have to teach user space about the "negative errno"
> mechanism, in
> which case one word really is alwats enough.
>
> Quite frankly, I much prefer the second alternative. The "negative
> errno"
> thing has not only worked really really well inside the kernel,
> it's so
> obviously 100% superior to the standard UNIX "-1 + errno" approach
> that
> it's not even funny.

I agree, and I imagine you'd have a hard time finding someone who
actually *likes* the errno convention :)

> I would actually argue that it's not the kernel that should
> generate any
> cookie, but that user-space should *pass*in* the cookie it wants
> to, and
> the kernel should consider it a pointer to a 64-bit entity which is
> the
> return code.

Yup. That's how the current code (and epoll, and fs/aio.c, and..) work.

Cancelation comes into this discussion, I think. Hopefully it's
reasonable to expect userspace to be able to manage cookies well
enough that they can use them to issue cancels and only hit the ops
they intend to. It means we have to give them the tools to
differentiate between a racing completion and cancelation so they can
reuse a cookie at the right time, but that doesn't sound fatal.

> - make everything use 64-bit values.

This would be my preference.

> Now, making an architecture-independent system call enumeration may
> actually make sense regardless, because it would allow sys_async()
> to have
> its own system call table and put the limitations and rules for those
> system calls there, instead of depending on the per-architecture
> system
> call table that tends to have some really architecture-specific
> details.

Maybe, sure. I don't have a lot of insight into this. Hopefully
some arch maintainers can jump in?

- z

Kent Overstreet

unread,

Feb 5, 2007, 4:32:23 PM2/5/07

to Davide Libenzi

> > HOWEVER, they get returned differently. The cookie gets returned
> > immediately, the system call result gets returned in-memory only after the
> > async thing has actually completed.
> >
> > I would actually argue that it's not the kernel that should generate any
> > cookie, but that user-space should *pass*in* the cookie it wants to, and
> > the kernel should consider it a pointer to a 64-bit entity which is the
> > return code.
>
> Yes. Let's have the userspace to "mark" the async operation. IMO the
> cookie should be something transparent to the kernel.
> Like you said though, that'd require compat-code (unless we fix the size).

You don't need an explicit cookie if you're passing in a pointer to
the return code, it doesn't really save you anything to do so. Say
you've got a bunch of user threads (with or without stacks, it doesn't
matter).

struct asys_ret {
int ret;
struct thread *p;
};

struct asys_ret r;
r.p = me;

async_read(fd, buf, nbytes, &r);

Then you just have your async_getevents return the same pointers you
passed in, and your main event loop gets pointers to its threads for
free.

It seems cleaner to do it this way vs. returning structs with the
actual return code and a cookie, as threads get the return code
exactly where they want it.

Keep in mind that the epoll way (while great for epoll, I do love it)
makes sense because it doesn't have to deal with any sort of return
codes.

My only other point is that you really do want a bulk asys_submit
instead of doing a syscall per async syscall; one of the great wins of
this approach is heavily IO driven apps can batch up syscalls.

bert hubert

unread,

Feb 5, 2007, 4:37:19 PM2/5/07

to Davide Libenzi

On Fri, Feb 02, 2007 at 03:37:09PM -0800, Davide Libenzi wrote:

> Since I still think that the many-thousands potential async operations
> coming from network sockets are better handled with a classical event
> mechanism [1], and since smooth integration of new async syscall into the
> standard POSIX infrastructure is IMO a huge win, I think we need to have a
> "bridge" to allow async completions being detectable through a pollable
> (by the mean of select/poll/epoll whatever) device.

> [1] Unless you really want to have thousands of kthreads/fibrils lingering
> on the system.

From my end as an application developer, yes please. Either make it
perfectly ok to have thousands of outstanding asynchronous system calls (I
work with thousands of separate sockets), or allow me to select/poll/epoll
on the "async fd".

Alternatively, something like SIGIO ('SIGASYS'?) might be considered, but,
well, the fd might be easier.

In fact, perhaps the communication channel might simply *be* an fd. Queueing
up syscalls sounds remarkably like sending datagrams.

Bert

--
http://www.PowerDNS.com Open source, database driven DNS Software
http://netherlabs.nl Open and Closed source services

David Miller

unread,

Feb 5, 2007, 4:45:09 PM2/5/07

to dav...@xmailserver.org

From: Davide Libenzi <dav...@xmailserver.org>
Date: Mon, 5 Feb 2007 10:24:34 -0800 (PST)

> Yes, no need for the above. We can just host a poll/epoll in an async()
> operation, and demultiplex once that gets ready.

I can hear Evgeniy crying 8,000 miles away.

I strongly encourage a lot of folks commenting in this thread to
familiarize themselves with kevent and how it handles this stuff. I
see a lot of suggestions for things he has totally implemented and
solved already in kevent.

I'm not talking about Zach's fibril's, I'm talking about the interface
aspects of these discussions.

Linus Torvalds

unread,

Feb 5, 2007, 4:58:14 PM2/5/07

to bert hubert

On Mon, 5 Feb 2007, bert hubert wrote:
>
> From my end as an application developer, yes please. Either make it
> perfectly ok to have thousands of outstanding asynchronous system calls (I
> work with thousands of separate sockets), or allow me to select/poll/epoll
> on the "async fd".

No can do.

Allocating an fd is actually too expensive, exactly because a lot of these
operations are supposed to be a few hundred ns, and taking locks is simply
a bad idea.

But if you want to, we could have a *separate* "convert async cookie to
fd" so that you can poll for it, or something.

I doubt very many people want to do that. It would tend to simply be nicer
to do

async(poll);
async(waitpid);
async(.. wait for anything else ..)

followed by a

wait_for_async();

That's just a much NICER approach, I would argue. And it automatically
and very naturally solves the "wait for different kinds of events"
question, in a way that "poll()" never did (except by turning all events
into file descriptors or signals).

> Alternatively, something like SIGIO ('SIGASYS'?) might be considered, but,
> well, the fd might be easier.

Again. NO WAY. Signals are just damn expensive. At most, it would be an
option again, but if you want high performance, signals simply aren't very
good. They are also a nice way to make your user-space code very racy.

> In fact, perhaps the communication channel might simply *be* an fd. Queueing
> up syscalls sounds remarkably like sending datagrams.

I'm the first to say that file descriptors is the UNIX way, but so are
processes, and I think this is MUCH better done as a "process" interface.
In other words, instead of doing it as a filedescriptor, do it as a
"micro-fork/exec", and have the "wait()" equivalent. It's just that we
don't fork a "real process", and we don't exec a "real program", we just
exec a single system call.

If you think of it in those terms, it all makes sense *without* any file
descriptors what-so-ever, and the "wait_for_async()" interface also makes
a ton of sense (it really *is* "waitpid()" for the system call).

Linus

bert hubert

unread,

Feb 5, 2007, 5:08:13 PM2/5/07

to Linus Torvalds

On Mon, Feb 05, 2007 at 01:57:15PM -0800, Linus Torvalds wrote:

> I doubt very many people want to do that. It would tend to simply be nicer
> to do
>
> async(poll);

Yeah - I saw that technique being mentioned later on in the thread, and it
would work, I think.

To make up for the waste of time, some other news. I asked Matt Dillon of
DragonflyBSD why he removed asynchronous system calls from his OS, and he
told me it was because of the problems he had implementing them in the
kernel:

There were two basic problems: First, it added a lot of overhead when
most system calls are either non-blocking anyway (like getpid()).
Second and more importantly it was very, very difficult to recode the
system calls that COULD block to actually be asynchronous in the kernel.
I spent some time recoding nanosleep() to operate asynchronously and it
was a huge mess.

Aside from that, they did not discover any skeletons hidden in the closet,
although from mailing list traffic, I gather the asynchronous system calls
didn't see a lot of use. If I understand it correctly, for a number of years
they emulated asynchronous system calls using threads.

We'd be sidestepping the need to update all syscalls via 'fibrils' of
course.

> If you think of it in those terms, it all makes sense *without* any file
> descriptors what-so-ever, and the "wait_for_async()" interface also makes
> a ton of sense (it really *is* "waitpid()" for the system call).

It has me excited in any case. Once anything even remotely testable appears
(Zach tells me not to try the current code), I'll work it into MTasker
(http://ds9a.nl/mtasker) and make it power a nameserver that does async i/o,
for use with very very large zones that aren't preloaded.

Bert

--
http://www.PowerDNS.com Open source, database driven DNS Software
http://netherlabs.nl Open and Closed source services

Zach Brown

unread,

Feb 5, 2007, 5:16:07 PM2/5/07

to bert hubert

> It has me excited in any case. Once anything even remotely testable
> appears
> (Zach tells me not to try the current code), I'll work it into MTasker
> (http://ds9a.nl/mtasker) and make it power a nameserver that does
> async i/o,
> for use with very very large zones that aren't preloaded.

I'll be sure to let you know :)

- z

Davide Libenzi

unread,

Feb 5, 2007, 5:35:56 PM2/5/07

to Linus Torvalds

On Mon, 5 Feb 2007, Linus Torvalds wrote:

> On Mon, 5 Feb 2007, bert hubert wrote:
> >
> > From my end as an application developer, yes please. Either make it
> > perfectly ok to have thousands of outstanding asynchronous system calls (I
> > work with thousands of separate sockets), or allow me to select/poll/epoll
> > on the "async fd".
>
> No can do.
>
> Allocating an fd is actually too expensive, exactly because a lot of these
> operations are supposed to be a few hundred ns, and taking locks is simply
> a bad idea.
>
> But if you want to, we could have a *separate* "convert async cookie to
> fd" so that you can poll for it, or something.
>
> I doubt very many people want to do that. It would tend to simply be nicer
> to do
>
> async(poll);
> async(waitpid);
> async(.. wait foranything else ..)
>
> followed by a
>
> wait_for_async();
>
> That's just a much NICER approach, I would argue. And it automatically
> and very naturally solves the "wait for different kinds of events"
> question, in a way that "poll()" never did (except by turning all events
> into file descriptors or signals).

Bert, that was the first suggestion I gave to Zab. But then I realized
that a multiplexed poll/epoll can be "hosted" in an async op, just like
Linus showed above. Will work just fine IMO.

- Davide

Davide Libenzi

unread,

Feb 5, 2007, 7:16:01 PM2/5/07

to David Miller

On Mon, 5 Feb 2007, David Miller wrote:

> From: Davide Libenzi <dav...@xmailserver.org>
> Date: Mon, 5 Feb 2007 10:24:34 -0800 (PST)
>
> > Yes, no need for the above. We can just host a poll/epoll in an async()
> > operation, and demultiplex once that gets ready.
>
> I can hear Evgeniy crying 8,000 miles away.
>
> I strongly encourage a lot of folks commenting in this thread to
> familiarize themselves with kevent and how it handles this stuff. I
> see a lot of suggestions for things he has totally implemented and
> solved already in kevent.

David, I'm sorry but I only briefly looked at the work Evgeniy did on
kevent. So excuse me if I say something broken in the next few sentences.
Zab's async syscall interface is a pretty simple one. It accepts the
syscall number, the parameters for the syscall, and a cookie. It returns a
syscall result code, and your cookie (that's the meat of it, at least).
IMO its interface should be optimized for what it does.
Could this submission/retrieval be incorporated inside a "generic"
submission/retrieval API? Sure you can. But then you end up having
submission/event structures with 17 members, 3 of which are valid at each
time. The API becomes more difficult to use IMO, because suddenly you
have to know which fields are good for each event you're submitting/fetching.
IMHO, genericity can be built in userspace, *if* one really wants it and,
of course, provided that the OS gives you the tools to build it.
The problem before, was that it was hard to bridge something like poll/epoll
with other "by nature" sync operations. Evgeniy kevent is one attempt,
Zab's async is another one. IMO the async syscall is a *very powerful* one,
since it allows for *total coverage* async support w/out "plugs" all over
the kernel paths.
But as Zab said, the kernel implementation is more important ATM.

- Davide

Scot McKinley

unread,

Feb 5, 2007, 7:29:24 PM2/5/07

to Linus Torvalds

As Joel mentioned earlier, from an Oracle perspective, one of the key
things we are looking for is a nice clean *common* wait point. We don't
really care whether this common wait point is the old libaio:async-poll,
epoll, or "wait_for_async". And if "wait_for_async" has the added
benefit of scaling, all the better.

However, it is desirable for that common wait-routine to have the
ability to return explicit completions, instead of requiring a follow-on
call to some other query/wait for events/completions for each of the
different type of async submissions done (poll, pid, i/o, ...).
Obviously not a "must-have", but desirable.

It is also desirable (if possible) to have immediate completions (either
immediate errs or async submissions that complete synchronously)
communicated at submission time, instead of via the common wait-routine.

Finally, it is agreed that neg-errno is a much better approach for the
return code. The threading/concurrency issues associated w/ the current
unix errno have always been a buggy area for Oracle Networking code.

Regards, -Scot

Linus Torvalds wrote:

>--
>To unsubscribe, send a message with 'unsubscribe linux-aio' in
>the body to majo...@kvack.org. For more info on Linux AIO,
>see: http://www.kvack.org/aio/
>Don't email: <a href=mailto:"aa...@kvack.org">aa...@kvack.org</a>

Davide Libenzi

unread,

Feb 5, 2007, 7:33:17 PM2/5/07

to Linus Torvalds

On Mon, 5 Feb 2007, Davide Libenzi wrote:

> On Mon, 5 Feb 2007, Linus Torvalds wrote:
>
> > Indeed. One word is *exactly* what a normal system call returns too.
> >
> > That said, normally we have a user-space library layer to turn that into
> > the "errno + return value" thing, and in the case of async() calls we
> > very basically wouldn't have that. So either:
> >
> > - we'd need to do it in the kernel (which is actually nasty, since
> > different system calls have slightly different semantics - some don't
> > return any error value at all, and negative numbers are real numbers)
> >
> > - we'd have to teach user space about the "negative errno" mechanism, in
> > which case one word really is alwats enough.
> >
> > Quite frankly, I much prefer the second alternative. The "negative errno"
> > thing has not only worked really really well inside the kernel, it's so
> > obviously 100% superior to the standard UNIX "-1 + errno" approach that
> > it's not even funny.
>
> Currently it's in the syscall wrapper. Couldn't we have it in the
> asys_teardown_stack() stub?

Eeeek, that was something *really* stupid I said :D

David Miller

unread,

Feb 5, 2007, 7:49:10 PM2/5/07

to scot.m...@oracle.com

From: Scot McKinley <scot.m...@oracle.com>
Date: Mon, 05 Feb 2007 16:27:44 -0800

> As Joel mentioned earlier, from an Oracle perspective, one of the key
> things we are looking for is a nice clean *common* wait point.

How much investigation have the Oracle folks (besides Zach :-) done
into Evgeniy's kevent interfaces and how much feedback have they given
to him?

I know it sounds like I'm being a pain in the ass, but it saddens
me that there is this whole large body of work implemented to solve
a problem, the maintainer keeps posting patch sets and the whole
discussion has gone silent.

I'd be quiet if there were some well formulated objections to his work
being posted, but people are posting nothing. So either it's a
perfect API or people aren't giving it the attention and consideration
it deserves.

Joel Becker

unread,

Feb 5, 2007, 7:49:58 PM2/5/07

to Scot McKinley

On Mon, Feb 05, 2007 at 04:27:44PM -0800, Scot McKinley wrote:
> Finally, it is agreed that neg-errno is a much better approach for the
> return code. The threading/concurrency issues associated w/ the current
> unix errno has always been buggy area for Oracle Networking code.

As Scot knows, when Oracle started using the current io_submit(2)
and io_getevents(2), -errno was a big win.

Joel

--

"Born under a bad sign.
I been down since I began to crawl.
If it wasn't for bad luck,
I wouldn't have no luck at all."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel....@oracle.com
Phone: (650) 506-8127

Al Boldi

unread,

Feb 6, 2007, 8:41:40 AM2/6/07

to linux-...@vger.kernel.org

Linus Torvalds wrote:
> On Mon, 5 Feb 2007, Zach Brown wrote:
> > For syscalls, sure.
> >
> > The kevent work incorporates Uli's desire to have more data per event.
> > Have you read his OLS stuff? It's been a while since I did so I've lost
> > the details of why he cares to have more.
>
> You'd still do that as _arguments_ to the system call, not as the return
> value.
>
> Also, quite frankly, I tend to find Uli over-designs things. The whole
> disease of "make things general" is a CS disease that some people take to
> extreme.
>
> The good thing about generic code is not that it solves some generic
> problem. The good thing about generics is that they mean that you can
> _avoid_ solving other problems AND HAVE LESS CODE.

Yes, that would be generic code, in the pure sense.

> But some people seem to
> think that "generic" means that you have to have tons of code to handle
> all the possible cases, and that *completely* misses the point.

That would be generic code too, but by way of functional awareness. This is
sometimes necessary, as no pure generic code has been found.

What's important is not the generic code, but rather the correct abstraction
of the problem-domain, regardless of its implementation, as that can be
conveniently hidden behind the interface.

> We want less code. The whole (and really, the _only_) point of the
> fibrils, at least as far as I'm concerned, is to *not* have special code
> for aio_read/write/whatever.

What we want is correct code, and usually that means less code in the long
run.

So, instead of allowing the implementation to dictate the system design, it
may be advisable to concentrate on the design first, to achieve an abstract
interface that is realized by an implementation second.

Thanks!

--
Al

Davide Libenzi

unread,

Feb 6, 2007, 3:25:47 PM2/6/07

to Kent Overstreet

On Mon, 5 Feb 2007, Kent Overstreet wrote:

> > > HOWEVER, they get returned differently. The cookie gets returned
> > > immediately, the system call result gets returned in-memory only after the
> > > async thing has actually completed.
> > >
> > > I would actually argue that it's not the kernel that should generate any
> > > cookie, but that user-space should *pass*in* the cookie it wants to, and
> > > the kernel should consider it a pointer to a 64-bit entity which is the
> > > return code.
> >
> > Yes. Let's have the userspace to "mark" the async operation. IMO the
> > cookie should be something transparent to the kernel.
> > Like you said though, that'd require compat-code (unless we fix the size).
>
> You don't need an explicit cookie if you're passing in a pointer to
> the return code, it doesn't really save you anything to do so. Say
> you've got a bunch of user threads (with or without stacks, it doesn't
> matter).
>
> struct asys_ret {
> int ret;
> struct thread *p;
> };
>
> struct asys_ret r;
> r.p = me;
>
> async_read(fd, buf, nbytes, &r);

Hmm, are you working for Symbian? Because that's exactly how they track
pending async operations (address of a status variable - wrapped in a
class of course, being them) ;)
That's another way of doing it, IMO no better and no worse than allowing
explicit cookie selection from userspace. You still have to have the
compat code though, either way.

- Davide

Linus Torvalds

unread,

Feb 6, 2007, 3:47:09 PM2/6/07

to Kent Overstreet

On Mon, 5 Feb 2007, Kent Overstreet wrote:
>

> You don't need an explicit cookie if you're passing in a pointer to
> the return code, it doesn't really save you anything to do so. Say
> you've got a bunch of user threads (with or without stacks, it doesn't
> matter).
>
> struct asys_ret {
> int ret;
> struct thread *p;
> };
>
> struct asys_ret r;
> r.p = me;
>
> async_read(fd, buf, nbytes, &r);

That's horrible. It means that "r" cannot have automatic linkage (since
the stack will be *gone* by the time we need to fill in "ret"), so now you
need to track *two* pointers: "me" and "&r".

Wouldn't it be much better to just track one (both in user space and in
kernel space).

In kernel space, the "one pointer" would be the fibril pointer (which
needs to have all the information necessary for completing the operation
anyway), and in user space, it would be better to have just the cookie be
a pointer to the place where you expect the return value (since you need
both anyway).

I think the point here (for *both* the kernel and user space) would be to
try to keep the interfaces really easy to use. For the kernel, it means
that we don't ever pass anything new around: the "fibril" pointer is
basically defined by the current execution thread.

And for user space, it means that we pass the _one_ thing around that we
need for both identifying the async operation to the kernel (the "cookie")
for wait or cancel, and the place where we expect the return value to be
found (which in turn can _easily_ represent a whole "struct aiocb *",
since the return value obviously has to be embedded in there anyway).

Linus

David Miller

unread,

Feb 6, 2007, 4:17:07 PM2/6/07

to torv...@linux-foundation.org

From: Linus Torvalds <torv...@linux-foundation.org>
Date: Tue, 6 Feb 2007 12:46:11 -0800 (PST)

> And for user space, it means that we pass the _one_ thing around that we
> need for both identifying the async operation to the kernel (the "cookie")
> for wait or cancel, and the place where we expect the return value to be
> found (which in turn can _easily_ represent a whole "struct aiocb *",
> since the return value obviously has to be embedded in there anyway).

I really think that Evgeniy's kevent is a good event notification
mechanism for anything, including AIO.

Events are events, applications want a centralized way to receive and
process them.

It's already implemented, and if there are tangible problems with it,
Evgeniy has been excellent at responding to criticism and implementing
suggested changes to the interfaces.

Linus Torvalds

unread,

Feb 6, 2007, 4:29:42 PM2/6/07

to David Miller

On Tue, 6 Feb 2007, David Miller wrote:
>
> I really think that Evgeniy's kevent is a good event notification
> mechanism for anything, including AIO.
>
> Events are events, applications want a centralized way to receive and
> process them.

Don't be silly. AIO isn't an event. AIO is an *action*.

The event part is hopefully something that doesn't even *happen*.

Why do people ignore this? Look at a web server: I can pretty much
guarantee that 99% of all filesystem accesses are cached, and doing them
as "events" would be a total and utter waste of time.

You want to do them synchronously, as fast as possible, and you do NOT
want to see them as any kind of asynchronous events.

Yeah, in 1% of all cases it will block, and you'll want to wait for them.
Maybe the kevent queue works then, but if it needs any more setup than the
nonblocking case, that's a big no.

Linus

David Miller

unread,

Feb 6, 2007, 4:32:32 PM2/6/07

to torv...@linux-foundation.org

From: Linus Torvalds <torv...@linux-foundation.org>
Date: Tue, 6 Feb 2007 13:28:34 -0800 (PST)

> Yeah, in 1% of all cases it will block, and you'll want to wait for them.
> Maybe the kevent queue works then, but if it needs any more setup than the
> nonblocking case, that's a big no.

So the idea is to just run it to completion if it won't block and use
a fibril if it would?

kevent could support something like that too.

Eric Dumazet

unread,

Feb 6, 2007, 4:48:08 PM2/6/07

to David Miller

David Miller a écrit :

> From: Linus Torvalds <torv...@linux-foundation.org>
> Date: Tue, 6 Feb 2007 13:28:34 -0800 (PST)
>
>> Yeah, in 1% of all cases it will block, and you'll want to wait for them.
>> Maybe the kevent queue works then, but if it needs any more setup than the
>> nonblocking case, that's a big no.
>
> So the idea is to just run it to completion if it won't block and use
> a fibril if it would?
>
> kevent could support something like that too.

It seems to me that kevent was designed to handle many event sources on a
single endpoint, like epoll (but with different internals). A typical load is
thousands of socket/pipe providers glued into one queue.

In the fibril case, I guess a thread won't have many fibrils lying around...

Also, kevent needs a fd lookup/fput to retrieve some queued events, and that
may be a performance hit for the AIO case, (fget/fput in a multi-threaded
program cost some atomic ops)

Linus Torvalds

unread,

Feb 6, 2007, 4:51:03 PM2/6/07

to David Miller

On Tue, 6 Feb 2007, David Miller wrote:
>

> So the idea is to just run it to completion if it won't block and use
> a fibril if it would?

That's not how the patches work right now, but yes, I at least personally
think that it's something we should aim for (ie the interface shouldn't
_require_ us to always wait for things even if perhaps an early
implementation might make everything be delayed at first)

Linus

Zach Brown

unread,

Feb 6, 2007, 5:29:50 PM2/6/07

to Linus Torvalds

> That's not how the patches work right now, but yes, I at least
> personally
> think that it's something we should aim for (ie the interface
> shouldn't
> _require_ us to always wait for things even if perhaps an early
> implementation might make everything be delayed at first)

I agree that we shouldn't require a separate syscall just to get the
return code from ops that didn't block.

It doesn't seem like much of a stretch to imagine a setup where we
can specify completion context as part of the submission itself.

declare_empty_ring(ring);
struct submission sub;

sub.ring = &ring;
sub.nr = SYS_fstat64;
sub.args = ...

ret = submit(&sub, 1);
if (ret == 0) {
wait_for_elements(&ring, 1);
printf("stat gave %d\n", ring[ring->head].rc);
}

You get the idea, it's just an outline.

wait_for_elements() could obviously check the ring before falling
back to kernel sync. I'm pretty keen on the notion of producer/
consumer rings where userspace writes the head as it plucks
completions and the kernel writes the tail as it adds them.

We might want per-call ring pointers, instead of per submission, to
help submitters wait for a group of ops to complete without having to
do their own tracking on event completion. That only makes sense if
we have the waiting mechanics let you only be woken as the number of
events in the ring crosses some threshold. Which I think we want
anyway.

We'd be trading building up a specific completion state with syscalls
for some complexity during submission that pins (and kmaps on
completion) the user pages. Submission could return failure if
pinning these new pages would push us over some rlimit. We'd have to
be *awfully* careful not to let userspace corrupt (munmap?) the ring
and confuse the hell out of the kernel.

Maybe not worth it, but if we *really* cared about making the non-
blocking case almost identical to the sync case and wanted to use the
same interface for batch submission and async completion then this
seems like a possibility.

- z

Kent Overstreet

unread,

Feb 6, 2007, 5:46:17 PM2/6/07

to Linus Torvalds

On 2/6/07, Linus Torvalds <torv...@linux-foundation.org> wrote:
> On Mon, 5 Feb 2007, Kent Overstreet wrote:
> >
> > struct asys_ret {
> > int ret;
> > struct thread *p;
> > };
> >
> > struct asys_ret r;
> > r.p = me;
> >
> > async_read(fd, buf, nbytes, &r);
>
> That's horrible. It means that "r" cannot have automatic linkage (since
> the stack will be *gone* by the time we need to fill in "ret"), so now you
> need to track *two* pointers: "me" and "&r".

You'd only allocate r on the stack if that stack is going to be around
later; i.e. if you're using user threads. Otherwise, you just allocate
it in some struct containing your aiocb or whatever.

> And for user space, it means that we pass the _one_ thing around that we
> need for both identifying the async operation to the kernel (the "cookie")
> for wait or cancel, and the place where we expect the return value to be
> found (which in turn can _easily_ represent a whole "struct aiocb *",
> since the return value obviously has to be embedded in there anyway).
>
> Linus

The "struct aiocb" isn't something you have to or necessarily want to
keep around. It's the way the current aio interface works (which I've
coded to), but I don't really see the point. All it really contains is
the syscall arguments, but once the syscall's in progress there's no
reason the kernel has to refer back to it; similarly for userspace,
it's just another struct that userspace has to keep track of and free
at some later time.

In fact, that's the only sane way you can have a ring for submitted
system calls, as otherwise elements of the ring are getting freed in
essentially random order.

I don't see the point in having a ring for completed events, since
it's at most two pointers per completion; quite a bit less data being
sent back than for submissions.

-----

The trouble with differentiating between calls that block and calls
that don't is you completely lose the ability to batch syscalls
together; this is potentially a major win of an asynchronous
interface.

An app can have a bunch of cheap, fast user space threads servicing
whatever; as they run, they can push their system calls onto a global
stack. When no more can run, it does a giant asys_submit (something
similar to io_submit), then the io_getevents equivalent, running the
user threads that had their syscalls complete.

This doesn't mean you can't run synchronously the syscalls that
wouldn't block, or that you have to allocate a fibril for every
syscall - but for servers that care more about throughput than
latency, this is potentially a big win, in cache effects if nothing
else.

(And this doesn't prevent you from having a different syscall that
submits an asynchronous syscall, but runs it right away if it was able
to without blocking).

Linus Torvalds

unread,

Feb 6, 2007, 6:05:50 PM2/6/07

to Kent Overstreet

On Tue, 6 Feb 2007, Kent Overstreet wrote:
>
> The "struct aiocb" isn't something you have to or necessarily want to
> keep around.

Oh, don't get me wrong - the _only_ reason for "struct aiocb" would be
backwards compatibility. The point is, we'd need to keep that
compatibility to be useful - otherwise we just end up having to duplicate
the work (do _both_ fibrils _and_ the in-kernel AIO).

> I don't see the point in having a ring for completed events, since
> it's at most two pointers per completion; quite a bit less data being
> sent back than for submissions.

I'm certainly personally perfectly happy with the kernel not remembering
any completed events at all - once it's done, it's done and forgotten. So
doing

async(mycookie)
wait_for_async(mycookie)

could actually return with -ECHILD (or similar error).

In other words, if you see it as a "process interface" (instead of as a
"filedescriptor interface"), I'd suggest automatic reaping of the fibril
children. I do *not* think we want the equivalent of zombies - if only
because they are just a lot of work to reap, and potentially a lot of
memory to keep around.

Linus

Davide Libenzi

unread,

Feb 6, 2007, 6:24:29 PM2/6/07

to Kent Overstreet

On Tue, 6 Feb 2007, Kent Overstreet wrote:

> The trouble with differentiating between calls that block and calls
> that don't is you completely loose the ability to batch syscalls
> together; this is potentially a major win of an asynchronous
> interface.

It doesn't necessarily have to, once you extend the single return code to a
vector:

struct async_submit {
void *cookie;
int sysc_nbr;
int nargs;
long args[ASYNC_MAX_ARGS];
int async_result;
};

int async_submit(struct async_submit *a, int n);

And async_submit() can mark each one ->async_result with -EASYNC (syscall
has been batched), or another code (syscall completed w/out schedule).
IMO, once you get a -EASYNC for a syscall, you *have* to retire the result.

- Davide

Joel Becker

unread,

Feb 6, 2007, 6:40:02 PM2/6/07

to Davide Libenzi

On Tue, Feb 06, 2007 at 03:23:47PM -0800, Davide Libenzi wrote:
> struct async_submit {
> void *cookie;
> int sysc_nbr;
> int nargs;
> long args[ASYNC_MAX_ARGS];
> int async_result;
> };
>
> int async_submit(struct async_submit *a, int n);
>
> And async_submit() can mark each one ->async_result with -EASYNC (syscall
> has been batched), or another code (syscall completed w/out schedule).
> IMO, once you get a -EASYNC for a syscall, you *have* to retire the result.

There are pains here, though. On every submit, you have to walk
the entire vector just to know what did or did not complete. I've seen
this in other APIs (eg, async_result would be -EAGAIN for lack of
resources to start this particular fibril). Userspace submit ends up
always walking the array of submissions twice - once to prep them, and
once to check if they actually went async. For longer lists of I/Os,
this is expensive.

Joel

--

"Too much walking shoes worn thin.
Too much trippin' and my soul's worn thin.
Time to catch a ride it leaves today
Her name is what it means.
Too much walking shoes worn thin."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel....@oracle.com
Phone: (650) 506-8127

Davide Libenzi

unread,

Feb 6, 2007, 6:56:45 PM2/6/07

to Joel Becker

On Tue, 6 Feb 2007, Joel Becker wrote:

> On Tue, Feb 06, 2007 at 03:23:47PM -0800, Davide Libenzi wrote:
> > struct async_submit {
> > void *cookie;
> > int sysc_nbr;
> > int nargs;
> > long args[ASYNC_MAX_ARGS];
> > int async_result;
> > };
> >
> > int async_submit(struct async_submit *a, int n);
> >
> > And async_submit() can mark each one ->async_result with -EASYNC (syscall
> > has been batched), or another code (syscall completed w/out schedule).
> > IMO, once you get a -EASYNC for a syscall, you *have* to retire the result.
>
> There are pains here, though. On every submit, you have to walk
> the entire vector just to know what did or did not complete. I've seen
> this in other APIs (eg, async_result would be -EAGAIN for lack of
> resources to start this particular fibril). Userspace submit ends up
> always walking the array of submissions twice - once to prep them, and
> once to check if they actually went async. For longer lists of I/Os,
> this is expensive.

Async syscall submissions are a _one time_ thing. It's not like a live fd
that you can push inside epoll and avoid the multiple O(N) passes.
First of all, the number of syscalls that you'd submit in a vectored way
is limited. It does not depend on the total number of connections, but on
the number of syscalls that you are actually able to submit in parallel.
Note that it's not a trivial task to extract a high enough level of
parallelism, that would make you feel pain in having to walk through the
submission array. Think about the trivial web server case. Remote HTTP
client asks one page, and you may think to batch a few ops together (like
a stat, open, send headers, and sendfile for example), but those cannot be
vectored since they have to complete in order. The stat would even trigger
different response to the HTTP client. You need the open() fd to submit
the send-headers and sendfile.
IMO there are no scalability problems in a multiple submission/retrieval
API like the above (or any variation of it).

- Davide

Joel Becker

unread,

Feb 6, 2007, 7:07:18 PM2/6/07

to Davide Libenzi

On Tue, Feb 06, 2007 at 03:56:14PM -0800, Davide Libenzi wrote:
> Async syscall submissions are a _one time_ things. It's not like a live fd
> that you can push inside epoll and avoid the multiple O(N) passes.
> First of all, the amount of syscalls that you'd submit in a vectored way
> are limited. They do not depend on the total number of connections, but on

I regularly see apps that want to submit 1000 I/Os at once.
Every submit. But it's all against one or two file descriptors. So, if
you return to userspace, they have to walk all 1000 async_results every
time, just to see which completed and which didn't. And *then* go wait
for the ones that didn't. If they just wait for them all, they aren't
spinning cpu on the -EASYNC operations.
I'm not saying that "don't return a completion if we can
non-block it" is inherently wrong or not a good idea. I'm saying that
we need a way to flag them efficiently.

Joel

--

Life's Little Instruction Book #80

"Slow dance"

Joel Becker
Principal Software Developer
Oracle
E-mail: joel....@oracle.com
Phone: (650) 506-8127

Davide Libenzi

unread,

Feb 6, 2007, 7:25:08 PM2/6/07

to Joel Becker

On Tue, 6 Feb 2007, Joel Becker wrote:

> On Tue, Feb 06, 2007 at 03:56:14PM -0800, Davide Libenzi wrote:
> > Async syscall submissions are a _one time_ things. It's not like a live fd
> > that you can push inside epoll and avoid the multiple O(N) passes.
> > First of all, the amount of syscalls that you'd submit in a vectored way
> > are limited. They do not depend on the total number of connections, but on
>
> I regularly see apps that want to submit 1000 I/Os at once.
> Every submit. But it's all against one or two file descriptors. So, if
> you return to userspace, they have to walk all 1000 async_results every
> time, just to see which completed and which didn't. And *then* go wait
> for the ones that didn't. If they just wait for them all, they aren't
> spinning cpu on the -EASYNC operations.
> I'm not saying that "don't return a completion if we can
> non-block it" is inherently wrong or not a good idea. I'm saying that
> we need a way to flag them efficiently.

To how many "sessions" do those 1000 *parallel* I/O operations refer?
Because, if you batch them in an async fashion, they have to be parallel.
Without the per-async operation status code, you'll need to wait for a result
*for each* submitted syscall, even the ones that completed synchronously.
Open questions are:

- Is the 1000 *parallel* syscall vectored submission case common?

- Is it more expensive to forcibly have to wait and fetch a result even
for in-cache syscalls, or is it faster to walk the submission array?

- Davide

Joel Becker

unread,

Feb 6, 2007, 7:45:43 PM2/6/07

to Davide Libenzi

On Tue, Feb 06, 2007 at 04:23:52PM -0800, Davide Libenzi wrote:
> To how many "sessions" those 1000 *parallel* I/O operations refer to?
> Because, if you batch them in an async fashion, they have to be parallel.

They're independent. Of course they have to be parallel, that's
what I/O wants.

> Without the per-async operation status code, you'll need to wait a result
> *for each* submitted syscall, even the ones that completed syncronously.

You are right, but it's more efficient in some cases.

> Open questions are:
>
> - Is the 1000 *parallel* syscall vectored submission case common?

Sure is for I/O. It's the majority of the case. If you have
1000 blocks to send out, you want them all down at the request queue at
once, where they can merge.

> - Is it more expensive to forcibly have to wait and fetch a result even
> for in-cache syscalls, or it's faster to walk the submission array?

Not everything is in-cache. Databases will be doing O_DIRECT
and will expect that 90% of their I/O calls will block. Why should they
have to iterate this list every time? If this is the API, they *have*
to. If there's an efficient way to get "just the ones that didn't
block", then it's not a problem.

Joel

--

"The real reason GNU ls is 8-bit-clean is so that they can
start using ISO-8859-1 option characters."
- Christopher Davis (c...@loiosh.kei.com)

Joel Becker
Principal Software Developer
Oracle
E-mail: joel....@oracle.com
Phone: (650) 506-8127

Davide Libenzi

unread,

Feb 6, 2007, 8:15:36 PM2/6/07

to Joel Becker

On Tue, 6 Feb 2007, Joel Becker wrote:

> > - Is it more expensive to forcibly have to wait and fetch a result even
> > for in-cache syscalls, or it's faster to walk the submission array?
>
> Not everything is in-cache. Databases will be doing O_DIRECT
> and will expect that 90% of their I/O calls will block. Why should they
> have to iterate this list every time? If this is the API, they *have*
> to. If there's an efficient way to get "just the ones that didn't
> block", then it's not a problem.

If that's what is wanted, then the async_submit() API can detect the
synchronous completion early, and drop a result inside the result-queue
immediately. It means that an immediately following async_wait() will find
some completions right away. Or:

struct async_submit {
void *cookie;
int sysc_nbr;
int nargs;
long args[ASYNC_MAX_ARGS];

};
struct async_result {
void *cookie;
long result;
};

int async_submit(struct async_submit *a, struct async_result *r, int n);

Where "r" will store the ones that completed synchronously. I mean, there
are really many ways to do this.
I think ATM the core kernel implementation should be the focus, because
IMO we just scratched the surface of the potential problems that something
like this can give rise to (scheduling, signaling, cleanup, cancel - just to
name a few).

- Davide

Kent Overstreet

unread,

Feb 6, 2007, 8:22:34 PM2/6/07

to Linus Torvalds

On 2/6/07, Linus Torvalds <torv...@linux-foundation.org> wrote:

> On Tue, 6 Feb 2007, Kent Overstreet wrote:
> >
> > The "struct aiocb" isn't something you have to or necessarily want to
> > keep around.
>
> Oh, don't get me wrong - the _only_ reason for "struct aiocb" would be
> backwards compatibility. The point is, we'd need to keep that
> compatibility to be useful - otherwise we just end up having to duplicate
> the work (do _both_ fibrils _and_ the in-kernel AIO).

Bah, I was unclear here, sorry. I was talking about the userspace interface.

Right now, with the aio interface, io_submit passes in an array of
pointers to struct iocb; there's nothing that says the kernel will be
done with the structs when io_submit returns, so while userspace is
free to reuse the array of pointers, it can't free the actual iocbs
until they complete.

This is slightly stupid, for a couple reasons, and if we're making a
new pair of syscalls it'd be better to do it slightly differently.

What you want is for the async_submit syscall (or whatever it's
called) to pass in an array of structs, and for the kernel to not
reference them after async_submit returns. This is easy; after
async_submit returns, each syscall in the array is either completed
(if it could be without blocking), or in progress, and there's no
reason to need the arguments again.

It also means that the kernel has to copy in only a single userspace
buffer, instead of one buffer per syscall; as Joel mentions, there are
plenty of apps that will be doing 1000s of syscalls at once. From a
userspace perspective it's awesome, it simplifies coding for it and
means you have to hit the heap that much less.

Kent Overstreet

unread,

Feb 6, 2007, 8:25:02 PM2/6/07

to Davide Libenzi

> If that's what is wanted, then the async_submit() API can detect the
> syncronous completion soon, and drop a result inside the result-queue
> immediately. It means that an immediately following async_wait() will find
> some completions soon. Or:
>
> struct async_submit {
> void *cookie;
> int sysc_nbr;
> int nargs;
> long args[ASYNC_MAX_ARGS];
> };
> struct async_result {
> void *cookie;
> long result:
> };
>
> int async_submit(struct async_submit *a, struct async_result *r, int n);
>
> Where "r" will store the ones that completed syncronously. I mean, there
> are really many ways to do this.

That interface (modifying async_submit to pass in the size of the
result array) would work great.

Joel Becker

unread,

Feb 6, 2007, 8:31:17 PM2/6/07

to Davide Libenzi

On Tue, Feb 06, 2007 at 05:15:02PM -0800, Davide Libenzi wrote:
> I think ATM the core kernel implementation should be the focus, because

Yeah, I was thinking the same thing. I originally posted just
to make the point :-)

Joel

--

Life's Little Instruction Book #99

"Think big thoughts, but relish small pleasures."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel....@oracle.com
Phone: (650) 506-8127

Michael K. Edwards

unread,

Feb 7, 2007, 1:17:37 AM2/7/07

to Davide Libenzi, Kent Overstreet, Linus Torvalds, Zach Brown, Ingo Molnar, Linux Kernel Mailing List, linu...@kvack.org, Suparna Bhattacharya, Benjamin LaHaise

On 2/6/07, Joel Becker <Joel....@oracle.com> wrote:
> Not everything is in-cache. Databases will be doing O_DIRECT
> and will expect that 90% of their I/O calls will block. Why should they
> have to iterate this list every time? If this is the API, they *have*
> to. If there's an efficient way to get "just the ones that didn't
> block", then it's not a problem.

It's usually efficient, especially in terms of programmer effort, for
the immediate path to resemble as nearly as possible what you would
have done with the synchronous equivalent. (If there's some value in
parallelizing the query across multiple CPUs, you probably don't want
the kernel guessing how to partition it.) But what's efficient for
the delayed path is to be tightly bound to the arrival of the AIO
result, and to do little more than schedule it into the appropriate
event queue or drop it if it is stale. The immediate and delayed
paths will often share part, but not all, of their implementation, and
most of the shared part is probably data structure setup that can
precede the call itself. The rest of the delayed path is where the
design effort should go, because it's the part that has the sort of
complex impact on system performance that is hard for application
programmers to think clearly about.

Oracle isn't the only potential userspace user of massively concurrent
AIO with a significant, but not dominant, fraction of cache hits. I'm
familiar with a similar use case in network monitoring, in which one
would like to implement the attribute tree and query translation more
or less as a userspace filesystem, while leaving both the front-end
caching and the back-end throttling, retries, etc. to in-kernel state
machines. When 90% of the data requested by the front end (say, a
Python+WxWidgets GUI) is available from the VFS cache, only the other
10% should actually carry the AIO overhead.

Let's look at that immediately available fraction from the GUI
programmer's perspective. He wants to look up some attributes from a
whole batch of systems, and wants to present all immediately available
results to the user, with the rest grayed out or something. Each
request for data that is available from cache should result
immediately in a call to his (perhaps bytecode-language) callback,
which fills in a slot in the data structure that he's going to present
wholesale. There's no reason why the immediate version of the
callback should be unable to allocate memory, poke at thread-local
structures, etc.; and in practice there's little to be gained by
parallelizing this fraction (or even aggressively delivering AIOs that
complete quickly) because you'd need to thread-safe that data
structure, which probably isn't worth it in performance and certainly
isn't in programmer effort and likelihood of Heisenbugs.

Delayed results, on the other hand, probably have to use the GUI's
event posting mechanism to queue the delivered data (probably in a
massaged form) into a GUI update thread. Hence the delayed callback
can be delivered in some totally other context if it's VM- and
scheduler-efficient to do so; it's probably just doing a couple of
memcpys and a sem_post or some such. The only reason it isn't a
totally separate chunk of code is that it uses the same context layout
as the immediate path, and may have to poke at some of the same
pre-allocated places to update completion statistics, etc.

(I implemented something similar to this in userspace using Python
generators for the closure-style callbacks, in the course of rewriting
a GUI that had a separate protocol translator process in place of the
userspace filesystem. The thread pool that serviced responses from
the protocol translator operated much like Zach's fibrils, and used a
sort of lookup by request cookie to retrieve the closure and feed it
the result, which had the side effect of posting the appropriate
event. It worked, fast, and it was more or less comprehensible to
later maintainers despite the use of Python's functional features,
because the AIO delivery was kept separate from both the plain-vanilla
immediate-path code and the GUI-idiom event queue processing.)

The broader issue here is that only the application developer really
knows how the AIO results ought to be funneled into the code that
wants them, which could be a database query engine or a GUI update
queue or Murphy knows what. This "application AIO closure" step is
distinct from the application-neutral closure that needs to run in a
kernel "fibril" (extracting stat() results from an NFS response, or
whatever). So it seems to me that applications ought to be able to
specify a userspace closure to be executed on async I/O completion (or
timeout, error, etc.), and this closure should be scheduled
efficiently on completion of the kernel bit.

The delayed path through the userspace closure would partly resemble a
signal handler in that it shouldn't touch thread or heap context, just
poke at pre-allocated process-global memory locations and/or
synchronization primitives. (A closer parallel, for those familiar
with it, would be the "event handlers" of O/S's with cooperative
multitasking and a single foreground application; MacOS 6.x with
MultiFinder and PalmOS 4.x come to mind.)

What if we share a context+stack page between kernel and userspace to
be used by both the kernel "I/O completion" closure and the userspace
"event handler" closure? After all, these are the pieces that
cooperatively multitask with one another. Pop the kernel AIO closure
scheduler into the tasklet queue right after the softirq tasklet --
surely 99% of "fibrils" would become runnable due to something that
happens in a softirq, and it would make at least as much sense to run
there as in the task's schedule() path. The event handler would be
scheduled in most respects like a signal handler in a POSIX threaded
process -- running largely in the context of some application thread
(on syscall exit or by preemption), and limited in the set of APIs it
can call.

In this picture, the ideal peristalsis would usually be ISR exit path
-> softirq -> kernel closure (possibly not thread-like at all, just a
completion scheduled from a tasklet) -> userspace closure ->
application thread. The kernel and userspace closures could actually
share a stack page which also contains the completion context for
both. Linus's async_stat() example is a good one, I think. Here is
somewhat fuller userspace code, without the syntactic sugar that could
easily be used to make the callbacks more closure-ish:

/* linux/aeiou.h */
typedef void (*aeiou_stat_cb_t) (int, struct aeiou_stat *);

struct aeiou_stat __ALIGN_ME_PROPERLY__ {
aeiou_stat_cb_t cb; /* userspace completion hook */
struct stat stat_buf;
union {
int filedes;
char name[NAME_MAX+1];
} u;
#ifdef __KERNEL__
... completion context for the kernel AIO closure ...
#endif
}

/* The returned pointer is the cookie for all */
/* subsequent aeiou calls in this request group. */
void *__aeiou_alloc_aeiou_stat(size_t uctx_bytes);

#define aeiou_begin(ktype, utype, field) \
(utype *)(__aeiou_alloc_##ktype(offsetof(utype, field)))

/* foo.c */
struct one_entry {
... closure context for the userspace event handler ...
struct aeiou_stat s;
}

static void my_cb(int is_delayed, struct aeiou_stat *as) {
struct one_entry *my_context = container_of(as, struct
one_entry, s);
... code that runs in userspace "event handler" context ...
}

..

struct one_entry *entry = aeiou_begin(aeiou_stat, struct one_entry, s);
struct dirent *de;

entry->s.cb = my_cb;
/* set up some process-global data structure to hold */
/* the results of this burst of async_stat calls */

while ((de = readdir(dir)) != NULL) {
strcpy(entry->s.u.name, de->d_name);
/* set up any additional application context */
/* in *entry for this individual async_stat call */

aeiou_stat(entry);
}
/* application tracks outstanding AIOs using data structure */
/* there could also be an aeiou_checkprogress(entry) */
...
aeiou_end(entry);

(The use of "aeiou_stat" rather than a more general class of async I/O
calls is for illustration purposes.)

If the stat data is immediately available when aeiou_stat() is called,
the struct stat gets filled in and the callback is run immediately in
the current stack context. If not, the contents of *entry are copied
to a new page (possibly using COW VM magic), and the syscall returns.
On the next trip through the scheduler (or when a large enough batch
of AIOs have been queued to be worth initiating them at the cost of
shoving the userspace code out of cache), the kernel closures are set
up in the opaque trailer to aeiou_stat in the copies, and the AIOs are
initiated.

The signature of aeiou_stat is deliberately limited to a single
pointer, since all of its arguments are likely to be interesting to
one or both closures. There is no need to pass the offset to the
kernel parameter sub-struct into calls after the initial aeiou_begin;
the kernel has to check the validity of the "entry" pointer/cookie
anyway, so it had best keep track of the enclosing allocation bounds,
offset to the syscall parameter structure, etc. in a place where
userspace can't alter it. Both kernel and userspace closures
eventually run with their stack in the shared page, after the closure
context area. The userspace closure has to respect
signal-handler-like limitations on its powers if is_delayed is true;
it will run in the right process context but has no particular thread
context and can't call anything that could block or allocate memory.

I think this sort of interface might work well for both GUI event
frameworks and real-time streaming media playback/mixing, which are
two common ways for AIO to enter the mere userspace programmer's
sphere of concern (and also happen to be areas where I have some
expertise). Would it work for the Oracle use case?

Cheers,
- Michael

Michael K. Edwards

unread,

Feb 7, 2007, 4:18:21 AM 2/7/07

to Davide Libenzi, Kent Overstreet, Linus Torvalds, Zach Brown, Ingo Molnar, Linux Kernel Mailing List, linu...@kvack.org, Suparna Bhattacharya, Benjamin LaHaise

Man, I should have edited that down before sending it. Hopefully this
is clearer:

- The usual programming model for AIO completion in GUIs, media
engines, and the like is an application callback. Data that is
available immediately may be handled quite differently from data that
arrives after a delay, and usually the only reason for both code paths
to be in the same callback is shared code to maintain counters, etc.
associated with the AIO batch. These shared operations, and the other
things one might want to do in the delayed path, needn't be able to
block or allocate memory.

- AIO requests that are serviced from cache ought to immediately
invoke the callback, in the same thread context as the caller, fixing
up the stack so that the callback returns to the instruction following
the syscall. That way the "immediate completion" path through the
callback can manipulate data structures, allocate memory, etc. just as
if it had followed a synchronous call.

- AIO requests that need data not in cache should probably be
batched in order to avoid evicting the userspace AIO submission loop,
the immediate completion branch of the callback, and their data
structures from cache on every miss. If you can use VM copy-on-write
tricks to punt a page of AIO request parameters and closure context
out to another CPU for immediate processing without stomping on your
local caches, great.

- There's not much point in delivering AIO responses all the way
to userspace until the AIO submission loop is done, because they're
probably going to be handled through some completely different event
queue mechanism in the delayed path through the callback. Trying to
squeeze a few AIO responses into the same data structure as if they
had been in cache is likely to create race conditions or impose
needless locking overhead on the otherwise serialized immediate
completion branch.

- The result of the external AIO may arrive on a different CPU
with something else entirely in the foreground; but in real use cases
it's probably a different thread of the same process. If you can use
the closure context page as the stack page for the kernel bit of the
AIO completion, and then use it again from userspace as the stack page
for the application bit, then the whole ISR -> softirq -> kernel
closure -> application closure path has minimal system impact.

- The delayed path through the application callback can't block
and can't touch data structures that are thread-local or may be in an
incoherent state at this juncture (called during a more or less
arbitrary ISR exit path, a bit like a signal handler). That's OK,
because it's probably just massaging the AIO response into fields of a
preallocated object dangling off of a global data structure and doing
a sem_post or some such. (It might even just drop it if it's stale.)

- As far as I can tell (knowing little about the scheduler per
se), these kernel closures aren't much like Zach's "fibrils"; they'd
be invoked from a tasklet chained more or less immediately after the
softirq dispatch tasklet. I have no idea whether the cost of finding
the appropriate kernel closure(s) associated with the data that
arrived in the course of a softirq, pulling them over to the CPU where
the softirq just ran, and popping out to userspace to run the
application closure is exorbitant, or if it's even possible to force a
process switch from inside a tasklet that way.

Hope this helps, and sorry for the noise,

Michael K. Edwards

unread,

Feb 7, 2007, 4:37:45 AM 2/7/07

to Davide Libenzi, Kent Overstreet, Linus Torvalds, Zach Brown, Ingo Molnar, Linux Kernel Mailing List, linu...@kvack.org, Suparna Bhattacharya, Benjamin LaHaise

An idiot using my keyboard wrote:
> - AIO requests that are serviced from cache ought to immediately
> invoke the callback, in the same thread context as the caller, fixing
> up the stack so that the callback returns to the instruction following
> the syscall. That way the "immediate completion" path through the
> callback can manipulate data structures, allocate memory, etc. just as
> if it had followed a synchronous call.

Or, of course:
if (async_stat(entry) == 0) {
... immediate completion code path ...
}

Ugh. But I think the discussion about the delayed path still holds.

[PATCH 1 of 4] Introduce per_call_chain()

Zach Brown

Zach Brown

Zach Brown

Ingo Molnar

Ingo Molnar

Christoph Hellwig

Ingo Molnar

Mark Lord

Ingo Molnar

Linus Torvalds

Zach Brown

Benjamin LaHaise

Zach Brown

Ingo Molnar

Andi Kleen

Andi Kleen

Linus Torvalds

Alan

Linus Torvalds

Davide Libenzi

Linus Torvalds

Alan

Linus Torvalds

Ingo Molnar

Alan

Ingo Molnar

Linus Torvalds

Linus Torvalds

Linus Torvalds

Davide Libenzi

Alan

Davide Libenzi

Ingo Molnar

bert hubert

Linus Torvalds

Suparna Bhattacharya

Ingo Molnar

Matt Mackall

Ingo Molnar

li...@horizon.com

Davide Libenzi

Zach Brown

Zach Brown

Zach Brown

Zach Brown

Zach Brown

Davide Libenzi

Davide Libenzi

Zach Brown

Davide Libenzi

Davide Libenzi

Zach Brown

Davide Libenzi

Zach Brown

Linus Torvalds

Linus Torvalds

Davide Libenzi

Zach Brown

Kent Overstreet

bert hubert

David Miller

Linus Torvalds

bert hubert

Zach Brown

Davide Libenzi

Davide Libenzi

Scot McKinley

Davide Libenzi

David Miller

Joel Becker

Al Boldi

Davide Libenzi

Linus Torvalds

David Miller

Linus Torvalds

David Miller

Eric Dumazet

Linus Torvalds

Zach Brown