[patch 2/5] signalfd v2 - signalfd core ...

Davide Libenzi

unread,

Mar 7, 2007, 8:22:25 PM3/7/07

to Linux Kernel Mailing List

This patch series implements the new signalfd() and signalfd_dequeue()
system calls. I took part of the original Linus code (and you know how
badly it can be broken :), and I added even more breakage ;)
The patch had to be almost completely changed. This patch allows multiple
signalfd to listen for signals on the same sighand, w/out raing with
dequeue_signal. Plus other changes that I don't remember (see here for the
original patch http://tinyurl.com/3yuna5 ).
This seems to be working fine on my Dual Opteron machine. I made a quick
test program for it:

http://www.xmailserver.org/signafd-test.c

The signalfd() system call implements signal delivery into a file
descriptor receiver. The signalfd file descriptor if created with the
following API:

int signalfd(int ufd, const sigset_t *mask, size_t masksize);

The "ufd" parameter allows to change an existing signalfd sigmask, w/out
going to close/create cycle (Linus idea). Use "ufd" == -1 if you want a
brand new signalfd file.
The "mask" allows to specify the signal mask of signals that we are
interested in. The "masksize" parameter is the size of "mask".
Note that signalfd delivery and standard signal delivery can go in
parallel. So you can receive signals on the signalfd file, and on the
signal handlers. This makes the system more flexible IMO. If you don't
want to see standard delivery, just pass the same "mask" to
sigprocmask(SIG_BLOCK).
The signalfd fd supports the poll(2) system call. The poll(2) will return
POLLIN when signals are available to be dequeued. As a direct consequence
of supporting the Linux poll subsystem, the signalfd fd can use used
together with epoll(2) too.
A new system call has been also introduced to allow signal dequeueing:

int signalfd_dequeue(int fd, siginfo_t *info, long timeo);

The "fd" parameter must ba a signalfd file descriptor. The "info" parameter
is a pointer to the siginfo that will receive the dequeued signal, and
"timeo" is a timeout in milliseconds, or -1 for infinite.
The signalfd_dequeue function returns 0 if successfull.

Signed-off-by: Davide Libenzi <dav...@xmailserver.org>

- Davide

Index: linux-2.6.20.ep2/fs/signalfd.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20.ep2/fs/signalfd.c 2007-03-07 17:06:07.000000000 -0800
@@ -0,0 +1,369 @@
+/*
+ * fs/signalfd.c
+ *
+ * Copyright (C) 2003 Linus Torvalds
+ *
+ * Mon Mar 5, 2007: Davide Libenzi <dav...@xmailserver.org>
+ * Changed signal delivery and de-queueing.
+ * Now using anonymous inode source.
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+#include <linux/signalfd.h>
+
+#include <asm/uaccess.h>
+
+
+
+#define MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
+
+
+
+struct signalfd_ctx {
+ struct list_head lnk;
+ wait_queue_head_t wqh;
+ sigset_t sigmask;
+ sigset_t pending;
+ struct list_head squeue[_NSIG];
+ long lost_sigs;
+ struct task_struct *tsk;
+ struct sighand_struct *sighand;
+};
+
+struct signalfd_sq {
+ struct list_head lnk;
+ siginfo_t info;
+};
+
+
+
+static void signalfd_cleanup(struct signalfd_ctx *ctx);
+static int signalfd_close(struct inode *inode, struct file *file);
+static unsigned int signalfd_poll(struct file *filp, poll_table *wait);
+static struct signalfd_sq *signalfd_fetchsig(struct signalfd_ctx *ctx);
+
+
+
+static const struct file_operations signalfd_fops = {
+ .release = signalfd_close,
+ .poll = signalfd_poll,
+};
+static struct kmem_cache *signalfd_ctx_cachep;
+static struct kmem_cache *signalfd_sq_cachep;
+
+
+/*
+ * This must be called with the sighand lock held.
+ */
+int signalfd_deliver(struct sighand_struct *sighand, int sig, struct siginfo *info)
+{
+ int nsig = 0;
+ struct list_head *pos;
+ struct signalfd_ctx *ctx;
+ struct signalfd_sq *sq;
+
+ list_for_each(pos, &sighand->sfdlist) {
+ ctx = list_entry(pos, struct signalfd_ctx, lnk);
+ /*
+ * We use a negative signal value as a way to broadcast that the
+ * sighand has been orphaned, so that we can notify all the
+ * listeners about this.
+ */
+ if (sig < 0)
+ __wake_up_locked(&ctx->wqh, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
+ else if (sigismember(&ctx->sigmask, sig) &&
+ (sig >= SIGRTMIN || !sigismember(&ctx->pending, sig))) {
+ sigaddset(&ctx->pending, sig);
+ sq = kmem_cache_alloc(signalfd_sq_cachep, GFP_ATOMIC);
+ if (sq) {
+ signal_fill_info(&sq->info, sig, info);
+ list_add_tail(&sq->lnk, &ctx->squeue[sig - 1]);
+ } else
+ ctx->lost_sigs++;
+ __wake_up_locked(&ctx->wqh, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
+ nsig++;
+ }
+ }
+
+ return nsig;
+}
+
+
+/*
+ * Create a file descriptor that is associated with our signal
+ * state. We can pass it around to others if we want to, but
+ * it will always be _our_ signal state.
+ */
+asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
+{
+ int i, error;
+ sigset_t sigmask;
+ struct signalfd_ctx *ctx;
+ struct file *file;
+ struct inode *inode;
+
+ error = -EINVAL;
+ if (sizemask != sizeof(sigset_t) ||
+ copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
+ goto err_exit;
+ sigdelsetmask(&sigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ if (ufd == -1) {
+ error = -ENOMEM;
+ ctx = kmem_cache_alloc(signalfd_ctx_cachep, GFP_KERNEL);
+ if (!ctx)
+ goto err_exit;
+
+ init_waitqueue_head(&ctx->wqh);
+ for (i = 0; i < _NSIG; i++)
+ INIT_LIST_HEAD(&ctx->squeue[i]);
+ ctx->lost_sigs = 0;
+ memset(&ctx->pending, 0, sizeof(ctx->pending));
+ ctx->sigmask = sigmask;
+ ctx->tsk = current;
+ get_task_struct(current);
+
+ /*
+ * We also increment the sighand count to make sure
+ * it doesn't go away from us in poll() when the task
+ * exits (which can happen if the fd is passed to
+ * another process with unix domain sockets.
+ *
+ * This also guarantees that an execve() will reallocate
+ * the signal state, and thus avoids security concerns
+ * with a untrusted process that passes off the signal
+ * queue fd to another, and then does a suid execve.
+ */
+ ctx->sighand = current->sighand;
+ atomic_inc(&ctx->sighand->count);
+
+ /*
+ * Add this fd to the list of signal listeners.
+ */
+ spin_lock_irq(&ctx->sighand->siglock);
+ list_add_tail(&ctx->lnk, &ctx->sighand->sfdlist);
+ spin_unlock_irq(&ctx->sighand->siglock);
+
+ /*
+ * When we call this, the initialization must be complete, since
+ * aino_getfd() will install the fd.
+ */
+ error = aino_getfd(&ufd, &inode, &file, "[signalfd]",
+ &signalfd_fops, ctx);
+ if (error)
+ goto err_fdalloc;
+ } else {
+ error = -EBADF;
+ file = fget(ufd);
+ if (!file)
+ goto err_exit;
+ ctx = file->private_data;
+ error = -EINVAL;
+ if (file->f_op != &signalfd_fops) {
+ fput(file);
+ goto err_exit;
+ }
+ spin_lock_irq(&ctx->sighand->siglock);
+ ctx->sigmask = sigmask;
+ spin_unlock_irq(&ctx->sighand->siglock);
+ wake_up(&ctx->wqh);
+ fput(file);
+ }
+
+ return ufd;
+
+err_fdalloc:
+ signalfd_cleanup(ctx);
+err_exit:
+ return error;
+}
+
+
+static void signalfd_cleanup(struct signalfd_ctx *ctx)
+{
+ int i;
+ struct list_head *head;
+ struct signalfd_sq *sq;
+
+ spin_lock_irq(&ctx->sighand->siglock);
+ list_del(&ctx->lnk);
+ spin_unlock_irq(&ctx->sighand->siglock);
+ __cleanup_sighand(ctx->sighand);
+ put_task_struct(ctx->tsk);
+
+ for (i = 0; i < _NSIG; i++) {
+ head = &ctx->squeue[i];
+ while (!list_empty(head)) {
+ sq = list_entry(head->next, struct signalfd_sq, lnk);
+ list_del(&sq->lnk);
+ kmem_cache_free(signalfd_sq_cachep, sq);
+ }
+ }
+
+ kmem_cache_free(signalfd_ctx_cachep, ctx);
+}
+
+
+static int signalfd_close(struct inode *inode, struct file *file)
+{
+ signalfd_cleanup(file->private_data);
+ return 0;
+}
+
+
+static unsigned int signalfd_poll(struct file *filp, poll_table *wait)
+{
+ struct signalfd_ctx *ctx = filp->private_data;
+ int i;
+ unsigned int events = 0;
+ unsigned long const *pending, *mask;
+
+ poll_wait(filp, &ctx->wqh, wait);
+
+ if (ctx->lost_sigs)
+ events |= POLLERR;
+ /*
+ * Let the caller get a POLLIN in this case, ala socket recv() when
+ * the peer disconnect.
+ */
+ if (ctx->sighand != ctx->tsk->sighand)
+ events |= POLLIN;
+
+ pending = ctx->pending.sig;
+ mask = ctx->sigmask.sig;
+ for (i = 0; i < _NSIG_WORDS; i++, pending++, mask++)
+ if (*pending & *mask)
+ break;
+
+ return i == _NSIG_WORDS ? events: events | POLLIN;
+}
+
+
+static struct signalfd_sq *signalfd_fetchsig(struct signalfd_ctx *ctx)
+{
+ int i, bsig, sig;
+ unsigned long isig;
+ unsigned long *pending, *mask;
+ struct signalfd_sq *sq;
+
+ pending = ctx->pending.sig;
+ mask = ctx->sigmask.sig;
+ for (i = 0; i < _NSIG_WORDS; i++, pending++, mask++) {
+ if ((isig = *pending & *mask) != 0) {
+ bsig = ffz(~isig);
+ sig = bsig + i * _NSIG_BPW;
+ BUG_ON(list_empty(&ctx->squeue[sig]));
+ sq = list_entry(ctx->squeue[sig].next, struct signalfd_sq, lnk);
+ list_del(&sq->lnk);
+ if (list_empty(&ctx->squeue[sig]))
+ *pending &= ~(1UL << bsig);
+
+ return sq;
+ }
+ }
+ return NULL;
+}
+
+
+asmlinkage long sys_signalfd_dequeue(int fd, siginfo_t __user *info, long timeo)
+{
+ int res;
+ long jtimeo;
+ struct file *file;
+ struct signalfd_ctx *ctx;
+ struct sighand_struct *sighand;
+ struct signalfd_sq *sq;
+ DECLARE_WAITQUEUE(wait, current);
+
+ res = -EBADF;
+ file = fget(fd);
+ if (!file)
+ goto err_exit;
+ res = -EINVAL;
+ if (file->f_op != &signalfd_fops)
+ goto err_putfd;
+ ctx = file->private_data;
+ sighand = ctx->sighand;
+
+ jtimeo = (timeo < 0 || timeo >= MAX_MSTIMEO) ?
+ MAX_SCHEDULE_TIMEOUT: (timeo * HZ + 999) / 1000;
+
+ spin_lock_irq(&sighand->siglock);
+ res = -EAGAIN;
+ if ((sq = signalfd_fetchsig(ctx)) == NULL && jtimeo) {
+ __add_wait_queue(&ctx->wqh, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if ((sq = signalfd_fetchsig(ctx)) != NULL)
+ break;
+ if (sighand != ctx->tsk->sighand) {
+ /*
+ * Let the caller read zero byte, ala socket recv()
+ * when the peer disconnect.
+ */
+ res = -ENOTCONN;
+ break;
+ }
+ if (signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
+ if (!jtimeo)
+ break;
+ spin_unlock_irq(&sighand->siglock);
+ jtimeo = schedule_timeout(jtimeo);
+ spin_lock_irq(&sighand->siglock);
+ }
+ __remove_wait_queue(&ctx->wqh, &wait);
+ set_current_state(TASK_RUNNING);
+ }
+ spin_unlock_irq(&sighand->siglock);
+ if (likely(sq)) {
+ res = copy_siginfo_to_user(info, &sq->info);
+ if (unlikely(res)) {
+ spin_lock_irq(&sighand->siglock);
+ sigaddset(&ctx->pending, sq->info.si_signo);
+ list_add(&sq->lnk, &ctx->squeue[sq->info.si_signo - 1]);
+ spin_unlock_irq(&sighand->siglock);
+ } else
+ kmem_cache_free(signalfd_sq_cachep, sq);
+ }
+err_putfd:
+ fput(file);
+err_exit:
+ return res;
+}
+
+
+static int __init signalfd_init(void)
+{
+ signalfd_ctx_cachep = kmem_cache_create("signalfd_ctx_cache",
+ sizeof(struct signalfd_ctx),
+ 0, SLAB_PANIC, NULL, NULL);
+ signalfd_sq_cachep = kmem_cache_create("signalfd_sq_cache",
+ sizeof(struct signalfd_sq),
+ 0, SLAB_PANIC, NULL, NULL);
+ return 0;
+}
+
+
+static void __exit signalfd_exit(void)
+{
+ kmem_cache_destroy(signalfd_ctx_cachep);
+ kmem_cache_destroy(signalfd_sq_cachep);
+}
+
+module_init(signalfd_init);
+module_exit(signalfd_exit);
+
+MODULE_LICENSE("GPL");
Index: linux-2.6.20.ep2/include/linux/init_task.h
===================================================================
--- linux-2.6.20.ep2.orig/include/linux/init_task.h 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/include/linux/init_task.h 2007-03-07 15:59:01.000000000 -0800
@@ -84,6 +84,7 @@
.count = ATOMIC_INIT(1), \
.action = { { { .sa_handler = NULL, } }, }, \
.siglock = __SPIN_LOCK_UNLOCKED(sighand.siglock), \
+ .sfdlist = LIST_HEAD_INIT(sighand.sfdlist), \
}

extern struct group_info init_groups;
Index: linux-2.6.20.ep2/include/linux/sched.h
===================================================================
--- linux-2.6.20.ep2.orig/include/linux/sched.h 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/include/linux/sched.h 2007-03-07 15:59:01.000000000 -0800
@@ -379,6 +379,7 @@
atomic_t count;
struct k_sigaction action[_NSIG];
spinlock_t siglock;
+ struct list_head sfdlist;
};

struct pacct_struct {
Index: linux-2.6.20.ep2/kernel/fork.c
===================================================================
--- linux-2.6.20.ep2.orig/kernel/fork.c 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/kernel/fork.c 2007-03-07 15:59:01.000000000 -0800
@@ -1422,8 +1422,10 @@
struct sighand_struct *sighand = data;

if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR)
+ SLAB_CTOR_CONSTRUCTOR) {
spin_lock_init(&sighand->siglock);
+ INIT_LIST_HEAD(&sighand->sfdlist);
+ }
}

void __init proc_caches_init(void)
Index: linux-2.6.20.ep2/include/linux/syscalls.h
===================================================================
--- linux-2.6.20.ep2.orig/include/linux/syscalls.h 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/include/linux/syscalls.h 2007-03-07 15:59:01.000000000 -0800
@@ -602,6 +602,8 @@
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
size_t len);
asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
+asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
+asmlinkage long sys_signalfd_dequeue(int fd, siginfo_t __user *info, long timeo);

int kernel_execve(const char *filename, char *const argv[], char *const envp[]);

Index: linux-2.6.20.ep2/fs/Makefile
===================================================================
--- linux-2.6.20.ep2.orig/fs/Makefile 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/fs/Makefile 2007-03-07 15:59:01.000000000 -0800
@@ -11,7 +11,7 @@
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
- stack.o anon_inodes.o
+ stack.o anon_inodes.o signalfd.o

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
Index: linux-2.6.20.ep2/include/linux/signalfd.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.20.ep2/include/linux/signalfd.h 2007-03-07 15:59:01.000000000 -0800
@@ -0,0 +1,27 @@
+/*
+ * include/linux/signalfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <dav...@xmailserver.org>
+ *
+ */
+
+#ifndef _LINUX_SIGNALFD_H
+#define _LINUX_SIGNALFD_H
+
+
+int signalfd_deliver(struct sighand_struct *sighand, int sig, struct siginfo *info);
+
+/*
+ * No need to fall inside signalfd_deliver() and get/release a spinlock,
+ * if no signal listeners are available.
+ */
+static inline int signalfd_notify(struct sighand_struct *sighand, int sig,
+ struct siginfo *info)
+{
+ if (likely(list_empty(&sighand->sfdlist)))
+ return 0;
+ return signalfd_deliver(sighand, sig, info);
+}
+
+#endif /* _LINUX_SIGNALFD_H */
+
Index: linux-2.6.20.ep2/kernel/signal.c
===================================================================
--- linux-2.6.20.ep2.orig/kernel/signal.c 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/kernel/signal.c 2007-03-07 15:59:01.000000000 -0800
@@ -26,6 +26,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
+#include <linux/signalfd.h>

#include <asm/param.h>
#include <asm/uaccess.h>
@@ -709,6 +710,28 @@
}
}

+void signal_fill_info(struct siginfo *dinfo, int sig, struct siginfo *sinfo)
+{
+ switch ((unsigned long) sinfo) {
+ case (unsigned long) SEND_SIG_NOINFO:
+ dinfo->si_signo = sig;
+ dinfo->si_errno = 0;
+ dinfo->si_code = SI_USER;
+ dinfo->si_pid = current->pid;
+ dinfo->si_uid = current->uid;
+ break;
+ case (unsigned long) SEND_SIG_PRIV:
+ dinfo->si_signo = sig;
+ dinfo->si_errno = 0;
+ dinfo->si_code = SI_KERNEL;
+ dinfo->si_pid = 0;
+ dinfo->si_uid = 0;
+ break;
+ default:
+ copy_siginfo(dinfo, sinfo);
+ }
+}
+
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigpending *signals)
{
@@ -735,25 +758,7 @@
info->si_code >= 0)));
if (q) {
list_add_tail(&q->list, &signals->list);
- switch ((unsigned long) info) {
- case (unsigned long) SEND_SIG_NOINFO:
- q->info.si_signo = sig;
- q->info.si_errno = 0;
- q->info.si_code = SI_USER;
- q->info.si_pid = current->pid;
- q->info.si_uid = current->uid;
- break;
- case (unsigned long) SEND_SIG_PRIV:
- q->info.si_signo = sig;
- q->info.si_errno = 0;
- q->info.si_code = SI_KERNEL;
- q->info.si_pid = 0;
- q->info.si_uid = 0;
- break;
- default:
- copy_siginfo(&q->info, info);
- break;
- }
+ signal_fill_info(&q->info, sig, info);
} else if (!is_si_special(info)) {
if (sig >= SIGRTMIN && info->si_code != SI_USER)
/*
@@ -780,6 +785,11 @@
BUG_ON(!irqs_disabled());
assert_spin_locked(&t->sighand->siglock);

+ /*
+ * Deliver the signal to listening signalfds ...
+ */
+ signalfd_notify(t->sighand, sig, info);
+
/* Short-circuit ignored signals. */
if (sig_ignored(t, sig))
goto out;
@@ -968,6 +978,11 @@
assert_spin_locked(&p->sighand->siglock);
handle_stop_signal(sig, p);

+ /*
+ * Deliver the signal to listening signalfds ...
+ */
+ signalfd_notify(p->sighand, sig, info);
+
/* Short-circuit ignored signals. */
if (sig_ignored(p, sig))
return ret;
Index: linux-2.6.20.ep2/fs/exec.c
===================================================================
--- linux-2.6.20.ep2.orig/fs/exec.c 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/fs/exec.c 2007-03-07 15:59:01.000000000 -0800
@@ -50,6 +50,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
+#include <linux/signalfd.h>

#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -755,11 +756,18 @@
recalc_sigpending();

spin_unlock(&newsighand->siglock);
+
+ /*
+ * Tell all the sighand listeners that this sighand has
+ * been detached. Needs to be called with the sighand lock
+ * held.
+ */
+ signalfd_notify(oldsighand, -1, NULL);
+
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);

- if (atomic_dec_and_test(&oldsighand->count))
- kmem_cache_free(sighand_cachep, oldsighand);
+ __cleanup_sighand(oldsighand);
}

BUG_ON(!thread_group_leader(tsk));
Index: linux-2.6.20.ep2/kernel/exit.c
===================================================================
--- linux-2.6.20.ep2.orig/kernel/exit.c 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/kernel/exit.c 2007-03-07 15:59:01.000000000 -0800
@@ -42,6 +42,7 @@
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
+#include <linux/signalfd.h>

#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -120,6 +121,10 @@

tsk->signal = NULL;
tsk->sighand = NULL;
+ /*
+ * Notify that this sighand has been detached.
+ */
+ signalfd_notify(sighand, -1, NULL);
spin_unlock(&sighand->siglock);
rcu_read_unlock();

Index: linux-2.6.20.ep2/include/linux/signal.h
===================================================================
--- linux-2.6.20.ep2.orig/include/linux/signal.h 2007-03-07 15:55:43.000000000 -0800
+++ linux-2.6.20.ep2/include/linux/signal.h 2007-03-07 15:59:01.000000000 -0800
@@ -233,6 +233,7 @@
return sig <= _NSIG ? 1 : 0;
}

+extern void signal_fill_info(struct siginfo *dinfo, int sig, struct siginfo *sinfo);
extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
extern long do_sigpending(void __user *, unsigned long);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

David M. Lloyd

unread,

Mar 8, 2007, 9:32:14 AM3/8/07

to Davide Libenzi

On Wed, 2007-03-07 at 17:21 -0800, Davide Libenzi wrote:
> int signalfd_dequeue(int fd, siginfo_t *info, long timeo);
>
> The "fd" parameter must ba a signalfd file descriptor. The "info" parameter
> is a pointer to the siginfo that will receive the dequeued signal, and
> "timeo" is a timeout in milliseconds, or -1 for infinite.
> The signalfd_dequeue function returns 0 if successfull.

Does this support non-blocking mode? It doesn't seem to at my level of
understanding anyway. If I use this with EPOLLET for example, I'd
expect to get a single EPOLLIN when a signal arrives, which would
indicate to me that I must call signalfd_dequeue() in a loop until I get
EAGAIN in order to be sure I've consumed all the outstanding signals so
that the edge-triggered notification can be "re-armed".

Make sense?

- DML

Davide Libenzi

unread,

Mar 8, 2007, 10:46:10 AM3/8/07

to David M. Lloyd

On Thu, 8 Mar 2007, David M. Lloyd wrote:

> On Wed, 2007-03-07 at 17:21 -0800, Davide Libenzi wrote:
> > int signalfd_dequeue(int fd, siginfo_t *info, long timeo);
> >
> > The "fd" parameter must ba a signalfd file descriptor. The "info" parameter
> > is a pointer to the siginfo that will receive the dequeued signal, and
> > "timeo" is a timeout in milliseconds, or -1 for infinite.
> > The signalfd_dequeue function returns 0 if successfull.
>
> Does this support non-blocking mode? It doesn't seem to at my level of
> understanding anyway. If I use this with EPOLLET for example, I'd
> expect to get a single EPOLLIN when a signal arrives, which would
> indicate to me that I must call signalfd_dequeue() in a loop until I get
> EAGAIN in order to be sure I've consumed all the outstanding signals so
> that the edge-triggered notification can be "re-armed".

This patch, if you get a POLLIN, you have a signal to read for sure (well,
unless you another thread/task reads it before you - but that's just
somthing you have to take care). There is not explicit check for
O_NONBLOCK now, but a zero timeout would do exactly the same thing.

- Davide

Linus Torvalds

unread,

Mar 8, 2007, 11:24:03 AM3/8/07

to Davide Libenzi

On Thu, 8 Mar 2007, Davide Libenzi wrote:
>
> This patch, if you get a POLLIN, you have a signal to read for sure (well,
> unless you another thread/task reads it before you - but that's just
> somthing you have to take care). There is not explicit check for
> O_NONBLOCK now, but a zero timeout would do exactly the same thing.

You missed David's worry, I think.

Not only is POLLIN potentially an edge event (depending on the interface
you use to fetch it), but even as a level-triggered one you generally want
to read as much as possible per POLLIN event, and go back to the event
loop only when you get EAGAIN.

So that's in addition to the read/signal race with other
threads/processes.

You solved it by having a separate system call, but since it's a regular
file descriptor, why have a new system call at all, and not just make it
be a "read()"? In which case you definitely want O_NONBLOCK support.

The UNIX philosophy is often quoted as "everything is a file", but that
really means "everything is a stream of bytes".

In Windows, you have 15 different versions of "read()" with sockets and
files and pipes all having strange special cases and special system calls.
That's not the UNIX way. It should be just a "read()", and then people can
use general libraries and treat all sources the same.

For example, the main select/poll/epoll loop may be the one doing all the
reading, and then pass off "full buffers only" to the individual per-fd
"action routines". And that kind of model really very fundamentally wants
an fd to be an fd to be an fd - not "some file descriptors need
'read_from_sigfd()', and some file descriptors need 'read()', and some
file descriptors need 'recvmsg()'" etc.

So I think you should get rid of signalfd_dequeue(), and just replace it
with a "read()" function.

Linus

Davide Libenzi

unread,

Mar 8, 2007, 11:30:06 AM3/8/07

to Linus Torvalds

The reason for the special function, was not to provide a non-blocking
behaviour with zero timeout (that just a side effect), but to read the
siginfo. I was all about using read(2) (and v1 used it), but when you have
to transfer complex structures over it, it becomes hell. How do you
cleanly compat over a f_op->read callback for example?

- Davide

Michael K. Edwards

unread,

Mar 8, 2007, 11:41:04 AM3/8/07

to Davide Libenzi

On 3/8/07, Davide Libenzi <dav...@xmailserver.org> wrote:
> The reason for the special function, was not to provide a non-blocking
> behaviour with zero timeout (that just a side effect), but to read the
> siginfo. I was all about using read(2) (and v1 used it), but when you have
> to transfer complex structures over it, it becomes hell. How do you
> cleanly compat over a f_op->read callback for example?

Make it a netlink socket and fetch your structures using recvmsg().
siginfo_t belongs in ancillary data.

The UNIX philosophy is "everything's a file". The Berkeley philosophy
is "everything's a socket, except for files, which are feeble
mini-sockets". I'd go with the Berkeley crowd here.

Cheers,
- Michael

Linus Torvalds

unread,

Mar 8, 2007, 12:16:49 PM3/8/07

to Davide Libenzi

On Thu, 8 Mar 2007, Davide Libenzi wrote:
>
> The reason for the special function, was not to provide a non-blocking
> behaviour with zero timeout (that just a side effect), but to read the
> siginfo. I was all about using read(2) (and v1 used it), but when you have
> to transfer complex structures over it, it becomes hell. How do you
> cleanly compat over a f_op->read callback for example?

I agree that it gets a bit "interesting", and one option might be that the
"read()" interface just gets the signal number and the minimal siginfo
information, which is, after all, what 99% of all apps actually care
about.

But "siginfo_t" is really a *horrible* structure. Nobody sane should ever
use siginfo_t, and the designer of that thing was so high on LSD that it's
not even funny. Re-using fields in a union? Values that depend on other
bits in the thing in random manners?

In other words, I bet that we could just make it a *lot* better by making
the read structure be:

- 16 4-byte fields (fixed 64-byte packet), where each field is an
uint32_t (we could even do it in network byte order if we care and if
you want to just pipe the information from one machine to another, but
that sounds a bit excessive ;)

- Just put the fields people actually use at fixed offsets: si_signo,
si_errno, si_pid, si_uid, si_band, si_fd.

- that still leaves room for the other cases if anybody ever wants them
(but I doubt it - things like si_addr are really only useful for
synchronous signals that are actually done as *signals*, since you
cannot defer a SIGBUS/SIGSEGV/SIGILL *anyway*).

So I bet 99% of users actually just want si_signo, while some small subset
might want the SIGCHLD info and some of the special cases (eg we migth
want to add si_addr as a 64-bit thing just because the USB stack sends a
SI_ASYNCIO thing for completed URB's, so a libusb might want it, but
that's probably the only such user).

And it would be *cleaner* than the mess that is siginfo_t..

(I realize that siginfo_t is ugly because it built up over time, using the
same buffer for many different things. I'm just saying that we can
probably do better by *not* using it, and just laying things out in a
cleaner manner to begin with, which also solves any compatibility issues)

Linus

Linus Torvalds

unread,

Mar 8, 2007, 12:29:00 PM3/8/07

to Michael K. Edwards

On Thu, 8 Mar 2007, Michael K. Edwards wrote:
>
> Make it a netlink socket and fetch your structures using recvmsg().
> siginfo_t belongs in ancillary data.

Gaah. That interface is horrible.

> The UNIX philosophy is "everything's a file". The Berkeley philosophy
> is "everything's a socket, except for files, which are feeble
> mini-sockets". I'd go with the Berkeley crowd here.

No, the berkeley crowd is totally out to lunch.

I might agree with you *if* you could actually do "recvmsg()" on arbitrary
file descriptors, but you cannot.

We could fix that in Linux, of course, but the fact is, "recvmsg()" is
*not* a superset of "read()". In general, it's a *subset*, exactly because
very few file descriptors support it.

So the normal way to read from a file descriptor (and the *only* way in
any generic select loop) is to use "read()". That's the only thing that
works for everything. And we shouldn't break that.

The sad part is that there really is no reason why the BSD crowd couldn't
have done recvmsg() as an "extended read with per-system call flags",
which would have made things like O_NONBLOCK etc unnecessary, because you
could do it just with MSG_DONTWAIT..

So anybody who would "go with the Berkeley crowd" really shows a lot of
bad taste, I'm afraid. The Berkeley crowd really messed up here, and it's
so long ago that I don't think there is any point in us trying to fix it
any more.

(But if somebody makes recvmgs a general VFS interface and makes it just
work for everything, I'd probably take the patch as a cleanup. I really
think it should have been a "struct file_operations" thing rather than
being a socket-only thing.. But since you couldn't portably use it
anyway, the thing is pretty moot)

Linus

Davide Libenzi

unread,

Mar 8, 2007, 2:22:47 PM3/8/07

to Linus Torvalds

I can do that, no problem. But isn't it better to recognize that this kind
of data just can't be shipped through a non compat-able function?
Like, for example, the current trend to say "just use u64 for a pointer,
it'll be fine". I remeber, a long time ago when the i386 architecture came
out, to say "Wow! 4GB is gonna last *forever*!", let's use u32 for
pointers. Well, forever is almost here in my watches. And all the
userspace code using APIs assuming to cleanly store a pointer in a u32
will have to be re-factored.
So, to cut it short, I can do the pseudo-siginfo read(2), but I don't
like it too much (little, actually). The siginfo, as bad as it is, is a
standard used in many POSIX APIs (hence even in kernel), and IMO if we
want to send that back, a struct siginfo should be.
No?

- Davide

Linus Torvalds

unread,

Mar 8, 2007, 2:27:57 PM3/8/07

to Davide Libenzi

I think it's perfectly fine if you make it "struct siginfo" (even though I
think it's a singularly ugly struct). It's just that then you'd have to
make your read() know whether it's a compat-read or not, which you really
can't.

Which is why you introduced a new system call, but that leads to all the
problems with the file descriptor no longer being *usable*.

Think scripts. It's easy to do reads in perl scripts, and parse the
output. In contrast, making perl use a new system call is quite
challenging.

And *that* is why "everything is a stream of bytes" is so important. You
don't know where the file descriptor has been, or who uses it. Special
system calls for special file descriptors are just *wrong*.

After all, that's why we'd have a signalfd() in the first place: exactly
so that you do *not* have to use special system calls, but can just pass
it on to any event waiting mechanism like select, poll, epoll. The same is
just *even*more*true* when it comes to reading the data!

Linus

Davide Libenzi

unread,

Mar 8, 2007, 2:34:18 PM3/8/07

to Linus Torvalds

On Thu, 8 Mar 2007, Linus Torvalds wrote:

>
>
> On Thu, 8 Mar 2007, Davide Libenzi wrote:
> >
> > So, to cut it short, I can do the pseudo-siginfo read(2), but I don't
> > like it too much (little, actually). The siginfo, as bad as it is, is a
> > standard used in many POSIX APIs (hence even in kernel), and IMO if we
> > want to send that back, a struct siginfo should be.
> > No?
>
> I think it's perfectly fine if you make it "struct siginfo" (even though I
> think it's a singularly ugly struct). It's just that then you'd have to
> make your read() know whether it's a compat-read or not, which you really
> can't.
>
> Which is why you introduced a new system call, but that leads to all the
> problems with the file descriptor no longer being *usable*.
>
> Think scripts. It's easy to do reads in perl scripts, and parse the
> output. In contrast, making perl use a new system call is quite
> challenging.
>
> And *that* is why "everything is a stream of bytes" is so important. You
> don't know where the file descriptor has been, or who uses it. Special
> system calls for special file descriptors are just *wrong*.
>
> After all, that's why we'd have a signalfd() in the first place: exactly
> so that you do *not* have to use special system calls, but can just pass
> it on to any event waiting mechanism like select, poll, epoll. The same is
> just *even*more*true* when it comes to reading the data!

"Cheeseburger it is!" ;)
I'll revert back to read(2) with pseudo-siginfo and O_NONBLOCK handling...

- Davide

Avi Kivity

unread,

Mar 8, 2007, 2:35:08 PM3/8/07

to Davide Libenzi

Davide Libenzi wrote:
> So, to cut it short, I can do the pseudo-siginfo read(2), but I don't
> like it too much (little, actually). The siginfo, as bad as it is, is a
> standard used in many POSIX APIs (hence even in kernel), and IMO if we
> want to send that back, a struct siginfo should be.
>

You can send the data in the 32/64 neutral format, and have glibc
convert it to a siginfo, and everybody's happy.

--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.

Davide Libenzi

unread,

Mar 8, 2007, 2:40:48 PM3/8/07

to Avi Kivity

On Thu, 8 Mar 2007, Avi Kivity wrote:

> Davide Libenzi wrote:
> > So, to cut it short, I can do the pseudo-siginfo read(2), but I don't like
> > it too much (little, actually). The siginfo, as bad as it is, is a standard
> > used in many POSIX APIs (hence even in kernel), and IMO if we want to send
> > that back, a struct siginfo should be.
> >
>
> You can send the data in the 32/64 neutral format, and have glibc convert it
> to a siginfo, and everybody's happy.

Except Uli :)
The problem with the siginfo is that it is a brain-dead structure with
unions and data dependent on types (and pointers). Pretty fugly.

- Davide

Oleg Nesterov

unread,

Mar 8, 2007, 3:44:22 PM3/8/07

to Davide Libenzi

Davide Libenzi wrote:
>
> +int signalfd_deliver(struct sighand_struct *sighand, int sig, struct siginfo *info)
> +{
> + int nsig = 0;
> + struct list_head *pos;
> + struct signalfd_ctx *ctx;
> + struct signalfd_sq *sq;
> +
> + list_for_each(pos, &sighand->sfdlist) {
> + ctx = list_entry(pos, struct signalfd_ctx, lnk);
> + /*
> + * We use a negative signal value as a way to broadcast that the
> + * sighand has been orphaned, so that we can notify all the
> + * listeners about this.
> + */
> + if (sig < 0)
> + __wake_up_locked(&ctx->wqh, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
> + else if (sigismember(&ctx->sigmask, sig) &&
> + (sig >= SIGRTMIN || !sigismember(&ctx->pending, sig))) {
> + sigaddset(&ctx->pending, sig);

I don't understand the "(sig >= SIGRTMIN || !sigismember(&ctx->pending, sig))"
check. This mimics the LEGACY_QUEUE() check, but seems strange. The signal may
be pending in ctx->pending just because it was not signalfd_fetchsig()ed, yes?

Please also see below.

> +asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
> +{
>

> [...snip...]

>
> + } else {
> + error = -EBADF;
> + file = fget(ufd);
> + if (!file)
> + goto err_exit;
> + ctx = file->private_data;
> + error = -EINVAL;
> + if (file->f_op != &signalfd_fops) {
> + fput(file);
> + goto err_exit;
> + }
> + spin_lock_irq(&ctx->sighand->siglock);
> + ctx->sigmask = sigmask;
> + spin_unlock_irq(&ctx->sighand->siglock);
> + wake_up(&ctx->wqh);

Can't this race with sys_signalfd_dequeue() which use lockless __add_wait_queue()?
Looks like we should do __wake_up_locked() under ctx->sighand->siglock.

> --- linux-2.6.20.ep2.orig/kernel/signal.c 2007-03-07 15:55:43.000000000 -0800
> +++ linux-2.6.20.ep2/kernel/signal.c 2007-03-07 15:59:01.000000000 -0800
>

> [...snip...]

>
> @@ -780,6 +785,11 @@
> BUG_ON(!irqs_disabled());
> assert_spin_locked(&t->sighand->siglock);
>
> + /*
> + * Deliver the signal to listening signalfds ...
> + */
> + signalfd_notify(t->sighand, sig, info);
> +
> /* Short-circuit ignored signals. */
> if (sig_ignored(t, sig))
> goto out;
> @@ -968,6 +978,11 @@
> assert_spin_locked(&p->sighand->siglock);
> handle_stop_signal(sig, p);
>
> + /*
> + * Deliver the signal to listening signalfds ...
> + */
> + signalfd_notify(p->sighand, sig, info);
> +
> /* Short-circuit ignored signals. */
> if (sig_ignored(p, sig))
> return ret;

It is strange that we are doing signalfd_notify() even if the signal is ignored.
Isn't it better to shift signalfd_notify() into send_signal() ? This way we do
not need the special check in signalfd_deliver() above.

Also, this patch doesn't take send_sigqueue/send_group_sigqueue into account.

Oleg.

Michael K. Edwards

unread,

Mar 8, 2007, 3:53:59 PM3/8/07

to Linus Torvalds

On 3/8/07, Linus Torvalds <torv...@linux-foundation.org> wrote:
> So anybody who would "go with the Berkeley crowd" really shows a lot of
> bad taste, I'm afraid. The Berkeley crowd really messed up here, and it's
> so long ago that I don't think there is any point in us trying to fix it
> any more.

Well, they did invent the socket, which sucks less than a lot of other
things. You would prefer STREAMS, perhaps? :-) My point is that
this is a message-oriented problem, not a stream-oriented problem, and
read() just isn't a very good interface for passing structured
messages around. I agree that the details of recvmsg() ancillary data
are fairly grotty (optimization based on micro-benchmarks, as usual,
back in the PDP/VAX days), but the concept isn't that bad; you let
netlink sockets into the kernel, didn't you?

> (But if somebody makes recvmgs a general VFS interface and makes it just
> work for everything, I'd probably take the patch as a cleanup. I really
> think it should have been a "struct file_operations" thing rather than
> being a socket-only thing.. But since you couldn't portably use it
> anyway, the thing is pretty moot)

sendmsg()/recvmsg() to a file makes perfect sense, if you put the
equivalent of an llseek tuple into ancillary data. And it's also IMHO
the right way to do file AIO -- open up a netlink socket to the entity
that's scheduling the IOs for a given filesystem, use the SCM_RIGHTS
mechanism to do indirect opens (your credentials could come over an
AF_UNIX socket from a completely separate "open server" process), and
multiplex all the AIO to that filesystem across the netlink socket.

If adding this to VFS is something you would seriously consider, I'll
do the work. But I will need some coaching to work around my relative
inexperience with the Linux core code and my lack of taste. :-)

Cheers,
- Michael

Marko Macek

unread,

Mar 8, 2007, 4:05:04 PM3/8/07

to Linus Torvalds

Linus Torvalds wrote:
>
> On Thu, 8 Mar 2007, Davide Libenzi wrote:
>> So, to cut it short, I can do the pseudo-siginfo read(2), but I don't
>> like it too much (little, actually). The siginfo, as bad as it is, is a
>> standard used in many POSIX APIs (hence even in kernel), and IMO if we
>> want to send that back, a struct siginfo should be.
>> No?
>
> I think it's perfectly fine if you make it "struct siginfo" (even though I
> think it's a singularly ugly struct). It's just that then you'd have to
> make your read() know whether it's a compat-read or not, which you really
> can't.
>
> Which is why you introduced a new system call, but that leads to all the
> problems with the file descriptor no longer being *usable*.
>
> Think scripts. It's easy to do reads in perl scripts, and parse the
> output. In contrast, making perl use a new system call is quite
> challenging.

Probably, but someone will have to add the 'signalfd' system call anyway.

> And *that* is why "everything is a stream of bytes" is so important. You
> don't know where the file descriptor has been, or who uses it. Special
> system calls for special file descriptors are just *wrong*.
>
> After all, that's why we'd have a signalfd() in the first place: exactly
> so that you do *not* have to use special system calls, but can just pass
> it on to any event waiting mechanism like select, poll, epoll. The same is
> just *even*more*true* when it comes to reading the data!

The problem with read() returning arbitrary unstructured data is that
there is hard to do versioning/extensibility, since the userspace can't
specify the requested/expected format. The only way it could be done is
by the (nbytes) parameter to read() which is not very nice (and useless
for scripts).

This is the same problem that makes sysfs/procfs fragile unless the
file format is very well specified for extensibility (and it's easy to
f-it up, since there seems to be little consistency there... using
the XML horror would actually be an improvement). Breaking sysfs/procfs
might be acceptable once every few years, but signal handling will be
part of every application event loop and there is no room for breaking
anything.

(although, one could to the versioning the ugly way by creating the new
'signalfd' syscall instead).

I'd say: make read() return the signal number (probably as 4-byte int,
in network byte order), but for everything else, use the system call.

Mark

P.S. I'm currently worried if the fact that FUTEX_FD is being deprecated
is a problem :)

Davide Libenzi

unread,

Mar 8, 2007, 4:13:16 PM3/8/07

to Oleg Nesterov

On Thu, 8 Mar 2007, Oleg Nesterov wrote:

> Davide Libenzi wrote:
> >
> > +int signalfd_deliver(struct sighand_struct *sighand, int sig, struct siginfo *info)
> > +{
> > + int nsig = 0;
> > + struct list_head *pos;
> > + struct signalfd_ctx *ctx;
> > + struct signalfd_sq *sq;
> > +
> > + list_for_each(pos, &sighand->sfdlist) {
> > + ctx = list_entry(pos, struct signalfd_ctx, lnk);
> > + /*
> > + * We use a negative signal value as a way to broadcast that the
> > + * sighand has been orphaned, so that we can notify all the
> > + * listeners about this.
> > + */
> > + if (sig < 0)
> > + __wake_up_locked(&ctx->wqh, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE);
> > + else if (sigismember(&ctx->sigmask, sig) &&
> > + (sig >= SIGRTMIN || !sigismember(&ctx->pending, sig))) {
> > + sigaddset(&ctx->pending, sig);
>
> I don't understand the "(sig >= SIGRTMIN || !sigismember(&ctx->pending, sig))"
> check. This mimics the LEGACY_QUEUE() check, but seems strange. The signal may
> be pending in ctx->pending just because it was not signalfd_fetchsig()ed, yes?

Logic is, if it's not an RT signal, queue only one, otherwise multiple.
The bit on the ->pending mask is clealer only when the queue slot becomes empty.

> Please also see below.
>
> > +asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
> > +{
> >
> > [...snip...]
> >
> > + } else {
> > + error = -EBADF;
> > + file = fget(ufd);
> > + if (!file)
> > + goto err_exit;
> > + ctx = file->private_data;
> > + error = -EINVAL;
> > + if (file->f_op != &signalfd_fops) {
> > + fput(file);
> > + goto err_exit;
> > + }
> > + spin_lock_irq(&ctx->sighand->siglock);
> > + ctx->sigmask = sigmask;
> > + spin_unlock_irq(&ctx->sighand->siglock);
> > + wake_up(&ctx->wqh);
>
> Can't this race with sys_signalfd_dequeue() which use lockless __add_wait_queue()?
> Looks like we should do __wake_up_locked() under ctx->sighand->siglock.

Yes, good catch. Fixed.

The two trasports can rely on different masks. The signalfd_notify() does
not even go in signalfd_deliver() if no signalfds are attached to the
sighand.

> Also, this patch doesn't take send_sigqueue/send_group_sigqueue into account.

I added that too. but I noticed something strange, dunno if intentional or
not. In send_sigqueue and send_group_sigqueue, the check for the
timer-special and the ignored is inverted. This lead to two different
behaviours. Is there a reason for that?

- Davide

Marko Macek

unread,

Mar 8, 2007, 4:26:00 PM3/8/07

to linux-...@vger.kernel.org

Linus Torvalds wrote:
>
> On Thu, 8 Mar 2007, Davide Libenzi wrote:
>> So, to cut it short, I can do the pseudo-siginfo read(2), but I don't
>> like it too much (little, actually). The siginfo, as bad as it is, is a
>> standard used in many POSIX APIs (hence even in kernel), and IMO if we
>> want to send that back, a struct siginfo should be.
>> No?
>
> I think it's perfectly fine if you make it "struct siginfo" (even though I
> think it's a singularly ugly struct). It's just that then you'd have to
> make your read() know whether it's a compat-read or not, which you really
> can't.
>
> Which is why you introduced a new system call, but that leads to all the
> problems with the file descriptor no longer being *usable*.
>
> Think scripts. It's easy to do reads in perl scripts, and parse the
> output. In contrast, making perl use a new system call is quite
> challenging.

Probably, but someone will have to add the 'signalfd' system call anyway.

> And *that* is why "everything is a stream of bytes" is so important. You

> don't know where the file descriptor has been, or who uses it. Special
> system calls for special file descriptors are just *wrong*.
>
> After all, that's why we'd have a signalfd() in the first place: exactly
> so that you do *not* have to use special system calls, but can just pass
> it on to any event waiting mechanism like select, poll, epoll. The same is
> just *even*more*true* when it comes to reading the data!

The problem with read() returning arbitrary unstructured data is that

there is hard to do versioning/extensibility, since the userspace can't
specify the requested/expected format. The only way it could be done is
by the (nbytes) parameter to read() which is not very nice (and useless
for scripts).

This is the same problem that makes sysfs/procfs fragile unless the
file format is very well specified for extensibility (and it's easy to
f-it up, since there seems to be little consistency there... using
the XML horror would actually be an improvement). Breaking sysfs/procfs
might be acceptable once every few years, but signal handling will be
part of every application event loop and there is no room for breaking
anything.

(although, one could to the versioning the ugly way by creating the new
'signalfd' syscall instead).

I'd say: make read() return the signal number (probably as 4-byte int,
in network byte order), but for everything else, use the system call.

Mark

P.S. I'm currently worried if the fact that FUTEX_FD is being deprecated
is a problem :)

-

Oleg Nesterov

unread,

Mar 8, 2007, 4:57:39 PM3/8/07

to Davide Libenzi

Yes, I see what the code does, but I don't undestand why. For example, SIGCHLD was
delivered to the process _and_ handled several times, then sys_signalfd_dequeue()
comes and finds only one siginfo. Isn't this strange?

Sorry, I don't understand. The masks are different, yes, but ->sighand is the
same? How this can make any difference for "if no signalfds are attached" ?

OK. What is the purpose of signalfd? Should it record the signals which were
sent to the process, or only those which were delivered? The latter looks
more natural for me. But inn any case, I don't see a reason to check ->pending
mask in signalfd_deliver().

BTW, sys_signalfd_dequeue() re-queues signalfd_sq if copy_siginfo_to_user()
fails. Why not -EFAULT?

Also. A malicious user can eat all memory, signalfd_deliver()->kmem_cache_alloc()
doesn't check any limits.

> > Also, this patch doesn't take send_sigqueue/send_group_sigqueue into account.
>
> I added that too. but I noticed something strange, dunno if intentional or
> not. In send_sigqueue and send_group_sigqueue, the check for the
> timer-special and the ignored is inverted. This lead to two different
> behaviours. Is there a reason for that?

I was wondering about that too.

Oleg.

Linus Torvalds

unread,

Mar 8, 2007, 5:12:02 PM3/8/07

to Oleg Nesterov

On Fri, 9 Mar 2007, Oleg Nesterov wrote:
>
> Also. A malicious user can eat all memory, signalfd_deliver()->kmem_cache_alloc()
> doesn't check any limits.

This, btw, is one reason I *really* think signalfd() should just use the
same old signal queue, and not try to make its own.

Signal queueing and unqueuing simply isn't that simple.

Linus

Davide Libenzi

unread,

Mar 8, 2007, 5:59:42 PM3/8/07

to Linus Torvalds

On Thu, 8 Mar 2007, Linus Torvalds wrote:

>
>
> On Fri, 9 Mar 2007, Oleg Nesterov wrote:
> >
> > Also. A malicious user can eat all memory, signalfd_deliver()->kmem_cache_alloc()
> > doesn't check any limits.
>
> This, btw, is one reason I *really* think signalfd() should just use the
> same old signal queue, and not try to make its own.
>
> Signal queueing and unqueuing simply isn't that simple.

Alrighty. Back to the dequeue_signal ...

- Davide

Davide Libenzi

unread,

Mar 8, 2007, 6:06:03 PM3/8/07

to Oleg Nesterov

On Fri, 9 Mar 2007, Oleg Nesterov wrote:

> > Logic is, if it's not an RT signal, queue only one, otherwise multiple.
> > The bit on the ->pending mask is clealer only when the queue slot becomes empty.
>
> Yes, I see what the code does, but I don't undestand why. For example, SIGCHLD was
> delivered to the process _and_ handled several times, then sys_signalfd_dequeue()
> comes and finds only one siginfo. Isn't this strange?

That's the same logic the kernel folows for non-RT signals.

> > The two trasports can rely on different masks. The signalfd_notify() does
> > not even go in signalfd_deliver() if no signalfds are attached to the
> > sighand.
>
> Sorry, I don't understand. The masks are different, yes, but ->sighand is the
> same? How this can make any difference for "if no signalfds are attached" ?

The list_empty() che would not make you fall inside signalfd_deliver(),
hence the fast path really lives up to its name ;)

> Also. A malicious user can eat all memory, signalfd_deliver()->kmem_cache_alloc()
> doesn't check any limits.

I'll make that use the std dequeu_signal, so everything is handled in there.

- Davide

Jeremy Fitzhardinge

unread,

Mar 8, 2007, 6:58:12 PM3/8/07

to Linus Torvalds

Linus Torvalds wrote:
> So I think you should get rid of signalfd_dequeue(), and just replace it
> with a "read()" function.
>

The difficulty is that there are 4 different formats of signal structure
you could get: (traditional|siginfo) x (32bit|64bit).

What happens if you're a 32 bit process, you fork and exec a 64bit
process who inherits the signalfd, and they start reading? What format
of signal structure do they get? What do you get? What if you start
doing partial reads?

I think signalfd_dequeue() is warty, but read() has has a number of
details to sort out.

(Hey, can you send signals by writing into the signalfd? Very plan9...)

J

Linus Torvalds

unread,

Mar 8, 2007, 7:11:44 PM3/8/07

to Jeremy Fitzhardinge

On Thu, 8 Mar 2007, Jeremy Fitzhardinge wrote:
>
> The difficulty is that there are 4 different formats of signal structure
> you could get: (traditional|siginfo) x (32bit|64bit).

See my suggestion of a fixed-format (and much cleaned up) pseudo-siginfo
thing earlier in this thread, and also actually mentioned as a "good to
do" in my original email from 2003 that did the original patch ;)

> (Hey, can you send signals by writing into the signalfd? Very plan9...)

It's an obvious extension. Whether there is any real point to it or not, I
dunno. After all, you could just send the signal instead.

Linus

Oleg Nesterov

unread,

Mar 8, 2007, 7:14:54 PM3/8/07

to Davide Libenzi

On 03/08, Davide Libenzi wrote:
>

> On Fri, 9 Mar 2007, Oleg Nesterov wrote:
>
> > > Logic is, if it's not an RT signal, queue only one, otherwise multiple.
> > > The bit on the ->pending mask is clealer only when the queue slot becomes empty.
> >
> > Yes, I see what the code does, but I don't undestand why. For example, SIGCHLD was
> > delivered to the process _and_ handled several times, then sys_signalfd_dequeue()
> > comes and finds only one siginfo. Isn't this strange?
>
> That's the same logic the kernel folows for non-RT signals.

Yes, specific_send_sig_info/__group_send_sig_info "ignores" the non-RT signal

- if we already have a pending one (blocked, or not yet handled)

- the only reason is backward compatibility

signalfd:

- if the same signr was not fetched via sys_signalfd_dequeue()

- the reason is ????

Suppose we are doing a simple application which just logs all signals
which were sent to this process. Now, we can miss the signal if the
logging blocks.

Think about sub-threads. We can send the same non-RT signal to T1, T2, T3
via specific_send_sig_info(). All 3 signals will be delivered, but signalfd
(which is process wide) will see only the first.

> > > The two trasports can rely on different masks. The signalfd_notify() does
> > > not even go in signalfd_deliver() if no signalfds are attached to the
> > > sighand.
> >
> > Sorry, I don't understand. The masks are different, yes, but ->sighand is the
> > same? How this can make any difference for "if no signalfds are attached" ?
>
> The list_empty() che would not make you fall inside signalfd_deliver(),
> hence the fast path really lives up to its name ;)

Ugh. Still can't understand, probably I missed something or misread this patch.

If we shift signalfd_notify() from specific_send_sig_info/__group_send_sig_info
to send_signal(), we have the same "list_empty()" fastpath if no signalfds are
attached to the sighand. The difference is that we don't count sig_ignored()
signals, which looks right to me.

Oleg.

Davide Libenzi

unread,

Mar 8, 2007, 8:17:02 PM3/8/07

to Oleg Nesterov

On Fri, 9 Mar 2007, Oleg Nesterov wrote:

> Ugh. Still can't understand, probably I missed something or misread this patch.
>
> If we shift signalfd_notify() from specific_send_sig_info/__group_send_sig_info
> to send_signal(), we have the same "list_empty()" fastpath if no signalfds are
> attached to the sighand. The difference is that we don't count sig_ignored()
> signals, which looks right to me.

Now I shifted the check into send_signal(), since signalfd now uses the
standard dequeue_signal, and hence compete with standard signal delivery
against the queue. This was initial Linus design.

- Davide

Kent Overstreet

unread,

Mar 9, 2007, 3:22:55 PM3/9/07

to Linus Torvalds

On 3/8/07, Linus Torvalds <torv...@linux-foundation.org> wrote:

> Which is why you introduced a new system call, but that leads to all the
> problems with the file descriptor no longer being *usable*.
>
> Think scripts. It's easy to do reads in perl scripts, and parse the
> output. In contrast, making perl use a new system call is quite
> challenging.

If you're going to allow reads from the fd, then it makes sense to let processes
get the fd by opening a file; if you've got to call siginfo_create()
or what have
you, the fact that you can do reads isn't terribly useful to perl or
what have you.

Inotify even makes you get events with read() today; but you have to use system
calls to get the fd and to send events (inotify_add_watch, inotify_rm_watch),
which is something I've never understood.

It would, to me, be very cool if everything that was a stream of bytes
really was
a file; this is something I've always thought plan 9 got very right. Epoll and
inotify could work this way, and I'm sure there's others I'm forgetting.

If people are interested in this sort of thing, I'll start working on
a patch. Are there
any thoughts on where these sorts of pseudodevices should be these days?

Davide Libenzi

unread,

Mar 9, 2007, 4:34:22 PM3/9/07

to David M. Lloyd

On Thu, 8 Mar 2007, David M. Lloyd wrote:

> On Wed, 2007-03-07 at 17:21 -0800, Davide Libenzi wrote:
> > int signalfd_dequeue(int fd, siginfo_t *info, long timeo);
> >
> > The "fd" parameter must ba a signalfd file descriptor. The "info" parameter
> > is a pointer to the siginfo that will receive the dequeued signal, and
> > "timeo" is a timeout in milliseconds, or -1 for infinite.
> > The signalfd_dequeue function returns 0 if successfull.
>
> Does this support non-blocking mode? It doesn't seem to at my level of
> understanding anyway. If I use this with EPOLLET for example, I'd
> expect to get a single EPOLLIN when a signal arrives, which would
> indicate to me that I must call signalfd_dequeue() in a loop until I get
> EAGAIN in order to be sure I've consumed all the outstanding signals so
> that the edge-triggered notification can be "re-armed".
>

> Make sense?

Yes, the new version can use the O_NONBLOCK flag and read(2) will return
EAGAIN.
Will post a newer version later, together with timerfd ...

- Davide

Denis Vlasenko

unread,

Mar 30, 2007, 7:25:01 PM3/30/07

to Linus Torvalds

On Thursday 08 March 2007 18:28, Linus Torvalds wrote:
> The sad part is that there really is no reason why the BSD crowd couldn't
> have done recvmsg() as an "extended read with per-system call flags",
> which would have made things like O_NONBLOCK etc unnecessary, because you
> could do it just with MSG_DONTWAIT..

Wait a second here... O_NONBLOCK is not just unnecessary - it's buggy!

Try to do nonblocking read from stdin (fd #0) -
* setting O_NONBLOCK with fcntl will set it for all other processes
which has the same stdin!
* trying to reset O_NONBLOCK after the read doesn't help (think kill -9)
* duping fd #0 doesn't help because O_NONBLOCK is not per-fd,
it's shared just like filepos.

I really like that trick with recvmsg + MSG_DONTWAIT instead.
--
vda