perf_event_open+clone = unkillable process

80 views
Skip to first unread message

Dmitry Vyukov

unread,
Feb 1, 2019, 11:48:41 AM2/1/19
to Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Hello,

The following program creates an unkillable process that eats CPU.
/proc/pid/stack is empty, I am not sure what other info I can provide.

Tested is on upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a.
Config is attached.

FTR, generated from the following syzkaller program:

perf_event_open(&(0x7f0000000580)={0x2, 0x70, 0x5c61, 0x2, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0,
0xffffffffffffffff, 0xffffffffffffffff, 0x0)
clone(0x20002100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
timer_create(0x0, &(0x7f0000000000)={0x0, 0x7, 0x4, @tid=r0}, &(0x7f0000000080))
timer_settime(0x0, 0x3, &(0x7f0000000140)={{0x0, 0x1}, {0x0, 0x1c9c380}}, 0x0)


// autogenerated by syzkaller (https://github.com/google/syzkaller)
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <pthread.h>
#include <setjmp.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>

/*
 * Per-thread fault-recovery state used by NONFAILING()/segv_handler():
 * skip_segv counts currently-active NONFAILING sections; segv_env is the
 * longjmp target used to recover from an expected SIGSEGV/SIGBUS.
 */
static __thread int skip_segv;
static __thread jmp_buf segv_env;

/*
 * SIGSEGV/SIGBUS handler.  If a NONFAILING section is active (skip_segv
 * non-zero) and the faulting address lies outside the [1MB, 100MB]
 * window, recover by longjmp-ing back to the NONFAILING site; any other
 * fault terminates the process with the signal number as exit status.
 */
static void segv_handler(int sig, siginfo_t* info, void* ctx)
{
uintptr_t addr = (uintptr_t)info->si_addr;
const uintptr_t prog_start = 1 << 20;
const uintptr_t prog_end = 100 << 20;
if (__atomic_load_n(&skip_segv, __ATOMIC_RELAXED) &&
(addr < prog_start || addr > prog_end)) {
_longjmp(segv_env, 1);
}
exit(sig);
}

/*
 * Install segv_handler for SIGSEGV and SIGBUS.  Signals 32 (0x20) and
 * 33 (0x21) -- presumably the glibc-internal real-time signals, which
 * libc's sigaction() refuses to touch -- are ignored via the raw
 * rt_sigaction syscall.
 */
static void install_segv_handler(void)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = segv_handler;
sa.sa_flags = SA_NODEFER | SA_SIGINFO;
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGBUS, &sa, NULL);
}

/*
 * Execute the given statements with memory-fault recovery enabled:
 * while the skip_segv counter is non-zero, segv_handler() longjmps back
 * here on SIGSEGV/SIGBUS, skipping the remaining statements instead of
 * killing the process.
 */
#define NONFAILING(...) \
{ \
__atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST); \
if (_setjmp(segv_env) == 0) { \
__VA_ARGS__; \
} \
__atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST); \
}

/* Block the calling thread for approximately ms milliseconds. */
static void sleep_ms(uint64_t ms)
{
    uint64_t micros = ms * 1000;
    usleep(micros);
}

static uint64_t current_time_ms(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_MONOTONIC, &ts))
exit(1);
return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

/*
 * Spawn a worker thread running fn(arg) with a 128KB stack.  Retries up
 * to 100 times when creation fails with EAGAIN (transient resource
 * exhaustion), then aborts the process.
 *
 * Fix: pthread_create() does not set errno -- it returns the error
 * number directly -- so the previous `errno == EAGAIN` check tested
 * stale state and could retry (or give up) incorrectly.  The attribute
 * object is now also destroyed on the failure path.
 */
static void thread_start(void* (*fn)(void*), void* arg)
{
    pthread_t th;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_attr_setstacksize(&attr, 128 << 10);
    for (int i = 0; i < 100; i++) {
        int err = pthread_create(&th, &attr, fn, arg);
        if (err == 0) {
            pthread_attr_destroy(&attr);
            return;
        }
        if (err != EAGAIN)
            break;
        usleep(50);
    }
    pthread_attr_destroy(&attr);
    exit(1);
}

/* Mask of bf_len consecutive bits starting at bit offset bf_off. */
#define BITMASK(bf_off, bf_len) (((1ull << (bf_len)) - 1) << (bf_off))
/*
 * Read-modify-write bf_len bits at bit offset bf_off of the object at
 * addr, storing val there.  htobe names an optional byte-swap macro and
 * is left empty for native-endian stores (as used below).
 */
#define STORE_BY_BITMASK(type, htobe, addr, val, bf_off, bf_len) \
*(type*)(addr) = \
htobe((htobe(*(type*)(addr)) & ~BITMASK((bf_off), (bf_len))) | \
(((type)(val) << (bf_off)) & BITMASK((bf_off), (bf_len))))

/*
 * Tiny futex-based binary event shared between threads.
 * state is 0 (clear) or 1 (set); waiters block in FUTEX_WAIT.
 */
typedef struct {
    int state;
} event_t;

/* Put the event into the clear state. */
static void event_init(event_t* e)
{
    e->state = 0;
}

/* Return the event to the clear state (no waiter may be blocked). */
static void event_reset(event_t* e)
{
    e->state = 0;
}

/* Mark the event set and wake a futex waiter; setting an already-set
 * event is treated as a fatal logic error. */
static void event_set(event_t* e)
{
    if (e->state)
        exit(1);
    __atomic_store_n(&e->state, 1, __ATOMIC_RELEASE);
    syscall(SYS_futex, &e->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG);
}

/* Block until the event becomes set. */
static void event_wait(event_t* e)
{
    while (!__atomic_load_n(&e->state, __ATOMIC_ACQUIRE))
        syscall(SYS_futex, &e->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}

/* Non-blocking check of the set flag. */
static int event_isset(event_t* e)
{
    return __atomic_load_n(&e->state, __ATOMIC_ACQUIRE);
}

/*
 * Wait for the event with a timeout in milliseconds.  Returns 1 once
 * the event is set, 0 on timeout.  Spurious futex wakeups are handled
 * by re-checking the flag and the remaining time before waiting again.
 */
static int event_timedwait(event_t* ev, uint64_t timeout)
{
    const uint64_t start = current_time_ms();
    uint64_t elapsed = 0;
    for (;;) {
        uint64_t left = timeout - elapsed;
        struct timespec ts = {
            .tv_sec = left / 1000,
            .tv_nsec = (left % 1000) * 1000 * 1000,
        };
        syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
        if (__atomic_load_n(&ev->state, __ATOMIC_RELAXED))
            return 1;
        elapsed = current_time_ms() - start;
        if (elapsed > timeout)
            return 0;
    }
}

/* One worker-thread slot used by loop()/thr(). */
struct thread_t {
int created, call; /* created: pthread spawned; call: index for execute_call() */
event_t ready, done; /* ready: work handed to worker; done: call finished */
};

static struct thread_t threads[16];
static void execute_call(int call);
static int running; /* number of calls currently executing on workers */

/* Worker loop: wait for a request, run the call, signal completion. */
static void* thr(void* arg)
{
    struct thread_t* self = (struct thread_t*)arg;
    while (1) {
        event_wait(&self->ready);
        event_reset(&self->ready);
        execute_call(self->call);
        __atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
        event_set(&self->done);
    }
    return 0;
}

/*
 * Driver: execute the program's 5 calls, each handed to an idle worker
 * thread.  Each call is given at most 45ms before the driver moves on,
 * so a hung syscall does not stall the rest of the program; at the end
 * stragglers get up to ~100ms to finish.
 */
static void loop(void)
{
int i, call, thread;
for (call = 0; call < 5; call++) {
for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
thread++) {
struct thread_t* th = &threads[thread];
/* Lazily create the worker the first time the slot is used. */
if (!th->created) {
th->created = 1;
event_init(&th->ready);
event_init(&th->done);
event_set(&th->done);
thread_start(thr, th);
}
/* Skip workers still busy with a previous call. */
if (!event_isset(&th->done))
continue;
event_reset(&th->done);
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
event_set(&th->ready);
event_timedwait(&th->done, 45);
break;
}
}
for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
sleep_ms(1);
}

uint64_t r[1] = {0x0};

/*
 * Execute one call of the syzkaller program, selected by index.  All
 * syscall arguments are staged in the fixed mapping created in main()
 * at 0x20000000; every store goes through NONFAILING() so a bad address
 * cannot kill the process.
 */
void execute_call(int call)
{
long res;
switch (call) {
case 0:
/* perf_event_open(&attr, pid=0, cpu=-1, group_fd=-1, flags=0).
 * The perf_event_attr (type=2, size=0x70) is assembled field by
 * field at 0x20000580; the STORE_BY_BITMASK stores build the
 * attr flags bitfield word at 0x200005a8 bit by bit (all zero). */
NONFAILING(*(uint32_t*)0x20000580 = 2);
NONFAILING(*(uint32_t*)0x20000584 = 0x70);
NONFAILING(*(uint8_t*)0x20000588 = 0x61);
NONFAILING(*(uint8_t*)0x20000589 = 2);
NONFAILING(*(uint8_t*)0x2000058a = 0);
NONFAILING(*(uint8_t*)0x2000058b = 0);
NONFAILING(*(uint32_t*)0x2000058c = 0);
NONFAILING(*(uint64_t*)0x20000590 = 0);
NONFAILING(*(uint64_t*)0x20000598 = 0);
NONFAILING(*(uint64_t*)0x200005a0 = 0);
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 0, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 1, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 2, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 3, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 4, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 5, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 6, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 7, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 8, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 9, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 10, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 11, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 12, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 13, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 14, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 15, 2));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 17, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 18, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 19, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 20, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 21, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 22, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 23, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 24, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 25, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 26, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 27, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 28, 1));
NONFAILING(STORE_BY_BITMASK(uint64_t, , 0x200005a8, 0, 29, 35));
NONFAILING(*(uint32_t*)0x200005b0 = 0);
NONFAILING(*(uint32_t*)0x200005b4 = 0);
NONFAILING(*(uint64_t*)0x200005b8 = 0);
NONFAILING(*(uint64_t*)0x200005c0 = 0);
NONFAILING(*(uint64_t*)0x200005c8 = 0);
NONFAILING(*(uint64_t*)0x200005d0 = 0);
NONFAILING(*(uint32_t*)0x200005d8 = 0);
NONFAILING(*(uint32_t*)0x200005dc = 0);
NONFAILING(*(uint64_t*)0x200005e0 = 0);
NONFAILING(*(uint32_t*)0x200005e8 = 0);
NONFAILING(*(uint16_t*)0x200005ec = 0);
NONFAILING(*(uint16_t*)0x200005ee = 0);
syscall(__NR_perf_event_open, 0x20000580, 0, -1, -1, 0);
break;
case 1:
/* Raw clone() with the reproducer's constants (flags 0x20002100). */
syscall(__NR_clone, 0x20002100, 0, 0x9999999999999999, 0, -1);
break;
case 2:
/* r[0] = gettid(); consumed by call 3 as the timer's target tid. */
res = syscall(__NR_gettid);
if (res != -1)
r[0] = res;
break;
case 3:
/* timer_create(CLOCK_REALTIME, &sigevent, &timerid).  The sigevent
 * at 0x20000000 is: value=0, signo=7 (SIGBUS), notify=4
 * (SIGEV_THREAD_ID), tid=r[0]. */
NONFAILING(*(uint64_t*)0x20000000 = 0);
NONFAILING(*(uint32_t*)0x20000008 = 7);
NONFAILING(*(uint32_t*)0x2000000c = 4);
NONFAILING(*(uint32_t*)0x20000010 = r[0]);
syscall(__NR_timer_create, 0, 0x20000000, 0x20000080);
break;
case 4:
/* timer_settime(0, flags=3, &itimerspec, NULL) with
 * it_interval={0s,1ns} and it_value={0s,0x1c9c380ns}: a 1ns-period
 * timer, i.e. a continuous SIGBUS storm at the target thread. */
NONFAILING(*(uint64_t*)0x20000140 = 0);
NONFAILING(*(uint64_t*)0x20000148 = 1);
NONFAILING(*(uint64_t*)0x20000150 = 0);
NONFAILING(*(uint64_t*)0x20000158 = 0x1c9c380);
syscall(__NR_timer_settime, 0, 3, 0x20000140, 0);
break;
}
}
/*
 * Entry point: map the fixed 16MB data region at 0x20000000 (prot=3 is
 * PROT_READ|PROT_WRITE; flags 0x32 -- presumably
 * MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS on x86 Linux), install fault
 * recovery, then run the program's calls.
 */
int main(void)
{
syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0);
install_segv_handler();
loop();
return 0;
}
.config

Dmitry Vyukov

unread,
Feb 1, 2019, 12:06:15 PM2/1/19
to Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, Thomas Gleixner, syzkaller
On Fri, Feb 1, 2019 at 5:48 PM Dmitry Vyukov <dvy...@google.com> wrote:
>
> Hello,
>
> The following program creates an unkillable process that eats CPU.
> /proc/pid/stack is empty, I am not sure what other info I can provide.
>
> Tested is on upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a.
> Config is attached.

Looking through other reproducers that create unkillable processes, I
think I found a much simpler reproducer (below). It's single threaded
and just setups SIGBUS handler and does timer_create+timer_settime to
send repeated SIGBUS. The resulting process can't be killed with
SIGKILL.
+Thomas for timers.


// autogenerated by syzkaller (https://github.com/google/syzkaller)

#define _GNU_SOURCE

#include <endian.h>
#include <setjmp.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/*
 * Fault-recovery state, same scheme as the first reproducer.  Note this
 * program never enters a NONFAILING-style section, so skip_segv stays 0
 * and segv_handler always exits on a real fault.
 */
static __thread int skip_segv;
static __thread jmp_buf segv_env;

/*
 * SIGSEGV/SIGBUS handler: recover via longjmp only while skip_segv is
 * set and the fault is outside [1MB, 100MB]; otherwise exit with the
 * signal number.
 */
static void segv_handler(int sig, siginfo_t* info, void* ctx)
{
uintptr_t addr = (uintptr_t)info->si_addr;
const uintptr_t prog_start = 1 << 20;
const uintptr_t prog_end = 100 << 20;
if (__atomic_load_n(&skip_segv, __ATOMIC_RELAXED) &&
(addr < prog_start || addr > prog_end)) {
_longjmp(segv_env, 1);
}
exit(sig);
}

/*
 * Install segv_handler for SIGSEGV/SIGBUS and ignore signals 32/33
 * (presumably the glibc-internal real-time signals) via raw
 * rt_sigaction.
 */
static void install_segv_handler(void)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_handler = SIG_IGN;
syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = segv_handler;
sa.sa_flags = SA_NODEFER | SA_SIGINFO;
sigaction(SIGSEGV, &sa, NULL);
sigaction(SIGBUS, &sa, NULL);
}

/*
 * Minimal reproducer: install the SIGBUS handler, then create a
 * per-thread timer delivering signal 7 (SIGBUS, notify=4 ==
 * SIGEV_THREAD_ID) to this thread, armed with it_interval={0s,1ns} --
 * a continuous SIGBUS storm.  Per the thread discussion this starves
 * delivery of other signals, including SIGKILL.
 */
int main(void)
{
syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0);
install_segv_handler();
int tid = syscall(__NR_gettid);
/* sigevent at 0x20000000: value=0, signo=7, notify=4, tid. */
*(uint64_t*)0x20000000 = 0;
*(uint32_t*)0x20000008 = 7;
*(uint32_t*)0x2000000c = 4;
*(uint32_t*)0x20000010 = tid;
syscall(__NR_timer_create, 0, 0x20000000, 0x20000080);
/* itimerspec at 0x2004a000: interval={0s,1ns}, value={7s,0xe4cns}. */
*(uint64_t*)0x2004a000 = 0;
*(uint64_t*)0x2004a008 = 1;
*(uint64_t*)0x2004a010 = 7;
*(uint64_t*)0x2004a018 = 0xe4c;
syscall(__NR_timer_settime, 0, 3, 0x2004a000, 0);
return 0;
}

Jiri Olsa

unread,
Feb 2, 2019, 1:36:25 PM2/2/19
to Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim, luca abeni, Thomas Gleixner, Oleg Nesterov, syzkaller
On Fri, Feb 01, 2019 at 06:06:03PM +0100, Dmitry Vyukov wrote:
> On Fri, Feb 1, 2019 at 5:48 PM Dmitry Vyukov <dvy...@google.com> wrote:
> >
> > Hello,
> >
> > The following program creates an unkillable process that eats CPU.
> > /proc/pid/stack is empty, I am not sure what other info I can provide.
> >
> > Tested is on upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a.
> > Config is attached.
>
> Looking through other reproducers that create unkillable processes, I
> think I found a much simpler reproducer (below). It's single threaded
> and just setups SIGBUS handler and does timer_create+timer_settime to
> send repeated SIGBUS. The resulting process can't be killed with
> SIGKILL.
> +Thomas for timers.

nice, I managed to kill it by strace ;-)

[jolsa@krava perf]$ strace -p 10725
strace: Process 10725 attached
--- stopped by SIGBUS ---
--- stopped by SIGBUS ---
--- stopped by SIGBUS ---
--- stopped by SIGBUS ---
+++ killed by SIGINT +++

+Oleg

jirka

Jiri Olsa

unread,
Feb 3, 2019, 10:21:22 AM2/3/19
to Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, Namhyung Kim, luca abeni, Thomas Gleixner, Oleg Nesterov, syzkaller
On Sat, Feb 02, 2019 at 07:30:45PM +0100, Jiri Olsa wrote:
> On Fri, Feb 01, 2019 at 06:06:03PM +0100, Dmitry Vyukov wrote:
> > On Fri, Feb 1, 2019 at 5:48 PM Dmitry Vyukov <dvy...@google.com> wrote:
> > >
> > > Hello,
> > >
> > > The following program creates an unkillable process that eats CPU.
> > > /proc/pid/stack is empty, I am not sure what other info I can provide.
> > >
> > > Tested is on upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a.
> > > Config is attached.
> >
> > Looking through other reproducers that create unkillable processes, I
> > think I found a much simpler reproducer (below). It's single threaded
> > and just setups SIGBUS handler and does timer_create+timer_settime to
> > send repeated SIGBUS. The resulting process can't be killed with
> > SIGKILL.
> > +Thomas for timers.
>
> nice, I managed to kill it by strace ;-)
>
> [jolsa@krava perf]$ strace -p 10725
> strace: Process 10725 attached
> --- stopped by SIGBUS ---
> --- stopped by SIGBUS ---
> --- stopped by SIGBUS ---
> --- stopped by SIGBUS ---
> +++ killed by SIGINT +++

fyi I can no longer reproduce the kill via strace.. not sure
what was different

jirka

Thomas Gleixner

unread,
Feb 4, 2019, 4:27:33 AM2/4/19
to Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov, Eric W. Biederman
On Fri, 1 Feb 2019, Dmitry Vyukov wrote:

> On Fri, Feb 1, 2019 at 5:48 PM Dmitry Vyukov <dvy...@google.com> wrote:
> >
> > Hello,
> >
> > The following program creates an unkillable process that eats CPU.
> > /proc/pid/stack is empty, I am not sure what other info I can provide.
> >
> > Tested is on upstream commit 4aa9fc2a435abe95a1e8d7f8c7b3d6356514b37a.
> > Config is attached.
>
> Looking through other reproducers that create unkillable processes, I
> think I found a much simpler reproducer (below). It's single threaded
> and just setups SIGBUS handler and does timer_create+timer_settime to
> send repeated SIGBUS. The resulting process can't be killed with
> SIGKILL.
> +Thomas for timers.

+Oleg, Eric

That's odd. With some tracing I can see that SIGKILL is generated and
queued, but its not delivered by some weird reason. I'm traveling in the
next days, so I won't be able to do much about it. Will look later this
week.

Thanks,

tglx

Dmitry Vyukov

unread,
Feb 4, 2019, 4:38:21 AM2/4/19
to Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov, Eric W. Biederman
Just a random though looking at the repro: can constant SIGBUS
delivery starve delivery of all other signals (incl SIGKILL)?

Thomas Gleixner

unread,
Feb 4, 2019, 12:38:59 PM2/4/19
to Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov, Eric W. Biederman
Indeed. SIGBUS is 7, SIGKILL is 9 and next_signal() delivers the lowest
number first....

Thanks,

tglx

Eric W. Biederman

unread,
Feb 4, 2019, 10:01:09 PM2/4/19
to Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
We do have the special case in complete_signal that causes most of the
signal delivery work of SIGKILL to happen when SIGKILL is queued.

I need to look at your reproducer. It would require being a per-thread
signal to cause problems in next_signal.

It is definitely worth fixing if there is any way for userspace to block
SIGKILL.

Eric

Eric W. Biederman

unread,
Feb 4, 2019, 11:27:33 PM2/4/19
to Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
Ugh.

The practical problem appears much worse.

Tracing the code I see that we attempt to deliver SIGBUS, I presume in a
per thread way.

At some point the delivery of SIGBUS fails. Then the kernel attempts
to synchronously force SIGSEGV. Which should be the end of it.

Unfortunately at that point our heuristic for dealing with synchronous
signals fails in next_signal and we attempt to deliver the timer's
SIGBUS instead.

I suspect it is time to bite the bullet and handle the synchronous
unblockable signals differently. I will see if I can cook up an
appropriate patch.

Eric



Eric W. Biederman

unread,
Feb 5, 2019, 1:07:34 AM2/5/19
to Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
Playing with this some more what I see happening is:


SIGHUP and SIGSEGV get directed at sighup_handler.
Timer delivers SIGHUP
sighup_handler starts.
timer delivers SIGHUP (before sighup_handler finishes)
sighup_handler starts.
timer delivers SIGHUP (before sighup_handler finishes)
sighup_handler starts.
timer delivers SIGHUP (before sighup_handler finishes)
sighup_handler starts.
....
Up until the stack is full.
Then:
timer delivers SIGHUP
sighup_handler won't start
Attempt force_sigsegv
Confused kernel attempts to deliver SIGHUP (instead of a synchronous SIGSEGV)

If you unconfuse the kernel there is an attempt to deliver SIGSEGV
(the stack is full)
Then the kernel changes the SIGSEGV handler to SIG_DFL
Then SIGSEGV is successfully delivered terminating the application

Which suggests 2 fixes.
1) making SIGKILL something that can't get hidden behind other signals.
2) Having a 3rd queue of signals for the synchronous signals.
So that the synchronous signals can't be blocked by per thread signals.

I have prototyped the 2nd one and it is enough to stop the infinite spin
that causes problems here when the process stack fills up.

Fixing SIGKILL will probably bring more benefits.

Eric

Eric W. Biederman

unread,
Feb 5, 2019, 10:26:18 AM2/5/19
to Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov

Recently syzkaller was able to create unkillable processes by
creating a timer that is delivered as a thread local signal on SIGHUP,
and receiving SIGHUP with SA_NODEFER set. Ultimately causing a loop
failing to deliver SIGHUP but always trying.

Upon examination it turns out part of the problem is actually most of
the solution. Since 2.5 complete_signal has found all fatal signals
and queued SIGKILL in every threads thread queue relying on
signal->group_exit_code to preserve the information of which was the
actual fatal signal.

The conversion of all fatal signals to SIGKILL results in the
synchronous signal heuristic in next_signal kicking in and preferring
SIGHUP to SIGKILL. Which is especially problematic as all
fatal signals have already been transformed into SIGKILL.

Now that we have task->jobctl we can do better and set a bit in
task->jobctl instead of reusing tsk->pending[SIGKILL]. This allows
giving already detected process exits a higher priority than any
pending signal.

Cc: sta...@vger.kernel.org
Reported-by: Dmitry Vyukov <dvy...@google.com>
Ref: ebf5ebe31d2c ("[PATCH] signal-fixes-2.5.59-A4")
History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Signed-off-by: "Eric W. Biederman" <ebie...@xmission.com>
---
Oleg, can you give this a quick review and see if I am missing anything?
Dmitry, can you verify this runs cleanly in your test setups?

fs/coredump.c | 2 +-
include/linux/sched/jobctl.h | 1 +
include/linux/sched/signal.h | 2 +-
kernel/signal.c | 10 ++++++++--
4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e55bfd..487995293ef0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -322,7 +322,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
- sigaddset(&t->pending.signal, SIGKILL);
+ t->jobctl |= JOBCTL_TASK_EXIT;
signal_wake_up(t, 1);
nr++;
}
diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h
index 98228bd48aee..ff7b3ea43f4c 100644
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -18,6 +18,7 @@ struct task_struct;
#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
+#define JOBCTL_TASK_EXIT 23 /* task is exiting */

#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 13789d10a50e..3f3edadf1ae1 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -354,7 +354,7 @@ static inline int signal_pending(struct task_struct *p)

static inline int __fatal_signal_pending(struct task_struct *p)
{
- return unlikely(sigismember(&p->pending.signal, SIGKILL));
+ return unlikely(p->jobctl & JOBCTL_TASK_EXIT);
}

static inline int fatal_signal_pending(struct task_struct *p)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9ca8e5278c8e..0577e37fdf43 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -989,7 +989,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
t = p;
do {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
- sigaddset(&t->pending.signal, SIGKILL);
+ t->jobctl |= JOBCTL_TASK_EXIT;
signal_wake_up(t, 1);
} while_each_thread(p, t);
return;
@@ -1273,7 +1273,7 @@ int zap_other_threads(struct task_struct *p)
/* Don't bother with already dead threads */
if (t->exit_state)
continue;
- sigaddset(&t->pending.signal, SIGKILL);
+ t->jobctl |= JOBCTL_TASK_EXIT;
signal_wake_up(t, 1);
}

@@ -2393,6 +2393,11 @@ bool get_signal(struct ksignal *ksig)
goto relock;
}

+ /* Has this task already been flagged for death? */
+ ksig->info.si_signo = signr = SIGKILL;
+ if (current->jobctl & JOBCTL_TASK_EXIT)
+ goto fatal;
+
for (;;) {
struct k_sigaction *ka;

@@ -2488,6 +2493,7 @@ bool get_signal(struct ksignal *ksig)
continue;
}

+ fatal:
spin_unlock_irq(&sighand->siglock);

/*
--
2.17.1

Dmitry Vyukov

unread,
Feb 6, 2019, 7:10:05 AM2/6/19
to Eric W. Biederman, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
Yes, this fixes the hang in my setup.
The process still hangs until I hit ^C or kill, but I guess this is an
intended behavior for such program.
Thanks for the quick fix.

Tested-by: Dmitry Vyukov <dvy...@google.com>

Oleg Nesterov

unread,
Feb 6, 2019, 1:08:01 PM2/6/19
to Eric W. Biederman, Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Eric, at al,

Sorry, I am on on vacation, can't even read this thread right now,
so I am not sure I understand the problem correctly...

On 02/05, Eric W. Biederman wrote:
>
> @@ -2393,6 +2393,11 @@ bool get_signal(struct ksignal *ksig)
> goto relock;
> }
>
> + /* Has this task already been flagged for death? */
> + ksig->info.si_signo = signr = SIGKILL;
> + if (current->jobctl & JOBCTL_TASK_EXIT)
> + goto fatal;
> +

Can't we simply change, say, next_signal() to return SIGKILL if it is
pending?

In any case, I am not sure we need JOBCTL_TASK_EXIT. Can't we rely on
signal_group_exit() ?

Oleg.

Eric W. Biederman

unread,
Feb 6, 2019, 4:47:33 PM2/6/19
to Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
It is actually not an intended behavior. The problem is that we are
still getting synchronous signals and ordinary signals confused.

The program should terminate with a SIGSEGV generated because
the stack fills up, and no additional signals can be delivered.

But that fix is a little more invasive, and less serious. So
I am getting to it next.

Eric W. Biederman

unread,
Feb 6, 2019, 5:26:00 PM2/6/19
to Oleg Nesterov, Thomas Gleixner, Dmitry Vyukov, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Oleg Nesterov <ol...@redhat.com> writes:

> Eric, at al,
>
> Sorry, I am on on vacation, can't even read this thread right now,
> so I am not sure I understand the problem correctly...

That is fair. Please don't let me mess up your vacation.

The problem is an infinite stream of SIGHUP from a timer, making
processes unkillable.

Looking into that I see two bugs.
a) We handle fatal signals (esp SIGKILL) early in complete_signal and
this SIGHUP stream is messing up our handling.
b) We don't properly distinguish between synchronous and asynchrous
signals.

I am only aiming to fix the fatal signal issue with this change.

> On 02/05, Eric W. Biederman wrote:
>>
>> @@ -2393,6 +2393,11 @@ bool get_signal(struct ksignal *ksig)
>> goto relock;
>> }
>>
>> + /* Has this task already been flagged for death? */
>> + ksig->info.si_signo = signr = SIGKILL;
>> + if (current->jobctl & JOBCTL_TASK_EXIT)
>> + goto fatal;
>> +
>
> Can't we simply change, say, next_signal() to return SIGKILL if it is
> pending?

We could. But since this isn't necessarily SIGKILL we are dealing with,
it could just as easily be an unhandled SIGINT, so having SIGKILL in the
per task signal queue could still lead to other problems. I am afraid
that if continue abusing the per task signal queue other bugs of
confusion will occur.

> In any case, I am not sure we need JOBCTL_TASK_EXIT. Can't we rely on
> signal_group_exit() ?

Looking more closely yes I believe we can.

I thought de_thread would be a problem but signal_group_exit handles
that case and the core dump cases just fine.

It is a bit of a surprise but that would make:

> static inline int __fatal_signal_pending(struct task_struct *p)
> {
> return signal_group_exit(p->signal);
> }

I will respin and see if we can the change just depend on
signal_group_exit. One less abstraction to have to keep
in sync sounds more robust in the long run.

Eric

Eric W. Biederman

unread,
Feb 7, 2019, 1:43:07 AM2/7/19
to Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov

Dmitry,

Can you please verify the follow two patches. The first one is a
simplification of the one you have already tested and makes the
processes killable. The second patch improves our heuristic for
detecting synchronous like SIGSEGV when the stack overflows allowing us
to process them before SIGHUP.

Eric W. Biederman (2):
signal: Always notice exiting tasks
signal: Better detection of synchronous signals

Eric W. Biederman

unread,
Feb 7, 2019, 1:43:44 AM2/7/19
to Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov

Recently syzkaller was able to create unkillable processes by
creating a timer that is delivered as a thread local signal on SIGHUP,
and receiving SIGHUP with SA_NODEFER set. Ultimately causing a loop
failing to deliver SIGHUP but always trying.

Upon examination it turns out part of the problem is actually most of
the solution. Since 2.5 signal delivery has found all fatal signals,
marked the signal group for death, and queued SIGKILL in every threads
thread queue relying on signal->group_exit_code to preserve the
information of which was the actual fatal signal.

The conversion of all fatal signals to SIGKILL results in the
synchronous signal heuristic in next_signal kicking in and preferring
SIGHUP to SIGKILL. Which is especially problematic as all
fatal signals have already been transformed into SIGKILL.

Instead of dequeueing signals and depending upon SIGKILL to
be the first signal dequeued, first test if the signal group
has already been marked for death. This guarantees that
nothing in the signal queue can prevent a process that needs
to exit from exiting.

Cc: sta...@vger.kernel.org
Reported-by: Dmitry Vyukov <dvy...@google.com>
Ref: ebf5ebe31d2c ("[PATCH] signal-fixes-2.5.59-A4")
History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Signed-off-by: "Eric W. Biederman" <ebie...@xmission.com>
---
kernel/signal.c | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 9ca8e5278c8e..5424cb0006bc 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2393,6 +2393,11 @@ bool get_signal(struct ksignal *ksig)
goto relock;
}

+ /* Has this task already been marked for death? */
+ ksig->info.si_signo = signr = SIGKILL;
+ if (signal_group_exit(signal))

Eric W. Biederman

unread,
Feb 7, 2019, 1:44:12 AM2/7/19
to Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov

Recently syzkaller was able to create unkillable processes by
creating a timer that is delivered as a thread local signal on SIGHUP,
and receiving SIGHUP with SA_NODEFER set. Ultimately causing a loop failing
to deliver SIGHUP but always trying.

When the stack overflows delivery of SIGHUP fails and force_sigsegv is
called. Unfortunately because SIGSEGV is numerically higher than
SIGHUP next_signal tries again to deliver a SIGHUP.

From a quality of implementation standpoint attempting to deliver the
timer SIGHUP signal is wrong. We should attempt to deliver the
synchronous SIGSEGV signal we just forced.

We can make that happen in a fairly straightforward manner by
instead of just looking at the signal number we also look at the
si_code. In particular for exceptions (aka synchronous signals) the
si_code is always greater than 0.

That still has the potential to pick up a number of asynchronous
signals as in a few cases the same si_codes that are used
for synchronous signals are also used for asynchronous signals,
and SI_KERNEL is also included in the list of possible si_codes.

Still the heuristic is much better and timer signals are definitely
excluded. Which is enough to prevent all known ways for someone
sending a process signals fast enough to cause unexpected and
arguably incorrect behavior.

Cc: sta...@vger.kernel.org
Fixes: a27341cd5fcb ("Prioritize synchronous signals over 'normal' signals")
Signed-off-by: "Eric W. Biederman" <ebie...@xmission.com>
---
kernel/signal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 51 insertions(+), 1 deletion(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 5424cb0006bc..99fa8ff06fd9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -688,6 +688,48 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in
}
EXPORT_SYMBOL_GPL(dequeue_signal);

+static int dequeue_synchronous_signal(kernel_siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+ struct sigpending *pending = &tsk->pending;
+ struct sigqueue *q, *sync = NULL;
+
+ /*
+ * Might a synchronous signal be in the queue?
+ */
+ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
+ return 0;
+
+ /*
+ * Return the first synchronous signal in the queue.
+ */
+ list_for_each_entry(q, &pending->list, list) {
+ /* Synchronous signals have a positive si_code */
+ if ((q->info.si_code > SI_USER) &&
+ (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
+ sync = q;
+ goto next;
+ }
+ }
+ return 0;
+next:
+ /*
+ * Check if there is another siginfo for the same signal.
+ */
+ list_for_each_entry_continue(q, &pending->list, list) {
+ if (q->info.si_signo == sync->info.si_signo)
+ goto still_pending;
+ }
+
+ sigdelset(&pending->signal, sync->info.si_signo);
+ recalc_sigpending();
+still_pending:
+ list_del_init(&sync->list);
+ copy_siginfo(info, &sync->info);
+ __sigqueue_free(sync);
+ return info->si_signo;
+}
+
/*
* Tell a process that it has a new active signal..
*
@@ -2411,7 +2453,15 @@ bool get_signal(struct ksignal *ksig)
goto relock;
}

- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ /*
+ * Signals generated by the execution of an instruction
+ * need to be delivered before any other pending signals
+ * so that the instruction pointer in the signal stack
+ * frame points to the faulting instruction.
+ */
+ signr = dequeue_synchronous_signal(&ksig->info);
+ if (!signr)
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);

if (!signr)
break; /* will return 0 */
--
2.17.1

Dmitry Vyukov

unread,
Feb 7, 2019, 6:47:07 AM2/7/19
to Eric W. Biederman, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Oleg Nesterov
With these 2 patches applied the test program immediately exits with
"Segmentation fault".
Tested-by: Dmitry Vyukov <dvy...@google.com>

Oleg Nesterov

unread,
Feb 11, 2019, 9:13:47 AM2/11/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
sorry again for delay...

On 02/07, Eric W. Biederman wrote:
>
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2393,6 +2393,11 @@ bool get_signal(struct ksignal *ksig)
> goto relock;
> }
>
> + /* Has this task already been marked for death? */
> + ksig->info.si_signo = signr = SIGKILL;
> + if (signal_group_exit(signal))
> + goto fatal;
> +
> for (;;) {
> struct k_sigaction *ka;
>
> @@ -2488,6 +2493,7 @@ bool get_signal(struct ksignal *ksig)
> continue;
> }
>
> + fatal:
> spin_unlock_irq(&sighand->siglock);

Eric, but this is wrong. At least this is the serious user-visible change.

Afaics, with this patch the tracee will never stop in PTRACE_EVENT_EXIT in case
of group_exit/exec, because schedule() in TASK_TRACED state won't block due to
__fatal_signal_pending().

Yes, yes, as I said many times the semantics of PTRACE_EVENT_EXIT was never really
defined, it depends on /dev/random, but still I don't think we should break it even
more.

Oleg.

Oleg Nesterov

unread,
Feb 11, 2019, 10:18:46 AM2/11/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Eric, I'll try to finally read the whole thread later, probably I missed
something, but...

On 02/07, Eric W. Biederman wrote:
>
> Recently syzkaller was able to create unkillable processes by
> creating a timer that is delivered as a thread local signal on SIGHUP,
> and receiving SIGHUP with SA_NODEFER. Ultimately causing a loop failing
> to deliver SIGHUP but always trying.
>
> When the stack overflows delivery of SIGHUP fails and force_sigsegv is
> called. Unfortunately because SIGSEGV is numerically higher than
> SIGHUP next_signal tries again to deliver a SIGHUP.

Confused... In this particular case next_signal() should return SIGSEGV
because it must be pending too and SYNCHRONOUS_MASK doesn't include SIGHUP.

Not that it really matters, the timer can deliver another SYNCHRONOUS_MASK
signal < SIGSEGV, just I am trying to understand what have I missed...

> + /*
> + * Check if there is another siginfo for the same signal.
> + */
> + list_for_each_entry_continue(q, &pending->list, list) {
> + if (q->info.si_signo == sync->info.si_signo)
> + goto still_pending;
> + }

But this must not be possible? SYNCHRONOUS_MASK doesn't include real-time
signals, we can't have 2 siginfo's for the same signal < SIGRTMIN.

Oleg.

Eric W. Biederman

unread,
Feb 11, 2019, 7:01:51 PM2/11/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Oleg Nesterov <ol...@redhat.com> writes:

> Eric, I'll try to finally read the whole thread later, probably I missed
> something, but...
>
> On 02/07, Eric W. Biederman wrote:
>>
>> Recently syzkaller was able to create unkillablle processes by
>> creating a timer that is delivered as a thread local signal on SIGHUP,
>> and receiving SIGHUP SA_NODEFERER. Ultimately causing a loop failing
>> to deliver SIGHUP but always trying.
>>
>> When the stack overflows delivery of SIGHUP fails and force_sigsegv is
>> called. Unfortunately because SIGSEGV is numerically higher than
>> SIGHUP next_signal tries again to deliver a SIGHUP.
>
> Confused... In this particular case next_signal() should return SIGSEGV
> because it must be pending too and SYNCHRONOUS_MASK doesn't include SIGHUP.
>
> Not that it really matters, the timer can deliver another SYNCHRONOUS_MASK
> signal < SIGSEGV, just I am trying to understand what have I missed...

Bah. It was SIGBUS. My brain kept thinking SIGHUP. My apologies this
mental error slipped into the patch description.

>> + /*
>> + * Check if there is another siginfo for the same signal.
>> + */
>> + list_for_each_entry_continue(q, &pending->list, list) {
>> + if (q->info.si_signo == sync->info.si_signo)
>> + goto still_pending;
>> + }
>
> But this must not be possible? SYNCHRONOUS_MASK doesn't include real-time
> signals, we can't have 2 siginfo's for the same signal < SIGRTMIN.

Yes for that reason it should be safe to strip that logic out at the
moment. I overlooked that when writing the code.

However. I am not certain that is a limit we actually want to honor
with synchronous signals. As it results in a louzy quality of
implementation.

We start with an instruction in the program being debugged. In
principle before that instruction starts we know that no signals
are pending because they were not delivered to that process.

If we for some reason send signal A to the process and at the same time
hit a fault that is reported as signal A. It is currently a race which
one wins. I think we could legitimately say that the fault happened
before signal A was enqueued, and deliver both. It is a bit murkier if
signal A was blocked.

If we let the enqueued signal A win (as we do today) we have a siginfo
that is not useful for describing the fault the instruction generated.
Which is a really lousy quality of implementation.

Which is a long way of saying I think that hunk of code is useful as it
allows us the possibility of fixing a lousy quality of implementation in
our code today.

Eric

Eric W. Biederman

unread,
Feb 11, 2019, 7:42:56 PM2/11/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
Well it changes PTRACE_EVENT_EXIT I grant that. It looks like that
changes makes PTRACE_EVENT_EXIT is less than useful.

The only way to perfectly preserve the previous semantics is probably to
do something like my JOBCTL_TASK_EXIT proposal.

That said I don't think even adding a JOBCTL_TASK_EXIT is enough to have
a reliable stop of ptrace_event_exit after a process has exited. As any
other pending signal can cause problems there as well.

I have received a report that strace -f in some cases is not noticing
children before they die and it looks like a stop in PTRACE_EVENT_EXIT
would fix that strace behavior.

Sigh.

Here I was trying for the simple minimal change and I hit this landmine.
Which leaves me with the question of what should be semantics of signal
handling after exit.

I think from dim memory of previous conversations the desired semantics
look like:
a) Ignore all signal state except for SIGKILL.
b) Letting SIGKILL wake up the process should be sufficient.

I will see if I can reproduce the strace failure and see if I can cook
up something minimal that addresses just that. If you have suggestions
I would love to hear them.

As this was a minimal fix for SIGKILL being broken I have already sent
the fix to Linus. So we are looking at an incremental fix at this point.

Eric

Eric W. Biederman

unread,
Feb 12, 2019, 3:19:13 AM2/12/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
ebie...@xmission.com (Eric W. Biederman) writes:

In my testing I found something that concerns me. Because we wind up
with SIGKILL in shared_pending we can not kill a process in do_exit that
has stopped at PTRACE_EVENT_EXIT. That bug seems to go back a long ways.

Other than that, it looks like we can do the following to fix the
regression I introduced.

Oleg any ideas on how to make PTRACE_EVENT_EXIT reliably killable?

diff --git a/kernel/signal.c b/kernel/signal.c
index 99fa8ff06fd9..a1f154dca73c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2544,6 +2544,9 @@ bool get_signal(struct ksignal *ksig)
}

fatal:
+ /* No more signals can be pending past this point */
+ sigdelset(&current->pending.signal, SIGKILL);
+ clear_tsk_thread_flag(current, TIF_SIGPENDING);
spin_unlock_irq(&sighand->siglock);

/*

Eric

Oleg Nesterov

unread,
Feb 12, 2019, 11:50:30 AM2/12/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
On 02/12, Eric W. Biederman wrote:
>
> > Here I was trying for the simple minimal change and I hit this landmine.
> > Which leaves me with the question of what should be semantics of signal
> > handling after exit.

Yes, currently it is undefined. Even signal_pending() is random.

> > I think from dim memory of previous conversations the desired semantics
> > look like:
> > a) Ignore all signal state except for SIGKILL.
> > b) Letting SIGKILL wake up the process should be sufficient.

signal_wake_up(true) to make fatal_signal_pending() == T, I think.

> Oleg any ideas on how to make PTRACE_EVENT_EXIT reliably killable?

My answer is very simple: PTRACE_EVENT_EXIT must not stop if the tracee was
killed by the "real" SIGKILL (not by group_exit/etc), that is all. But this
is another user-visible change, it can equally confuse, say, strace (albeit
not too much iiuc).

But this needs another discussion.

> diff --git a/kernel/signal.c b/kernel/signal.c
> index 99fa8ff06fd9..a1f154dca73c 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -2544,6 +2544,9 @@ bool get_signal(struct ksignal *ksig)
> }
>
> fatal:
> + /* No more signals can be pending past this point */
> + sigdelset(&current->pending.signal, SIGKILL);

Well, this is very confusing. In fact, this is not really correct. Say, we should
not remove the pending SIGKILL if we are going to call do_coredump(). This is
possible if ptrace_signal() was called, or after is_current_pgrp_orphaned() returns
false.

> + clear_tsk_thread_flag(current, TIF_SIGPENDING);

I don't understand this change, it looks irrelevant. Possibly makes sense, but
this connects to "semantics of signal handling after exit".

OK, we need a minimal incremental fix for now. I'd suggest to replace

ksig->info.si_signo = signr = SIGKILL;
if (signal_group_exit(signal))
goto fatal;

added by this patch with

if (__fatal_signal_pending(current)) {
ksig->info.si_signo = signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
goto fatal;
}

__fatal_signal_pending() is cheaper and looks more understandable.

Oleg.

Oleg Nesterov

unread,
Feb 12, 2019, 12:21:11 PM2/12/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller
On 02/11, Eric W. Biederman wrote:
I doubt this would be really useful but this doesn't matter right now,

> Which is a long way of saying I think that hunk of code is useful as it
> allows us the possibility of fixing a lousy quality of implementation in
> our code today.

If we ever rework the legacy_queue() logic we can easily add this hunk back.

Until then it complicates the code for no reason imo, just to confuse the reader.

Oleg.

Eric W. Biederman

unread,
Feb 12, 2019, 10:59:12 PM2/12/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
Oleg Nesterov <ol...@redhat.com> writes:

> On 02/12, Eric W. Biederman wrote:
>>
>> > Here I was trying for the simple minimal change and I hit this landmine.
>> > Which leaves me with the question of what should be semantics of signal
>> > handling after exit.
>
> Yes, currently it is undefined. Even signal_pending() is random.
>
>> > I think from dim memory of previous conversations the desired semantics
>> > look like:
>> > a) Ignore all signal state except for SIGKILL.
>> > b) Letting SIGKILL wake up the process should be sufficient.
>
> signal_wake_up(true) to make fatal_signal_pending() == T, I think.
>
>> Oleg any ideas on how to make PTRACE_EVENT_EXIT reliably killable?
>
> My answer is very simple: PTRACE_EVENT_EXIT must not stop if the tracee was
> killed by the "real" SIGKILL (not by group_exit/etc), that is all. But this
> is another user-visible change, it can equally confuse, say, strace (albeit
> not too much iiuc).
>
> But this needs another discussion.

Yes. Quite.

I will just point out that as described that logic will rebreak Ivan's
program.

>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index 99fa8ff06fd9..a1f154dca73c 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -2544,6 +2544,9 @@ bool get_signal(struct ksignal *ksig)
>> }
>>
>> fatal:
>> + /* No more signals can be pending past this point */
>> + sigdelset(&current->pending.signal, SIGKILL);
>
> Well, this is very confusing. In fact, this is not really correct. Say, we should
> not remove the pending SIGKILL if we are going to call do_coredump(). This is
> possible if ptrace_signal() was called, or after is_current_pgrp_orphaned() returns
> false.

I don't see bugs in it. But it is certainly subtle and that is not what
is needed right now.

The subtlety is that we will never have a per thread SIGKILL pending
unless signal_group_exit is true. So removing when it is not there is harmless.

>> + clear_tsk_thread_flag(current, TIF_SIGPENDING);
>
> I don't understand this change, it looks irrelevant. Possibly makes sense, but
> this connects to "semantics of signal handling after exit".

As on the other hand the location is too subtle for the regression fix.

The primary motivation is that dequeue_signal calls recalc_sigpending.
And in the common case that will result in clearing TIF_SIGPENDING
which will result in signal_pending being false.

I have not found a location that cares enough to cause a misbehavior
if we don't clear TIF_SIGPENDING but it is a practical change and there
might be. So if the word of the day is to be very conservative and
avoid landmines I expect we need the clearing of TIF_SIGPENDING.

Hmm. Probably using recalc_sigpending() now that I think about it.

> OK, we need a minimal incremental fix for now. I'd suggest to replace
>
> ksig->info.si_signo = signr = SIGKILL;
> if (signal_group_exit(signal))
> goto fatal;
>
> added by this patch with
>
> if (__fatal_signal_pending(current)) {
> ksig->info.si_signo = signr = SIGKILL;
> sigdelset(&current->pending.signal, SIGKILL);
> goto fatal;
> }
>
> __fatal_signal_pending() is cheaper and looks more understandable.


I definitely agree that it is much less likely to cause a problem
if we move all of the work before jumping to fatal.

The cost of both __fatal_signal_pending and signal_group_exit is just
a cache line read. So not a big deal wither way.

On the other hand __fatal_signal_pending as currently implemented is
insanely subtle and arguably a bit confusing. It tests for a SIGKILL in
the current pending sigset, to discover the signal group property of
whether a process has started exiting.

In the long run we need our data structures not to be subtle and
tricky to use. To do that we need a test of something in signal_struct
because it is a per signal group property. Further we need to remove
the abuse of the per thread SIGKILL.

Since signal_group_exit always implies __fatal_signal_pending in this
case and the reverse. I see no reason to use a function that requires
we maintain a huge amount of confusing and unnecessary machinery to keep
working.

All of that plus the signal_group_exit test has been tested and shown to
fix an ignored SIGKILL and the only practical problem is it doesn't do
one or two little things that dequeue_signal has done that made it
impossible to stop in PTRACE_EVENT_EXIT.

So for the regression fix let's just do the few little things that
dequeue_signal used to do. That gives us a strong guarantee that
nothing else was missed.

Eric

Eric W. Biederman

unread,
Feb 12, 2019, 11:09:22 PM2/12/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande

In the middle of do_exit() there is there is a call
"ptrace_event(PTRACE_EVENT_EXIT, code);" That call places the process
in TASK_TRACED aka "(TASK_WAKEKILL | __TASK_TRACED)" and waits
for the debugger to release the task or SIGKILL to be delivered.

Skipping past dequeue_signal when we know a fatal signal has already
been delivered resulted in SIGKILL remaining pending and
TIF_SIGPENDING remaining set. This in turn caused the
scheduler to not sleep in PTRACE_EVENT_EXIT as it figured
a fatal signal was pending. This also caused ptrace_freeze_traced
in ptrace_check_attach to fail because it left a per thread
SIGKILL pending which is what fatal_signal_pending tests for.

This difference in signal state caused strace to report
strace: Exit of unknown pid NNNNN ignored

Therefore update the signal handling state like dequeue_signal
would when removing a per thread SIGKILL, by removing SIGKILL
from the per thread signal mask and clearing TIF_SIGPENDING.

Reported-by: Oleg Nesterov <ol...@redhat.com>
Reported-by: Ivan Delalande <col...@arista.com>
Cc: sta...@vger.kernel.org
Fixes: 35634ffa1751 ("signal: Always notice exiting tasks")
Signed-off-by: "Eric W. Biederman" <ebie...@xmission.com>
---

Ivan this change fixes the issues you reported to me. Can you confirm?
Oleg this looks like the most conservative regression fix I can manage.

kernel/signal.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 99fa8ff06fd9..57b7771e20d7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2436,9 +2436,12 @@ bool get_signal(struct ksignal *ksig)
}

/* Has this task already been marked for death? */
- ksig->info.si_signo = signr = SIGKILL;
- if (signal_group_exit(signal))
+ if (signal_group_exit(signal)) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ recalc_sigpending();
goto fatal;
+ }

for (;;) {
struct k_sigaction *ka;
--
2.17.1

Oleg Nesterov

unread,
Feb 13, 2019, 8:56:03 AM2/13/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
On 02/12, Eric W. Biederman wrote:
>
> Oleg this looks like the most conservative regression fix I can manage.

This is what I tried to suggest.

Except I still think that __fatal_signal_pending() would look better.

Yes, yes, in the long term we can possibly even kill fatal_signal_pending()
by various reasons I am not going to discuss right now (although I'd prefer
to kill signal_group_exit() instead).

But then we will probably need to reconsider sigdelset(SIGKILL) added by
this patch too. And until then

if (__fatal_signal_pending())
sigdelset(SIGKILL);

looks much more clear and understandable to me.

But I am not going to argue, feel free to add my ack.

Oleg Nesterov

unread,
Feb 13, 2019, 9:38:58 AM2/13/19
to Eric W. Biederman, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
sorry for noise, but after I read the changelog I have a minor nit,
feel free to ignore...

On 02/12, Eric W. Biederman wrote:
>
> Skipping past dequeue_signal when we know a fatal signal has already
> been delivered resulted in SIGKILL remaining pending and
> TIF_SIGPENDING remaining set. This in turn caused the
> scheduler to not sleep in PTRACE_EVENT_EXIT as it figured
> a fatal signal was pending.

Yes, but the status of TIF_SIGPENDING doesn't matter. However I agree
with recalc_sigpending() added by this patch, simply because this is what
the "normal" dequeue_signal() paths do.

> This also caused ptrace_freeze_traced
> in ptrace_check_attach to fail because it left a per thread
> SIGKILL pending which is what fatal_signal_pending tests for.

this is possible too, but in the likely case ptrace_check_attach() won't
be even called exactly because the tracee won't stop and thus waitpid()
won't report WIFSTOPPED. And even if waitpid() "wins" the race and debugger
calls ptrace(), most probably ptrace_freeze_traced() will fail because
task_is_traced() will be false.

I think this part of the changelog looks a bit confusing. It doesn't matter
why ptrace_check_attach() fails, it must fail if the tracee didn't stop.

PTRACE_EVENT_EXIT won't stop and thus this event won't be reported, that is all.

again, feel free to ignore.

Oleg.

Eric W. Biederman

unread,
Feb 13, 2019, 9:58:37 AM2/13/19
to Oleg Nesterov, Dmitry Vyukov, Thomas Gleixner, Ingo Molnar, Peter Zijlstra, LKML, Arnaldo Carvalho de Melo, Alexander Shishkin, jo...@redhat.com, Namhyung Kim, luca abeni, syzkaller, Ivan Delalande
There might be a better way to say it. What I meant to convey is that
in my testing I could get PTRACE_EVENT_EXIT to stop by clearing
TIF_SIGPENDING (and leaving SIGKILL in pending). That was insufficient
to fix the bug. Without SIGKILL in pending ptrace_check_attach would
fail when the process was stopped in PTRACE_EVENT_EXIT.



Or in short it really bugs me that we have to have signal_group_exit
and fatal_signal_pending not in agreement after in do_exit to make the code
work. It bothers me because fatal_signal_pending and signal_group_exit
are in all other cases just different ways to ask the same question.

Eric


Reply all
Reply to author
Forward
0 new messages