kvm: use-after-free in process_srcu


Dmitry Vyukov

Dec 11, 2016, 1:46:33 AM
to Steve Rutherford, Paolo Bonzini, Radim Krčmář, KVM list, LKML, syzkaller
Hello,

I am getting the following use-after-free reports while running
syzkaller fuzzer.
On commit 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7).
Unfortunately it is not reproducible, but all reports look sane and
very similar, so I assume it is some hard-to-trigger race.
In all cases the use-after-free offset within struct kvm is 344 bytes.
This points to the srcu field, which starts at offset 208 with size 360
(I have some debug configs enabled).
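
For reference, the layout can be double-checked with a one-off debug
print like the following (illustrative only, e.g. dropped into
kvm_dev_ioctl_create_vm; not something that is in the report):

	pr_info("kvm.srcu: offset %zu, size %zu\n",
		offsetof(struct kvm, srcu), sizeof(((struct kvm *)0)->srcu));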


BUG: KASAN: use-after-free in process_srcu+0x27a/0x280 at addr ffff88005e29a418
Read of size 8 by task kworker/3:1/1496
CPU: 3 PID: 1496 Comm: kworker/3:1 Not tainted 4.9.0-rc8+ #78
Hardware name: Google Google/Google, BIOS Google 01/01/2011
Workqueue: events_power_efficient process_srcu
ffff88006b1df3a0 ffffffff8348fb59 ffffffff00000003 1ffff1000d63be07
ffffed000d63bdff 0000000041b58ab3 ffffffff8957cf20 ffffffff8348f86b
ffff8800668dc440 ffffffff8816c000 1ffff1000d63be18 dffffc0000000000
Call Trace:
[< inline >] __dump_stack lib/dump_stack.c:15
[<ffffffff8348fb59>] dump_stack+0x2ee/0x3f5 lib/dump_stack.c:51
[<ffffffff819f1c21>] kasan_object_err+0x21/0x70 mm/kasan/report.c:163
[< inline >] print_address_description mm/kasan/report.c:201
[< inline >] kasan_report_error mm/kasan/report.c:285
[<ffffffff819f1ec1>] kasan_report+0x1a1/0x440 mm/kasan/report.c:305
[<ffffffff819f21d9>] __asan_report_load8_noabort+0x19/0x20
mm/kasan/report.c:331
[< inline >] rcu_batch_empty kernel/rcu/srcu.c:64
[< inline >] rcu_batch_dequeue kernel/rcu/srcu.c:75
[< inline >] srcu_invoke_callbacks kernel/rcu/srcu.c:624
[<ffffffff815ba91a>] process_srcu+0x27a/0x280 kernel/rcu/srcu.c:672
[<ffffffff814731c0>] process_one_work+0xb40/0x1ba0 kernel/workqueue.c:2096
[<ffffffff81474434>] worker_thread+0x214/0x18a0 kernel/workqueue.c:2230
[<ffffffff8148a058>] kthread+0x328/0x3e0 kernel/kthread.c:209
[<ffffffff8816c61a>] ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:433
Object at ffff88005e29a2c0, in cache kmalloc-16384 size: 16384
Allocated:
PID = 13066
[ 376.024345] [<ffffffff81270fdb>] save_stack_trace+0x1b/0x20
arch/x86/kernel/stacktrace.c:57
[ 376.024345] [<ffffffff819f0ea3>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
[ 376.024345] [< inline >] set_track mm/kasan/kasan.c:507
[ 376.024345] [<ffffffff819f116d>] kasan_kmalloc+0xad/0xe0
mm/kasan/kasan.c:598
[ 376.024345] [<ffffffff819ed4ec>]
kmem_cache_alloc_trace+0x12c/0x710 mm/slab.c:3635
[ 376.024345] [< inline >] kvm_arch_alloc_vm include/linux/slab.h:490
[ 376.024345] [< inline >] kvm_create_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:610
[ 376.024345] [< inline >] kvm_dev_ioctl_create_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:3164
[ 376.024345] [<ffffffff810789d5>] kvm_dev_ioctl+0x1b5/0x1100
arch/x86/kvm/../../../virt/kvm/kvm_main.c:3208
[ 376.024345] [< inline >] vfs_ioctl fs/ioctl.c:43
[ 376.024345] [<ffffffff81abdf24>] do_vfs_ioctl+0x1c4/0x1630 fs/ioctl.c:679
[ 376.024345] [< inline >] SYSC_ioctl fs/ioctl.c:694
[ 376.024345] [<ffffffff81abf424>] SyS_ioctl+0x94/0xc0 fs/ioctl.c:685
[ 376.024345] [<ffffffff8816c385>] entry_SYSCALL_64_fastpath+0x23/0xc6
Freed:
PID = 13064
[ 376.024345] [<ffffffff81270fdb>] save_stack_trace+0x1b/0x20
arch/x86/kernel/stacktrace.c:57
[ 376.024345] [<ffffffff819f0ea3>] save_stack+0x43/0xd0 mm/kasan/kasan.c:495
[ 376.024345] [< inline >] set_track mm/kasan/kasan.c:507
[ 376.024345] [<ffffffff819f17c2>] kasan_slab_free+0x72/0xc0
mm/kasan/kasan.c:571
[ 376.024345] [< inline >] __cache_free mm/slab.c:3511
[ 376.024345] [<ffffffff819ef3e8>] kfree+0xc8/0x2a0 mm/slab.c:3828
[ 376.024345] [< inline >] kvm_arch_free_vm
include/linux/kvm_host.h:774
[ 376.024345] [< inline >] kvm_destroy_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:739
[ 376.024345] [<ffffffff8106fcd9>] kvm_put_kvm+0x489/0x5f0
arch/x86/kvm/../../../virt/kvm/kvm_main.c:754
[ 376.024345] [<ffffffff8106ff97>] kvm_vm_release+0x47/0x60
arch/x86/kvm/../../../virt/kvm/kvm_main.c:765
[ 376.024345] [<ffffffff81a77f7e>] __fput+0x34e/0x910 fs/file_table.c:208
[ 376.024345] [<ffffffff81a785ca>] ____fput+0x1a/0x20 fs/file_table.c:244
[ 376.024345] [<ffffffff81483c20>] task_work_run+0x1a0/0x280
kernel/task_work.c:116
[ 376.024345] [< inline >] exit_task_work include/linux/task_work.h:21
[ 376.024345] [<ffffffff814129e2>] do_exit+0x1842/0x2650 kernel/exit.c:828
[ 376.024345] [<ffffffff814139ae>] do_group_exit+0x14e/0x420 kernel/exit.c:932
[ 376.024345] [<ffffffff81442b43>] get_signal+0x663/0x1880
kernel/signal.c:2307
[ 376.024345] [<ffffffff81239b45>] do_signal+0xc5/0x2190
arch/x86/kernel/signal.c:807
[ 376.024345] [<ffffffff8100666a>] exit_to_usermode_loop+0x1ea/0x2d0
arch/x86/entry/common.c:156
[ 376.024345] [< inline >] prepare_exit_to_usermode
arch/x86/entry/common.c:190
[ 376.024345] [<ffffffff81009693>]
syscall_return_slowpath+0x4d3/0x570 arch/x86/entry/common.c:259
[ 376.024345] [<ffffffff8816c426>] entry_SYSCALL_64_fastpath+0xc4/0xc6
Memory state around the buggy address:
ffff88005e29a300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88005e29a380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88005e29a400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff88005e29a480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88005e29a500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================

Vegard Nossum

Dec 11, 2016, 3:40:31 AM
to Dmitry Vyukov, Steve Rutherford, Paolo Bonzini, Radim Krčmář, KVM list, LKML, syzkaller
On 11 December 2016 at 07:46, Dmitry Vyukov <dvy...@google.com> wrote:
> Hello,
>
> I am getting the following use-after-free reports while running
> syzkaller fuzzer.
> On commit 318c8932ddec5c1c26a4af0f3c053784841c598e (Dec 7).
> Unfortunately it is not reproducible, but all reports look sane and
> very similar, so I would assume that it is some hard to trigger race.
> In all cases the use-after-free offset within struct kvm is 344 bytes.
> This points to srcu field, which starts at 208 with size 360 (I have
> some debug configs enabled).
[...]
> [ 376.024345] [<ffffffff81a77f7e>] __fput+0x34e/0x910 fs/file_table.c:208
> [ 376.024345] [<ffffffff81a785ca>] ____fput+0x1a/0x20 fs/file_table.c:244

I've been hitting what I think is a struct file refcounting bug that
causes symptoms similar to what you have here (the struct file is freed
while somebody still holds an active reference to it).

> [ 376.024345] [<ffffffff81483c20>] task_work_run+0x1a0/0x280
> kernel/task_work.c:116
> [ 376.024345] [< inline >] exit_task_work include/linux/task_work.h:21
> [ 376.024345] [<ffffffff814129e2>] do_exit+0x1842/0x2650 kernel/exit.c:828
> [ 376.024345] [<ffffffff814139ae>] do_group_exit+0x14e/0x420 kernel/exit.c:932
> [ 376.024345] [<ffffffff81442b43>] get_signal+0x663/0x1880
> kernel/signal.c:2307
> [ 376.024345] [<ffffffff81239b45>] do_signal+0xc5/0x2190
> arch/x86/kernel/signal.c:807

Was this or any other process by any chance killed by the OOM killer?
That seems to be a pattern in the crashes I've seen. If not, do you
know what killed this process?


Vegard

Dmitry Vyukov

Dec 11, 2016, 3:50:12 AM
to syzkaller, Steve Rutherford, Paolo Bonzini, Radim Krčmář, KVM list, LKML
Difficult to say as I can't reproduce them.
I've looked at the logs I have and there are no OOM kills, only some
kvm-related messages:

[ 372.188708] kvm [12528]: vcpu0, guest rIP: 0xfff0
kvm_set_msr_common: MSR_IA32_DEBUGCTLMSR 0x2, nop
[ 372.321334] kvm [12528]: vcpu0, guest rIP: 0xfff0 unhandled wrmsr:
0x0 data 0x0
[ 372.426831] kvm [12593]: vcpu512, guest rIP: 0xfff0 unhandled
wrmsr: 0x5 data 0x200
[ 372.646417] irq bypass consumer (token ffff880052f74780)
registration fails: -16
[ 373.001273] pit: kvm: requested 1676 ns i8254 timer period limited
to 500000 ns
[ 375.541449] kvm [13011]: vcpu0, guest rIP: 0x110000 unhandled
wrmsr: 0x0 data 0x2
[ 376.005387] ==================================================================
[ 376.024345] BUG: KASAN: use-after-free in process_srcu+0x27a/0x280
at addr ffff88005e29a418

[ 720.214985] kvm: vcpu 0: requested 244148 ns lapic timer period
limited to 500000 ns
[ 720.271334] kvm: vcpu 0: requested 244148 ns lapic timer period
limited to 500000 ns
[ 720.567985] kvm_vm_ioctl_assign_device: host device not found
[ 721.094589] kvm [22114]: vcpu0, guest rIP: 0x2 unhandled wrmsr: 0x6 data 0x8
[ 723.829467] ==================================================================
[ 723.829467] BUG: KASAN: use-after-free in process_srcu+0x27a/0x280
at addr ffff88005a4d10d8

The logs capture ~3-4 seconds before the crash.
However, syzkaller test processes tend to consume lots of memory from
time to time and cause low-memory conditions.

Kills are usually caused by my test driver, which kills test processes
after a short time.

However, I do see other assorted bugs caused by kvm that are induced
by OOM kills:
https://groups.google.com/d/msg/syzkaller/ytVPh93HLnI/KhZdengZBwAJ

Steve Rutherford

Jan 12, 2017, 10:31:07 PM
to Dmitry Vyukov, syzkaller, Paolo Bonzini, Radim Krčmář, KVM list, LKML
I'm not that familiar with the kernel's workqueues, but this seems
like the classic "callback outlives the memory it references"
use-after-free, where the process_srcu callback is outliving struct
kvm (which contains the srcu_struct). If that's right, then calling
srcu_barrier (which should wait for all of the call_srcu callbacks to
complete, which are what enqueue the process_srcu callbacks) before
cleanup_srcu_struct in kvm_destroy_vm probably fixes this.

The corresponding patch to virt/kvm/kvm_main.c looks something like:
static void kvm_destroy_vm(struct kvm *kvm)
{
	...
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, kvm->memslots[i]);
+	srcu_barrier(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->irq_srcu);
+	srcu_barrier(&kvm->srcu);
	cleanup_srcu_struct(&kvm->srcu);
	...


Since we don't have a repro, this obviously won't be readily testable.
I find srcu subtle enough that I don't trust my reasoning fully (in
particular, I don't trust that waiting for all of the call_srcu
callbacks to complete also waits for all of the process_srcu
callbacks). Does someone else know whether that's the case?

Steve

Dmitry Vyukov

Jan 13, 2017, 4:20:01 AM
to Steve Rutherford, syzkaller, Paolo Bonzini, Radim Krčmář, KVM list, LKML
On Fri, Jan 13, 2017 at 4:30 AM, Steve Rutherford
<sruth...@google.com> wrote:
> I'm not that familiar with the kernel's workqueues, but this seems
> like the classic "callback outlives the memory it references"
> use-after-free, where the process_srcu callback is outliving struct
> kvm (which contains the srcu_struct). If that's right, then calling
> srcu_barrier (which should wait for all of the call_srcu callbacks to
> complete, which are what enqueue the process_srcu callbacks) before
> cleanup_srcu_struct in kvm_destroy_vm probably fixes this.
>
> The corresponding patch to virt/kvm/kvm_main.c looks something like:
> static void kvm_destroy_vm(struct kvm *kvm)
> {
> ...
> for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
> kvm_free_memslots(kvm, kvm->memslots[i]);
> + srcu_barrier(&kvm->irq_srcu);
> cleanup_srcu_struct(&kvm->irq_srcu);
> + srcu_barrier(&kvm->srcu);
> cleanup_srcu_struct(&kvm->srcu);
> ...
>
>
> Since we don't have a repro, this obviously won't be readily testable.
> I find srcu subtle enough that I don't trust my reasoning fully (in
> particular, I don't trust that waiting for all of the call_srcu
> callbacks to complete also waits for all of the process_srcu
> callbacks). Someone else know if that's the case?


From the function description it looks like it should do the trick:

514 /**
515 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
516 * @sp: srcu_struct on which to wait for in-flight callbacks.
517 */
518 void srcu_barrier(struct srcu_struct *sp)

I see this failure happening several times per day. I've applied your
patch locally and will check whether these failures keep happening.

Dmitry Vyukov

Jan 15, 2017, 12:11:30 PM
to Steve Rutherford, syzkaller, Paolo Bonzini, Radim Krčmář, KVM list, LKML
I have not seen the crash in 3 days, whereas I usually see several
crashes per night. So I think we can consider that the patch fixes the
crash:

Tested-by: Dmitry Vyukov <dvy...@google.com>

Dmitry Vyukov

Jan 16, 2017, 4:34:47 PM
to Steve Rutherford, syzkaller, Paolo Bonzini, Radim Krčmář, KVM list, LKML
Unfortunately I hit it again with the patch applied. It definitely
happens less frequently now, but still happens:


BUG: KASAN: use-after-free in rcu_batch_empty kernel/rcu/srcu.c:64
[inline] at addr ffff88006ada0218
BUG: KASAN: use-after-free in rcu_batch_dequeue kernel/rcu/srcu.c:75
[inline] at addr ffff88006ada0218
BUG: KASAN: use-after-free in srcu_invoke_callbacks
kernel/rcu/srcu.c:624 [inline] at addr ffff88006ada0218
BUG: KASAN: use-after-free in process_srcu+0x275/0x280
kernel/rcu/srcu.c:672 at addr ffff88006ada0218
Read of size 8 by task kworker/2:2/1073
CPU: 2 PID: 1073 Comm: kworker/2:2 Not tainted 4.10.0-rc3+ #164
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Workqueue: events_power_efficient process_srcu
Call Trace:
__dump_stack lib/dump_stack.c:15 [inline]
dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
kasan_object_err+0x1c/0x70 mm/kasan/report.c:165
print_address_description mm/kasan/report.c:203 [inline]
kasan_report_error mm/kasan/report.c:287 [inline]
kasan_report+0x1b6/0x460 mm/kasan/report.c:307
__asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:333
rcu_batch_empty kernel/rcu/srcu.c:64 [inline]
rcu_batch_dequeue kernel/rcu/srcu.c:75 [inline]
srcu_invoke_callbacks kernel/rcu/srcu.c:624 [inline]
process_srcu+0x275/0x280 kernel/rcu/srcu.c:672
process_one_work+0xbd0/0x1c10 kernel/workqueue.c:2098
worker_thread+0x223/0x1990 kernel/workqueue.c:2232
kthread+0x326/0x3f0 kernel/kthread.c:227
ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:427
Object at ffff88006ada00c0, in cache kmalloc-16384 size: 16384
Allocated:
PID = 32313
[<ffffffff812b2686>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
[<ffffffff81a0e3d3>] save_stack+0x43/0xd0 mm/kasan/kasan.c:502
[<ffffffff81a0e69a>] set_track mm/kasan/kasan.c:514 [inline]
[<ffffffff81a0e69a>] kasan_kmalloc+0xaa/0xd0 mm/kasan/kasan.c:605
[<ffffffff81a0aeeb>] kmem_cache_alloc_trace+0x10b/0x670 mm/slab.c:3629
[<ffffffff8107910e>] kvm_arch_alloc_vm include/linux/slab.h:490 [inline]
[<ffffffff8107910e>] kvm_create_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:613 [inline]
[<ffffffff8107910e>] kvm_dev_ioctl_create_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:3175 [inline]
[<ffffffff8107910e>] kvm_dev_ioctl+0x1be/0x11b0
arch/x86/kvm/../../../virt/kvm/kvm_main.c:3219
[<ffffffff81aa55bf>] vfs_ioctl fs/ioctl.c:43 [inline]
[<ffffffff81aa55bf>] do_vfs_ioctl+0x1bf/0x1790 fs/ioctl.c:683
[<ffffffff81aa6c1f>] SYSC_ioctl fs/ioctl.c:698 [inline]
[<ffffffff81aa6c1f>] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:689
[<ffffffff841c9941>] entry_SYSCALL_64_fastpath+0x1f/0xc2
Freed:
PID = 32310
[<ffffffff812b2686>] save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
[<ffffffff81a0e3d3>] save_stack+0x43/0xd0 mm/kasan/kasan.c:502
[<ffffffff81a0ed0f>] set_track mm/kasan/kasan.c:514 [inline]
[<ffffffff81a0ed0f>] kasan_slab_free+0x6f/0xb0 mm/kasan/kasan.c:578
[<ffffffff81a0cbc3>] __cache_free mm/slab.c:3505 [inline]
[<ffffffff81a0cbc3>] kfree+0xd3/0x250 mm/slab.c:3822
[<ffffffff81070579>] kvm_arch_free_vm include/linux/kvm_host.h:781 [inline]
[<ffffffff81070579>] kvm_destroy_vm
arch/x86/kvm/../../../virt/kvm/kvm_main.c:744 [inline]
[<ffffffff81070579>] kvm_put_kvm+0x719/0x9a0
arch/x86/kvm/../../../virt/kvm/kvm_main.c:759
[<ffffffff81070952>] kvm_vm_release+0x42/0x50
arch/x86/kvm/../../../virt/kvm/kvm_main.c:770
[<ffffffff81a5f872>] __fput+0x332/0x7f0 fs/file_table.c:208
[<ffffffff81a5fdb5>] ____fput+0x15/0x20 fs/file_table.c:244
[<ffffffff814ca25a>] task_work_run+0x18a/0x260 kernel/task_work.c:116
[<ffffffff81454a57>] exit_task_work include/linux/task_work.h:21 [inline]
[<ffffffff81454a57>] do_exit+0x18e7/0x28a0 kernel/exit.c:839
[<ffffffff8145a469>] do_group_exit+0x149/0x420 kernel/exit.c:943
[<ffffffff81489730>] get_signal+0x7e0/0x1820 kernel/signal.c:2313
[<ffffffff8127ca92>] do_signal+0xd2/0x2190 arch/x86/kernel/signal.c:807
[<ffffffff81007900>] exit_to_usermode_loop+0x200/0x2a0
arch/x86/entry/common.c:156
[<ffffffff81009413>] prepare_exit_to_usermode
arch/x86/entry/common.c:190 [inline]
[<ffffffff81009413>] syscall_return_slowpath+0x4d3/0x570
arch/x86/entry/common.c:259
[<ffffffff841c99e2>] entry_SYSCALL_64_fastpath+0xc0/0xc2
Memory state around the buggy address:
ffff88006ada0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88006ada0180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>ffff88006ada0200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^
ffff88006ada0280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88006ada0300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
==================================================================
Disabling lock debugging due to kernel taint
Kernel panic - not syncing: panic_on_warn set ...

CPU: 2 PID: 1073 Comm: kworker/2:2 Tainted: G B 4.10.0-rc3+ #164
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Workqueue: events_power_efficient process_srcu
Call Trace:
__dump_stack lib/dump_stack.c:15 [inline]
dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
panic+0x1fb/0x412 kernel/panic.c:179
kasan_end_report+0x5b/0x60 mm/kasan/report.c:144
kasan_report_error mm/kasan/report.c:291 [inline]
kasan_report+0x386/0x460 mm/kasan/report.c:307
__asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:333
rcu_batch_empty kernel/rcu/srcu.c:64 [inline]
rcu_batch_dequeue kernel/rcu/srcu.c:75 [inline]
srcu_invoke_callbacks kernel/rcu/srcu.c:624 [inline]
process_srcu+0x275/0x280 kernel/rcu/srcu.c:672
process_one_work+0xbd0/0x1c10 kernel/workqueue.c:2098
worker_thread+0x223/0x1990 kernel/workqueue.c:2232
kthread+0x326/0x3f0 kernel/kthread.c:227
ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:427
Dumping ftrace buffer:
(ftrace buffer empty)
Kernel Offset: disabled

Paolo Bonzini

Jan 16, 2017, 4:48:27 PM
to Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
Try this:

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..ef5599c65299 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -283,6 +283,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 {
 	if (WARN_ON(srcu_readers_active(sp)))
 		return; /* Leakage unless caller handles error. */
+	flush_delayed_work(&sp->work);
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }

I think it should subsume Steve's patch, but I'm not 101% sure. We
will have to run this through Paul.

Paolo

Dmitry Vyukov

Jan 17, 2017, 4:48:13 AM
to Paolo Bonzini, Paul McKenney, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
Hi +Paul,

I am seeing use-after-frees in process_srcu because the struct
srcu_struct has already been freed. Before freeing it, the code calls
cleanup_srcu_struct(&kvm->irq_srcu). We also tried to do:

+	srcu_barrier(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->irq_srcu);

It reduced the rate of use-after-frees, but did not eliminate them
completely. The full thread is here:
https://groups.google.com/forum/#!msg/syzkaller/i48YZ8mwePY/0PQ8GkQTBwAJ

Does Paolo's fix above make sense to you? Namely adding
flush_delayed_work(&sp->work) to cleanup_srcu_struct()?

Dmitry Vyukov

Jan 17, 2017, 4:56:55 AM
to Paolo Bonzini, Paul McKenney, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
I am not sure about the interaction between flush_delayed_work and
srcu_reschedule... flush_delayed_work probably assumes that no work is
queued concurrently, but what if srcu_reschedule queues another work
item concurrently... can't flush_delayed_work then miss that newly
scheduled work?
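
Roughly the interleaving I am worried about (just a sketch of my
understanding; names and layout are approximate):

	cleanup_srcu_struct                 workqueue worker
	  flush_delayed_work(&sp->work)
	    // waits for the running
	    // process_srcu instance
	                                    process_srcu
	                                      ...
	                                      srcu_reschedule
	                                        queue_delayed_work  // re-queues sp->work
	    // returns once the old
	    // instance finishes
	  kfree(kvm)  // srcu_struct gone
	                                    process_srcu  // new instance -> use-after-free?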

Meanwhile I will apply this change instead of Steve's change and see
what happens.

Paolo Bonzini

Jan 17, 2017, 6:08:32 AM
to Dmitry Vyukov, Paul McKenney, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML


On 17/01/2017 10:56, Dmitry Vyukov wrote:
>> I am seeing use-after-frees in process_srcu as struct srcu_struct is
>> already freed. Before freeing struct srcu_struct, code does
>> cleanup_srcu_struct(&kvm->irq_srcu). We also tried to do:
>>
>> + srcu_barrier(&kvm->irq_srcu);
>> cleanup_srcu_struct(&kvm->irq_srcu);
>>
>> It reduced rate of use-after-frees, but did not eliminate them
>> completely. The full threaded is here:
>> https://groups.google.com/forum/#!msg/syzkaller/i48YZ8mwePY/0PQ8GkQTBwAJ
>>
>> Does Paolo's fix above make sense to you? Namely adding
>> flush_delayed_work(&sp->work) to cleanup_srcu_struct()?
>
> I am not sure about interaction of flush_delayed_work and
> srcu_reschedule... flush_delayed_work probably assumes that no work is
> queued concurrently, but what if srcu_reschedule queues another work
> concurrently... can't it happen that flush_delayed_work will miss that
> newly scheduled work?

Newly scheduled callbacks would be a bug in SRCU usage, but my patch is
indeed insufficient. Because of SRCU's two-phase algorithm, it's possible
that the first flush_delayed_work doesn't invoke all callbacks. Instead I
would propose this (still untested, but this time with a commit message):

---------------- 8< --------------
From: Paolo Bonzini <pbon...@redhat.com>
Subject: [PATCH] srcu: wait for all callbacks before deeming SRCU "cleaned up"

Even though there are no concurrent readers, it is possible that the
work item is queued for delayed processing when cleanup_srcu_struct is
called. The work item needs to be flushed before returning, or a
use-after-free can ensue.

Furthermore, because of SRCU's two-phase algorithm it may take up to
two executions of srcu_advance_batches before all callbacks are invoked.
This can happen if the first flush_delayed_work happens as follows

    srcu_read_lock
                                process_srcu
                                  srcu_advance_batches
                                    ...
                                    if (!try_check_zero(sp, idx^1, trycount))
                                            // there is a reader
                                            return;
                                  srcu_invoke_callbacks
                                    ...
    srcu_read_unlock
    cleanup_srcu_struct
      flush_delayed_work
                                  srcu_reschedule
                                    queue_delayed_work

Now flush_delayed_work returns but srcu_reschedule will *not* have cleared
sp->running to false.

Not-tested-by: Paolo Bonzini <pbon...@redhat.com>
Signed-off-by: Paolo Bonzini <pbon...@redhat.com>

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 9b9cdd549caa..9470f1ba2ef2 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -283,6 +283,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 {
 	if (WARN_ON(srcu_readers_active(sp)))
 		return; /* Leakage unless caller handles error. */
+
+	/*
+	 * No readers active, so any pending callbacks will rush through the two
+	 * batches before sp->running becomes false. No risk of busy-waiting.
+	 */
+	while (sp->running)
+		flush_delayed_work(&sp->work);
+
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }


Thanks,

Paolo

Dmitry Vyukov

Jan 17, 2017, 6:13:52 AM
to Paolo Bonzini, Paul McKenney, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
On Tue, Jan 17, 2017 at 12:08 PM, Paolo Bonzini <pbon...@redhat.com> wrote:
>
>
> On 17/01/2017 10:56, Dmitry Vyukov wrote:
>>> I am seeing use-after-frees in process_srcu as struct srcu_struct is
>>> already freed. Before freeing struct srcu_struct, code does
>>> cleanup_srcu_struct(&kvm->irq_srcu). We also tried to do:
>>>
>>> + srcu_barrier(&kvm->irq_srcu);
>>> cleanup_srcu_struct(&kvm->irq_srcu);
>>>
>>> It reduced rate of use-after-frees, but did not eliminate them
>>> completely. The full threaded is here:
>>> https://groups.google.com/forum/#!msg/syzkaller/i48YZ8mwePY/0PQ8GkQTBwAJ
>>>
>>> Does Paolo's fix above make sense to you? Namely adding
>>> flush_delayed_work(&sp->work) to cleanup_srcu_struct()?
>>
>> I am not sure about interaction of flush_delayed_work and
>> srcu_reschedule... flush_delayed_work probably assumes that no work is
>> queued concurrently, but what if srcu_reschedule queues another work
>> concurrently... can't it happen that flush_delayed_work will miss that
>> newly scheduled work?
>
> Newly scheduled callbacks would be a bug in SRCU usage, but my patch is

I mean not SRCU callbacks, but the sp->work item being rescheduled.
Consider that callbacks are already scheduled. We call
flush_delayed_work and it waits for process_srcu to complete, but that
process_srcu schedules sp->work again in srcu_reschedule.
Unsynchronized accesses to shared state make me nervous; running is
meant to be protected by sp->queue_lock.
At least we will get back to you with a KTSAN report.

> free_percpu(sp->per_cpu_ref);
> sp->per_cpu_ref = NULL;
> }
>
>
> Thanks,
>
> Paolo
>

Paolo Bonzini

Jan 17, 2017, 7:03:31 AM
to Dmitry Vyukov, Paul McKenney, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
I think it could just be

while (flush_delayed_work(&sp->work));

but let's wait for Paul.

Paolo

Paul E. McKenney

Jan 17, 2017, 3:34:42 PM
to Paolo Bonzini, Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
On Tue, Jan 17, 2017 at 01:03:28PM +0100, Paolo Bonzini wrote:
>
>
> On 17/01/2017 12:13, Dmitry Vyukov wrote:
> > On Tue, Jan 17, 2017 at 12:08 PM, Paolo Bonzini <pbon...@redhat.com> wrote:
> >>
> >>
> >> On 17/01/2017 10:56, Dmitry Vyukov wrote:
> >>>> I am seeing use-after-frees in process_srcu as struct srcu_struct is
> >>>> already freed. Before freeing struct srcu_struct, code does
> >>>> cleanup_srcu_struct(&kvm->irq_srcu). We also tried to do:
> >>>>
> >>>> + srcu_barrier(&kvm->irq_srcu);
> >>>> cleanup_srcu_struct(&kvm->irq_srcu);
> >>>>
> >>>> It reduced rate of use-after-frees, but did not eliminate them
> >>>> completely. The full threaded is here:
> >>>> https://groups.google.com/forum/#!msg/syzkaller/i48YZ8mwePY/0PQ8GkQTBwAJ
> >>>>
> >>>> Does Paolo's fix above make sense to you? Namely adding
> >>>> flush_delayed_work(&sp->work) to cleanup_srcu_struct()?

Yes, we do need a flush_delayed_work(), good catch!

But doing multiple of them should not be necessary because there shouldn't
be any callbacks at all once the srcu_barrier() returns, and the only
time SRCU queues more work is if there is at least one callback pending.
The code is making sure that no new call_srcu() invocations happen before
it does the srcu_barrier(), right?

So if you are seeing failures even with the single flush_delayed_work(),
it would be interesting to set a flag in the srcu_struct at
cleanup_srcu_struct time, and then splat if srcu_reschedule() does its
queue_delayed_work() when that flag is set.
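
Something like this untested sketch is what I have in mind (the flag
name is invented here purely for illustration; the surrounding code is
the existing srcu.c):

	/* in struct srcu_struct: */
	bool	cleanup_started;	/* set once cleanup_srcu_struct() begins */

	/* in cleanup_srcu_struct(): */
	sp->cleanup_started = true;
	flush_delayed_work(&sp->work);

	/* in srcu_reschedule(), where the work item is requeued: */
	if (pending) {
		WARN_ON(READ_ONCE(sp->cleanup_started));  /* splat: requeue after cleanup began */
		queue_delayed_work(system_power_efficient_wq,
				   &sp->work, SRCU_INTERVAL);
	}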

> >>> I am not sure about interaction of flush_delayed_work and
> >>> srcu_reschedule... flush_delayed_work probably assumes that no work is
> >>> queued concurrently, but what if srcu_reschedule queues another work
> >>> concurrently... can't it happen that flush_delayed_work will miss that
> >>> newly scheduled work?
> >>
> >> Newly scheduled callbacks would be a bug in SRCU usage, but my patch is
> >
> > I mean not srcu callbacks, but the sp->work being rescheduled.
> > Consider that callbacks are already scheduled. We call
> > flush_delayed_work, it waits for completion of process_srcu. But that
> > process_srcu schedules sp->work again in srcu_reschedule.

It only does this if there are callbacks still on the srcu_struct, so
if you are seeing this, we either have a bug in SRCU that finds callbacks
when none are present or we have a usage bug that is creating new callbacks
after srcu_barrier() starts.

Do any of your callback functions invoke call_srcu()? (Hey, I have to ask!)
But srcu_reschedule() sets sp->running to false if there are no callbacks.
And at that point, there had better be no callbacks.

> >> Not-tested-by: Paolo Bonzini <pbon...@redhat.com>
> >> Signed-off-by: Paolo Bonzini <pbon...@redhat.com>
> >>
> >> diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
> >> index 9b9cdd549caa..9470f1ba2ef2 100644
> >> --- a/kernel/rcu/srcu.c
> >> +++ b/kernel/rcu/srcu.c
> >> @@ -283,6 +283,14 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
> >> {
> >> if (WARN_ON(srcu_readers_active(sp)))
> >> return; /* Leakage unless caller handles error. */
> >> +
> >> + /*
> >> + * No readers active, so any pending callbacks will rush through the two
> >> + * batches before sp->running becomes false. No risk of busy-waiting.
> >> + */
> >> + while (sp->running)
> >> + flush_delayed_work(&sp->work);
> >
> > Unsynchronized accesses to shared state make me nervous. running is
> > meant to be protected with sp->queue_lock.
>
> I think it could just be
>
> while (flush_delayed_work(&sp->work));
>
> but let's wait for Paul.

If it needs to be more than just a single flush_delayed_work(), we have
some other bug somewhere. ;-)

Thanx, Paul

Paolo Bonzini

Jan 18, 2017, 3:53:22 AM
to pau...@linux.vnet.ibm.com, Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML


On 17/01/2017 21:34, Paul E. McKenney wrote:
> Do any of your callback functions invoke call_srcu()? (Hey, I have to ask!)

No, we only use synchronize_srcu and synchronize_srcu_expedited, so our
only callback comes from there.
There must be no callbacks in batch_queue and in batch_check0, and of
course srcu_invoke_callbacks will have emptied batch_done as well.

However, I'm not sure that batch_check1 is always empty after the first
flush_delayed_work *if there's no srcu_barrier* in the caller.
srcu_advance_batches's second call to try_check_zero could have failed,
and then srcu_reschedule will requeue the work item to advance
batch_check1 into batch_done.

If this is incorrect, then one flush_delayed_work is enough. If it is
correct, the possible alternatives are:

* srcu_barrier in the caller, flush_delayed_work+WARN_ON(sp->running) in
cleanup_srcu_struct. I strongly dislike this one---because we don't use
call_srcu at all, there should be no reason to use srcu_barrier in KVM
code. Plus I think all other users have the same issue.

* srcu_barrier+flush_delayed_work+WARN_ON(sp->running) in
cleanup_srcu_struct

* flush_delayed_work+flush_delayed_work+WARN_ON(sp->running) in
cleanup_srcu_struct

* while(flush_delayed_work) in cleanup_srcu_struct

* "while(sp->running) flush_delayed_work" in cleanup_srcu_struct

Paolo

Paul E. McKenney

Jan 18, 2017, 5:29:25 PM
to Paolo Bonzini, Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
On Wed, Jan 18, 2017 at 09:53:19AM +0100, Paolo Bonzini wrote:
>
>
> On 17/01/2017 21:34, Paul E. McKenney wrote:
> > Do any of your callback functions invoke call_srcu()? (Hey, I have to ask!)
>
> No, we only use synchronize_srcu and synchronize_srcu_expedited, so our
> only callback comes from there.

OK, so the next question is whether your code makes sure that all of its
synchronize_srcu() and synchronize_srcu_expedited() calls return before
the call to cleanup_srcu_struct().
You should only need srcu_barrier() if there were calls to call_srcu().
Given that you only have synchronize_srcu() and synchronize_srcu_expedited(),
you -don't- need srcu_barrier(). What you need instead is to make sure
that all synchronize_srcu() and synchronize_srcu_expedited() have
returned before the call to cleanup_srcu_struct().

> If this is incorrect, then one flush_delayed_work is enough. If it is
> correct, the possible alternatives are:
>
> * srcu_barrier in the caller, flush_delayed_work+WARN_ON(sp->running) in
> cleanup_srcu_struct. I strongly dislike this one---because we don't use
> call_srcu at all, there should be no reason to use srcu_barrier in KVM
> code. Plus I think all other users have the same issue.
>
> * srcu_barrier+flush_delayed_work+WARN_ON(sp->running) in
> cleanup_srcu_struct
>
> * flush_delayed_work+flush_delayed_work+WARN_ON(sp->running) in
> cleanup_srcu_struct
>
> * while(flush_delayed_work) in cleanup_srcu_struct
>
> * "while(sp->running) flush_delayed_work" in cleanup_srcu_struct

My current thought is flush_delayed_work() followed by a warning if
there are any callbacks still posted, and also as you say sp->running.

Thoughts?

Thanx, Paul

Paolo Bonzini

Jan 19, 2017, 4:27:12 AM
to pau...@linux.vnet.ibm.com, Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML


On 18/01/2017 23:15, Paul E. McKenney wrote:
> On Wed, Jan 18, 2017 at 09:53:19AM +0100, Paolo Bonzini wrote:
>>
>>
>> On 17/01/2017 21:34, Paul E. McKenney wrote:
>>> Do any of your callback functions invoke call_srcu()? (Hey, I have to ask!)
>>
>> No, we only use synchronize_srcu and synchronize_srcu_expedited, so our
>> only callback comes from there.
>
> OK, so the next question is whether your code makes sure that all of its
> synchronize_srcu() and synchronize_srcu_expedited() calls return before
> the call to cleanup_srcu_struct().

It certainly should! Or at least that would be our bug.

> You should only need srcu_barrier() if there were calls to call_srcu().
> Given that you only have synchronize_srcu() and synchronize_srcu_expedited(),
> you -don't- need srcu_barrier(). What you need instead is to make sure
> that all synchronize_srcu() and synchronize_srcu_expedited() have
> returned before the call to cleanup_srcu_struct().

Ok, good.

>> If this is incorrect, then one flush_delayed_work is enough. If it is
>> correct, the possible alternatives are:
>>
>> * srcu_barrier in the caller, flush_delayed_work+WARN_ON(sp->running) in
>> cleanup_srcu_struct. I strongly dislike this one---because we don't use
>> call_srcu at all, there should be no reason to use srcu_barrier in KVM
>> code. Plus I think all other users have the same issue.
>>
>> * srcu_barrier+flush_delayed_work+WARN_ON(sp->running) in
>> cleanup_srcu_struct
>>
>> * flush_delayed_work+flush_delayed_work+WARN_ON(sp->running) in
>> cleanup_srcu_struct
>>
>> * while(flush_delayed_work) in cleanup_srcu_struct
>>
>> * "while(sp->running) flush_delayed_work" in cleanup_srcu_struct
>
> My current thought is flush_delayed_work() followed by a warning if
> there are any callbacks still posted, and also as you say sp->running.

Yes, that would work for KVM and anyone else who doesn't use call_srcu
(and order synchronize_srcu correctly against destruction).

On the other hand, users of call_srcu, such as rcutorture, _do_ need to
place an srcu_barrier before cleanup_srcu_struct, or they need two
flush_delayed_work() calls back to back in cleanup_srcu_struct. Do you
agree?

Thanks,

Paolo

Paul McKenney

Jan 19, 2017, 4:52:26 PM
to Paolo Bonzini, Paul McKenney, Dmitry Vyukov, Steve Rutherford, syzkaller, Radim Krčmář, KVM list, LKML
(Trouble with VPN, so replying from gmail.)
The reason I am resisting the notion of placing an srcu_barrier() in
the cleanup_srcu_struct path is that most users don't use call_srcu(),
and I don't feel that we should be inflicting the srcu_barrier()
performance penalty on them.

So I agree with the user invoking srcu_barrier() after preventing future
calls to call_srcu(), and with there being a flush_delayed_work() in
cleanup_srcu_struct(). As in the untested (and probably
whitespace-mangled) patch below.

Thoughts?

Thanx, Paul

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index f2abfbae258c..5813ed848821 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -65,6 +65,17 @@ static inline bool rcu_batch_empty(struct rcu_batch *b)
 }
 
 /*
+ * Are all batches empty for the specified srcu_struct?
+ */
+static inline bool rcu_all_batches_empty(struct srcu_struct *sp)
+{
+	return rcu_batch_empty(&sp->batch_done) &&
+	       rcu_batch_empty(&sp->batch_check1) &&
+	       rcu_batch_empty(&sp->batch_check0) &&
+	       rcu_batch_empty(&sp->batch_queue);
+}
+
+/*
  * Remove the callback at the head of the specified rcu_batch structure
  * and return a pointer to it, or return NULL if the structure is empty.
  */
@@ -251,6 +262,11 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
 {
 	if (WARN_ON(srcu_readers_active(sp)))
 		return; /* Leakage unless caller handles error. */
+	if (WARN_ON(!rcu_all_batches_empty(sp)))
+		return; /* Leakage unless caller handles error. */
+	flush_delayed_work(&sp->work);
+	if (WARN_ON(sp->running))
+		return; /* Caller forgot to stop doing call_srcu()? */
 	free_percpu(sp->per_cpu_ref);
 	sp->per_cpu_ref = NULL;
 }
@@ -610,15 +626,9 @@ static void srcu_reschedule(struct srcu_struct *sp)
 {
 	bool pending = true;
 
-	if (rcu_batch_empty(&sp->batch_done) &&
-	    rcu_batch_empty(&sp->batch_check1) &&
-	    rcu_batch_empty(&sp->batch_check0) &&
-	    rcu_batch_empty(&sp->batch_queue)) {
+	if (rcu_all_batches_empty(sp)) {
 		spin_lock_irq(&sp->queue_lock);
-		if (rcu_batch_empty(&sp->batch_done) &&
-		    rcu_batch_empty(&sp->batch_check1) &&
-		    rcu_batch_empty(&sp->batch_check0) &&
-		    rcu_batch_empty(&sp->batch_queue)) {
+		if (rcu_all_batches_empty(sp)) {
 			sp->running = false;
 			pending = false;
 		}