mm: BUG in expand_downwards

50 views
Skip to first unread message

Dmitry Vyukov

unread,
Jan 27, 2016, 5:51:24 AM1/27/16
to Andrew Morton, Kirill A. Shutemov, Oleg Nesterov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Konstantin Khlebnikov, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Hello,

The following program trigger BUG in expand_downwards (run it in tight
parallel loop for a while):

// autogenerated by syzkaller (http://github.com/google/syzkaller)
#include <pthread.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

long r[10];

void* thr(void* arg)
{
switch ((long)arg) {
case 0:
r[0] = syscall(SYS_mmap, 0x20731000ul, 0x2000ul, 0x2ul, 0x22132ul,
0xfffffffffffffffful, 0x0ul);
break;
case 1:
r[1] = syscall(SYS_mremap, 0x20732000ul, 0x1000ul, 0x3000ul, 0x3ul,
0x2015b000ul, 0);
break;
case 2:
r[2] = syscall(SYS_fanotify_init, 0x20ul, 0x40002ul, 0, 0, 0, 0);
break;
case 3:
r[3] = syscall(SYS_mremap, 0x2015c000ul, 0x1000ul, 0x4000ul, 0x3ul,
0x204eb000ul, 0);
break;
case 4:
r[4] = syscall(SYS_munlock, 0x204eb000ul, 0x4000ul, 0, 0, 0, 0);
break;
case 5:
r[5] = syscall(SYS_mremap, 0x204ec000ul, 0x1000ul, 0x1000ul, 0x3ul,
0x2015d000ul, 0);
break;
case 6:
r[6] = syscall(SYS_mmap, 0x20000000ul, 0x1000ul, 0x3ul, 0x32ul,
0xfffffffffffffffful, 0x0ul);
break;
case 7:
r[7] = syscall(SYS_sigaltstack, 0x204ec000ul, 0x20000000ul, 0, 0, 0,
0);
break;
case 8:
memcpy((void*)0x2015bffa, "\x2e\x2f\x62\x75\x73\x00", 6);
r[9] = syscall(SYS_mkdir, 0x2015bffaul, 0x108ul, 0, 0, 0, 0);
break;
}
return 0;
}

int main()
{
long i;
pthread_t th[9];

srand(getpid());
memset(r, -1, sizeof(r));
for (i = 0; i < 9; i++) {
pthread_create(&th[i], 0, thr, (void*)i);
usleep(1000);
}
for (i = 0; i < 9; i++) {
pthread_create(&th[i], 0, thr, (void*)i);
usleep(rand()%1000);
}
usleep(10000);
return 0;
}



=====================================
[ BUG: bad unlock balance detected! ]
4.5.0-rc1+ #292 Not tainted
-------------------------------------
a.out/7506 is trying to release lock (&anon_vma->rwsem) at:
[< inline >] vma_unlock_anon_vma include/linux/rmap.h:123
[<ffffffff816f8e5d>] validate_mm+0x12d/0x850 mm/mmap.c:461
but there are no more locks to release!

other info that might help us debug this:
1 lock held by a.out/7506:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff8127dfd7>]
__do_page_fault+0x737/0x960 arch/x86/mm/fault.c:1184

stack backtrace:
CPU: 1 PID: 7506 Comm: a.out Not tainted 4.5.0-rc1+ #292
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
00000000ffffffff ffff88005d107b50 ffffffff829e79ad ffff8800603b4ce8
ffff88005f15df00 ffffffff816f8e5d ffff88005d107b80 ffffffff81450884
ffff8800603b4ce8 ffff88005f15e720 ffff88005f15e720 00000000ffffffff
Call Trace:
[< inline >] __dump_stack lib/dump_stack.c:15
[<ffffffff829e79ad>] dump_stack+0x6f/0xa2 lib/dump_stack.c:50
[<ffffffff81450884>] print_unlock_imbalance_bug+0x174/0x1a0
kernel/locking/lockdep.c:3261
[< inline >] __lock_release kernel/locking/lockdep.c:3385
[<ffffffff8145c687>] lock_release+0x937/0xce0 kernel/locking/lockdep.c:3606
[<ffffffff814490ba>] up_write+0x1a/0x60 kernel/locking/rwsem.c:91
[< inline >] vma_unlock_anon_vma include/linux/rmap.h:123
[<ffffffff816f8e5d>] validate_mm+0x12d/0x850 mm/mmap.c:461
[<ffffffff817034be>] expand_downwards+0x19e/0xb30 mm/mmap.c:2283
[<ffffffff81703ecb>] expand_stack+0x7b/0xe0 mm/mmap.c:2338
[<ffffffff8127df88>] __do_page_fault+0x6e8/0x960 arch/x86/mm/fault.c:1217
[<ffffffff8127e358>] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331
[<ffffffff8126f514>] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264
[<ffffffff8645bdf8>] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986
[< inline >] SYSC_sigaltstack kernel/signal.c:3157
[<ffffffff8138600c>] SyS_sigaltstack+0x6c/0x90 kernel/signal.c:3155
[<ffffffff86459ab6>] entry_SYSCALL_64_fastpath+0x16/0x7a
arch/x86/entry/entry_64.S:185
mmap: free gap 1000, correct 0
mm ffff8800603a4e00 mmap ffff88005fc794c0 seqnum 12 task_size 140737488351232
get_unmapped_area ffffffff811af470
mmap_base 140584500215808 mmap_legacy_base 47048170172416
highest_vm_end 140737437655040
pgd ffff88005fc20000 mm_users 2 mm_count 1 nr_ptes 25 nr_pmds 3 map_count 49
hiwater_rss 7 hiwater_vm 813a total_vm 913e locked_vm 2
pinned_vm 0 data_vm 902e exec_vm d1 stack_vm 2a
start_code 400000 end_code 4ce986 start_data 6cee90 end_data 6d0d88
start_brk 934000 brk 957000 start_stack 7ffffcf5e660
arg_start 7ffffcf5feae arg_end 7ffffcf5feb6 env_start 7ffffcf5feb6
env_end 7ffffcf5fff0
binfmt ffffffff8797c260 flags cd core_state (null)
ioctx_table (null)
owner ffff88005d99df00 exe_file ffff880033321680
mmu_notifier_mm (null)
flags: 0x0()
------------[ cut here ]------------
kernel BUG at mm/mmap.c:481!
invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN
Modules linked in:
CPU: 1 PID: 9534 Comm: a.out Not tainted 4.5.0-rc1+ #292
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff880031522f80 ti: ffff8800314f8000 task.ti: ffff8800314f8000
RIP: 0010:[<ffffffff816f922c>] [<ffffffff816f922c>] validate_mm+0x4fc/0x850
RSP: 0018:ffff8800314ffc60 EFLAGS: 00010286
RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffffed000629ff22
RBP: ffff8800314ffcc8 R08: 0000000000000001 R09: 0000000000000000
R10: 0000000000000003 R11: 1ffffffff12e43c4 R12: 0000000000000031
R13: 0000000000000000 R14: 00007ffffcfa4000 R15: dffffc0000000000
FS: 00007fdc59313700(0000) GS:ffff88003ed00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000000204ec000 CR3: 000000005fc20000 CR4: 00000000000006e0
Stack:
0000000000486f70 ffff8800603a4e60 ffff8800603a4e00 0000003100000001
0000000000000000 0000000000041000 ffff88005fc78538 ffff88005fc78530
ffff880032870a60 0000000000000000 ffff880031e61e00 000000001f6a9000
Call Trace:
[<ffffffff817034be>] expand_downwards+0x19e/0xb30 mm/mmap.c:2283
[<ffffffff81703ecb>] expand_stack+0x7b/0xe0 mm/mmap.c:2338
[<ffffffff8127df88>] __do_page_fault+0x6e8/0x960 arch/x86/mm/fault.c:1217
[<ffffffff8127e358>] trace_do_page_fault+0xe8/0x420 arch/x86/mm/fault.c:1331
[<ffffffff8126f514>] do_async_page_fault+0x14/0xd0 arch/x86/kernel/kvm.c:264
[<ffffffff8645bdf8>] async_page_fault+0x28/0x30 arch/x86/entry/entry_64.S:986
[< inline >] SYSC_sigaltstack kernel/signal.c:3157
[<ffffffff8138600c>] SyS_sigaltstack+0x6c/0x90 kernel/signal.c:3155
[<ffffffff86459ab6>] entry_SYSCALL_64_fastpath+0x16/0x7a
arch/x86/entry/entry_64.S:185
Code: 58 60 44 39 e3 74 73 e8 03 0f e7 ff 44 89 e2 89 de 48 c7 c7 a0
dc 5a 86 e8 e7 c3 f4 ff e8 ed 0e e7 ff 48 8b 7d a8 e8 24 00 fe ff <0f>
0b e8 dd 0e e7 ff 8b 55 b4 44 89 e6 48 c7 c7 40 dc 5a 86 e8
RIP [<ffffffff816f922c>] validate_mm+0x4fc/0x850 mm/mmap.c:481
RSP <ffff8800314ffc60>
---[ end trace 5282aed32b3391b4 ]---


And there seems to be some locking issue in validate_mm.

On commit 92e963f50fc74041b5e9e744c330dca48e04f08d (Jan 24).

Konstantin Khlebnikov

unread,
Jan 27, 2016, 6:49:55 AM1/27/16
to Dmitry Vyukov, Andrew Morton, Kirill A. Shutemov, Oleg Nesterov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
It seems anon_vma appeared between lock and unlock.

This should fix the bug and make code faster (write lock isn't required here)

--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -453,12 +453,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;

while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;

- vma_lock_anon_vma(vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- vma_unlock_anon_vma(vma);
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;

Dmitry Vyukov

unread,
Jan 27, 2016, 7:24:58 AM1/27/16
to Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Oleg Nesterov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Now testing with this patch. Thanks for quick fix!

Dmitry Vyukov

unread,
Jan 27, 2016, 9:43:03 AM1/27/16
to Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Oleg Nesterov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Hit the same BUG with this patch.
Please try to reproduce it locally and test.

Oleg Nesterov

unread,
Jan 27, 2016, 2:41:38 PM1/27/16
to Dmitry Vyukov, Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Do you mean the same "bad unlock balance detected" BUG? this should be "obviously"
fixed by the patch above...

Or you mean the 2nd VM_BUG_ON_MM() ?

> Please try to reproduce it locally and test.

tried to reproduce, doesn't work.

Oleg.

Dmitry Vyukov

unread,
Jan 27, 2016, 4:12:05 PM1/27/16
to Oleg Nesterov, Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Chen Gang, Michal Hocko, Piotr Kwapulinski, Andrea Arcangeli, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Sorry, I meant only the second once. The mm bug.
I guess you need at least CONFIG_DEBUG_VM. Run it in a tight parallel
loop with CPU oversubscription (e.g. 32 parallel processes on 2 cores)
for at least an hour.

Andrea Arcangeli

unread,
Jan 28, 2016, 10:47:50 AM1/28/16
to Dmitry Vyukov, Oleg Nesterov, Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Chen Gang, Michal Hocko, Piotr Kwapulinski, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
Hello,

On Wed, Jan 27, 2016 at 10:11:44PM +0100, Dmitry Vyukov wrote:
> Sorry, I meant only the second once. The mm bug.
> I guess you need at least CONFIG_DEBUG_VM. Run it in a tight parallel
> loop with CPU oversubscription (e.g. 32 parallel processes on 2 cores)
> for at least an hour.

Does this help for the mm bug?

From 0cc410ae59800444ca929e3dc48e4f1580a95be6 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarc...@redhat.com>
Date: Thu, 28 Jan 2016 16:34:44 +0100
Subject: [PATCH 1/1] mm: validate_mm browse_rb SMP race condition

The mmap_sem for reading in validate_mm called from expand_stack is
not enough to prevent the argumented rbtree rb_subtree_gap information
to change from under us because expand_stack may be running from other
threads concurrently which will hold the mmap_sem for reading too.

The argumented rbtree is updated with vma_gap_update under the
page_table_lock so use it in browse_rb() too to avoid false positives.

Reported-by: Dmitry Vyukov <dvy...@google.com>
Signed-off-by: Andrea Arcangeli <aarc...@redhat.com>
---
mm/mmap.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index f384def..8389e03 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -389,8 +389,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}

#ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
@@ -413,12 +414,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
+ spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
+ spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
@@ -474,7 +477,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(&mm->mm_rb);
+ i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);

Dmitry Vyukov

unread,
Jan 29, 2016, 5:39:44 AM1/29/16
to Andrea Arcangeli, Oleg Nesterov, Konstantin Khlebnikov, Andrew Morton, Kirill A. Shutemov, Chen Gang, Michal Hocko, Piotr Kwapulinski, linu...@kvack.org, LKML, Hugh Dickins, Sasha Levin, syzkaller, Kostya Serebryany, Alexander Potapenko
On Thu, Jan 28, 2016 at 4:47 PM, Andrea Arcangeli <aarc...@redhat.com> wrote:
> Hello,
>
> On Wed, Jan 27, 2016 at 10:11:44PM +0100, Dmitry Vyukov wrote:
>> Sorry, I meant only the second once. The mm bug.
>> I guess you need at least CONFIG_DEBUG_VM. Run it in a tight parallel
>> loop with CPU oversubscription (e.g. 32 parallel processes on 2 cores)
>> for at least an hour.
>
> Does this help for the mm bug?

Yes, it seems to fix the issue.
I will also run fuzzer with this patch and report if I see it again.
Reply all
Reply to author
Forward
0 new messages