[syzbot] kernel BUG in __page_mapcount


syzbot

May 30, 2021, 8:53:25 PM5/30/21
to ak...@linux-foundation.org, chinwe...@mediatek.com, ja...@google.com, linux-...@vger.kernel.org, linux-...@vger.kernel.org, syzkall...@googlegroups.com, vba...@suse.cz, wal...@google.com
Hello,

syzbot found the following issue on:

HEAD commit: 7ac3a1c1 Merge tag 'mtd/fixes-for-5.13-rc4' of git://git.k..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=14559d5bd00000
kernel config: https://syzkaller.appspot.com/x/.config?x=f9f3fc7daa178986
dashboard link: https://syzkaller.appspot.com/bug?extid=1f52b3a18d5633fa7f82
compiler: Debian clang version 11.0.1-2
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=11132d5bd00000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+1f52b3...@syzkaller.appspotmail.com

__mmput+0x111/0x370 kernel/fork.c:1096
exit_mm+0x67e/0x7d0 kernel/exit.c:502
do_exit+0x6b9/0x23d0 kernel/exit.c:813
do_group_exit+0x168/0x2d0 kernel/exit.c:923
get_signal+0x1770/0x2180 kernel/signal.c:2835
arch_do_signal_or_restart+0x8e/0x6c0 arch/x86/kernel/signal.c:789
handle_signal_work kernel/entry/common.c:147 [inline]
exit_to_user_mode_loop kernel/entry/common.c:171 [inline]
exit_to_user_mode_prepare+0xac/0x200 kernel/entry/common.c:208
__syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline]
syscall_exit_to_user_mode+0x26/0x70 kernel/entry/common.c:301
------------[ cut here ]------------
kernel BUG at include/linux/page-flags.h:686!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 10694 Comm: syz-executor.0 Not tainted 5.13.0-rc3-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:PageDoubleMap include/linux/page-flags.h:686 [inline]
RIP: 0010:__page_mapcount+0x2b3/0x2d0 mm/util.c:728
Code: e8 72 25 cf ff 4c 89 ff 48 c7 c6 40 fb 39 8a e8 03 4c 04 00 0f 0b e8 5c 25 cf ff 4c 89 ff 48 c7 c6 40 fc 39 8a e8 ed 4b 04 00 <0f> 0b e8 46 25 cf ff 4c 89 ff 48 c7 c6 80 fc 39 8a e8 d7 4b 04 00
RSP: 0018:ffffc90001ff7460 EFLAGS: 00010246
RAX: e8070b6faabf8b00 RBX: 00fff0000008001d RCX: ffff888047280000
RDX: 0000000000000000 RSI: 000000000000ffff RDI: 000000000000ffff
RBP: 0000000000000000 R08: ffffffff81ce2584 R09: ffffed1017363f24
R10: ffffed1017363f24 R11: 0000000000000000 R12: 1ffffd4000265001
R13: 00000000ffffffff R14: dffffc0000000000 R15: ffffea0001328000
FS: 00007f6e83636700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000568000 CR3: 000000002b559000 CR4: 00000000001506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
page_mapcount include/linux/mm.h:873 [inline]
smaps_account+0x79d/0x980 fs/proc/task_mmu.c:467
smaps_pte_entry fs/proc/task_mmu.c:533 [inline]
smaps_pte_range+0x6ed/0xfc0 fs/proc/task_mmu.c:596
walk_pmd_range mm/pagewalk.c:89 [inline]
walk_pud_range mm/pagewalk.c:160 [inline]
walk_p4d_range mm/pagewalk.c:193 [inline]
walk_pgd_range mm/pagewalk.c:229 [inline]
__walk_page_range+0xd64/0x1ad0 mm/pagewalk.c:331
walk_page_vma+0x3c2/0x500 mm/pagewalk.c:482
smap_gather_stats fs/proc/task_mmu.c:769 [inline]
show_smaps_rollup+0x49d/0xc20 fs/proc/task_mmu.c:872
seq_read_iter+0x43a/0xcf0 fs/seq_file.c:227
seq_read+0x445/0x5c0 fs/seq_file.c:159
do_loop_readv_writev fs/read_write.c:761 [inline]
do_iter_read+0x464/0x660 fs/read_write.c:803
vfs_readv fs/read_write.c:921 [inline]
do_preadv+0x1f7/0x340 fs/read_write.c:1013
do_syscall_64+0x3f/0xb0 arch/x86/entry/common.c:47
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x4665d9
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f6e83636188 EFLAGS: 00000246 ORIG_RAX: 0000000000000127
RAX: ffffffffffffffda RBX: 000000000056bf80 RCX: 00000000004665d9
RDX: 0000000000000001 RSI: 0000000020000780 RDI: 0000000000000004
RBP: 00000000004bfcb9 R08: 0000000000000003 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 000000000056bf80
R13: 00007ffeb607b5df R14: 00007f6e83636300 R15: 0000000000022000
Modules linked in:
---[ end trace e65a33e7d2bffb07 ]---
RIP: 0010:PageDoubleMap include/linux/page-flags.h:686 [inline]
RIP: 0010:__page_mapcount+0x2b3/0x2d0 mm/util.c:728
Code: e8 72 25 cf ff 4c 89 ff 48 c7 c6 40 fb 39 8a e8 03 4c 04 00 0f 0b e8 5c 25 cf ff 4c 89 ff 48 c7 c6 40 fc 39 8a e8 ed 4b 04 00 <0f> 0b e8 46 25 cf ff 4c 89 ff 48 c7 c6 80 fc 39 8a e8 d7 4b 04 00
RSP: 0018:ffffc90001ff7460 EFLAGS: 00010246
RAX: e8070b6faabf8b00 RBX: 00fff0000008001d RCX: ffff888047280000
RDX: 0000000000000000 RSI: 000000000000ffff RDI: 000000000000ffff
RBP: 0000000000000000 R08: ffffffff81ce2584 R09: ffffed1017363f24
R10: ffffed1017363f24 R11: 0000000000000000 R12: 1ffffd4000265001
R13: 00000000ffffffff R14: dffffc0000000000 R15: ffffea0001328000
FS: 00007f6e83636700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000568000 CR3: 000000002b559000 CR4: 00000000001506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzk...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this issue, for details see:
https://goo.gl/tpsmEJ#testing-patches

Jann Horn

Jun 7, 2021, 11:13:40 AM6/7/21
to syzbot, syzkaller-bugs
On Mon, May 31, 2021 at 2:53 AM syzbot
<syzbot+1f52b3...@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit: 7ac3a1c1 Merge tag 'mtd/fixes-for-5.13-rc4' of git://git.k..
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=14559d5bd00000
> kernel config: https://syzkaller.appspot.com/x/.config?x=f9f3fc7daa178986
> dashboard link: https://syzkaller.appspot.com/bug?extid=1f52b3a18d5633fa7f82
> compiler: Debian clang version 11.0.1-2
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=11132d5bd00000
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+1f52b3...@syzkaller.appspotmail.com

#syz test: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 7ac3a1c1
0001-HACK-avoid-page_mapcount-on-migration-entry.patch

syzbot

Jun 7, 2021, 1:24:07 PM6/7/21
to ja...@google.com, syzkall...@googlegroups.com
Hello,

syzbot has tested the proposed patch and the reproducer did not trigger any issue:

Reported-and-tested-by: syzbot+1f52b3...@syzkaller.appspotmail.com

Tested on:

commit: 7ac3a1c1 Merge tag 'mtd/fixes-for-5.13-rc4' of git://git.k..
git tree: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel config: https://syzkaller.appspot.com/x/.config?x=610e67ed2ff93125
dashboard link: https://syzkaller.appspot.com/bug?extid=1f52b3a18d5633fa7f82
compiler: Debian clang version 11.0.1-2
patch: https://syzkaller.appspot.com/x/patch.diff?x=126a243fd00000

Note: testing is done by a robot and is best-effort only.

Jann Horn

Jun 7, 2021, 1:27:52 PM6/7/21
to Matthew Wilcox, Linux-MM, Zi Yan, Peter Xu, Kirill A. Shutemov, Konstantin Khlebnikov, Andrew Morton, chinwe...@mediatek.com, kernel list, syzkaller-bugs, Vlastimil Babka, Michel Lespinasse, syzbot
+some folks who I think might work on THP-related stuff
-fsdevel because this doesn't have much to do with filesystem stuff

=== Short summary ===
I believe the issue here is a race between /proc/*/smaps and
split_huge_page_to_list():

The codepath for /proc/*/smaps walks the pagetables and (e.g. in
smaps_account()) calls page_mapcount() not just on pages from normal
PTEs but also on migration entries (since commit b1d4d9e0cbd0a
"proc/smaps: carefully handle migration entries", from Linux v3.5).
page_mapcount() expects compound pages to be stable.

The split_huge_page_to_list() path first protects the compound page by
locking it and replacing all its PTEs with migration entries (since
the THP rewrite in v4.5, I think?), then does the actual splitting
using __split_huge_page().

So there's a mismatch of expectations here:
The smaps code expects that migration entries point to stable compound
pages, while the THP code expects that it's okay to split a compound
page while it has migration entries.


I'm not sure what the best way to fix it is; I guess the smaps code is
at fault here, and the following options might be approaches to fixing
it?

1. skip migration entries completely in smaps?
2. let smaps assume that the mapcount is 1 for all migration entries?
3. try to be fully accurate by waiting for the migration entry to
change back to normal?
probably a bad idea, way too messy...

Note that the mapcount of a page that is being migrated doesn't really
represent how many MMs are using the page anyway, so we wouldn't
really be making the current situation much worse by fudging the
mapcount to 1 in the smaps code...


By the way, I think the /proc/*/pagemap code (pte_to_pagemap_entry())
has the same issue?


I have asked syzkaller to test what happens if smaps_account() is
hacked up to avoid page_mapcount() for migration entries, and that
seems to fix the crash:
https://groups.google.com/g/syzkaller-bugs/c/9AZZCz4OtvE/m/WZDYXEKKAgAJ

Original syzkaller report with some more details below:

On Mon, May 31, 2021 at 2:53 AM syzbot
<syzbot+1f52b3...@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit: 7ac3a1c1 Merge tag 'mtd/fixes-for-5.13-rc4' of git://git.k..
> git tree: upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=14559d5bd00000
> kernel config: https://syzkaller.appspot.com/x/.config?x=f9f3fc7daa178986
> dashboard link: https://syzkaller.appspot.com/bug?extid=1f52b3a18d5633fa7f82
> compiler: Debian clang version 11.0.1-2
> syz repro: https://syzkaller.appspot.com/x/repro.syz?x=11132d5bd00000

The "syz repro" link shows that this appears to be some kind of race
involving mmap(), madvise(..., MADV_FREE), and
/proc/$pid/smaps_rollup.
Note the '"threaded":true,"collide":true,"repeat":true' bit, which
indicates that this is some kind of race that syzbot is repeatedly
trying to hit by executing operations on multiple threads
simultaneously.
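
To make the shape of the race concrete, here is a rough, hypothetical
userspace sketch (simplified and illustrative only; it is not the actual
syz reproducer, and the sizes, iteration counts and reliability are made
up): one thread re-reads /proc/self/smaps_rollup in a loop while another
repeatedly faults in a THP-backed region and MADV_FREEs half of it, which
can trigger a THP split.

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define THP_SIZE (2UL << 20)	/* assumes 2M PMD-sized THPs */

/* Thread that keeps walking our own smaps_rollup. */
static void *smaps_reader(void *arg)
{
	char buf[4096];

	for (;;) {
		int fd = open("/proc/self/smaps_rollup", O_RDONLY);

		if (fd < 0)
			continue;
		while (read(fd, buf, sizeof(buf)) > 0)
			;
		close(fd);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	int i;

	pthread_create(&t, NULL, smaps_reader, NULL);

	for (i = 0; i < 1000000; i++) {
		size_t len = 2 * THP_SIZE;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		char *thp;

		if (p == MAP_FAILED)
			break;
		/* anonymous THPs are only used on PMD-aligned addresses */
		thp = (char *)(((unsigned long)p + THP_SIZE - 1) & ~(THP_SIZE - 1));
		madvise(thp, THP_SIZE, MADV_HUGEPAGE);
		memset(thp, 1, THP_SIZE);		/* fault in, hopefully as a THP */
		madvise(thp, THP_SIZE / 2, MADV_FREE);	/* partial MADV_FREE may split the THP */
		munmap(p, len);
	}
	return 0;
}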

The MADV_FREE path involves split_huge_page(page); and in the linked
console output we can see the page state:

[ 294.055186][T10694] page:ffffea0001328000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x20000 pfn:0x4ca00
[ 294.069056][T10694] memcg:ffff88814011c000
[ 294.073365][T10694] anon flags: 0xfff0000008001d(locked|uptodate|dirty|lru|swapbacked|node=0|zone=1|lastcpupid=0x7ff)
[ 294.084590][T10694] raw: 00fff0000008001d ffffea00013e0508 ffffea0001328048 ffff888045983e01
[ 294.093279][T10694] raw: 0000000000020000 0000000000000000 00000001ffffffff ffff88814011c000
[ 294.102469][T10694] page dumped because: VM_BUG_ON_PAGE(!PageHead(page))
[ 294.109422][T10694] page_owner tracks the page as allocated
[ 294.115444][T10694] page last allocated via order 0, migratetype Movable, gfp_mask 0x3d20ca(GFP_TRANSHUGE_LIGHT|__GFP_NORETRY|__GFP_THISNODE), pid 10690, ts 293625395715, free_ts 293374863045
[ 294.133087][T10694] get_page_from_freelist+0x779/0xa20
[ 294.138623][T10694] __alloc_pages+0x26c/0x5f0
[ 294.143337][T10694] alloc_pages_vma+0x9c7/0xe70
[ 294.148655][T10694] do_huge_pmd_anonymous_page+0x5b9/0xce0
[ 294.155039][T10694] handle_mm_fault+0x207e/0x2620
[ 294.160759][T10694] do_user_addr_fault+0x8ce/0x1120
[ 294.165961][T10694] exc_page_fault+0xa1/0x1e0
[ 294.171144][T10694] asm_exc_page_fault+0x1e/0x30

The message "page last allocated via order 0" is kinda misleading; as
you can see from the stacktrace and the GFP_TRANSHUGE_LIGHT flag, this
page was actually allocated as part of a hugepage, but was then later
split: __split_page_owner() fixes up the order in the page_owner
metadata, and is called from __split_huge_page() via
split_page_owner().

> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+1f52b3...@syzkaller.appspotmail.com
[...]
> ------------[ cut here ]------------
> kernel BUG at include/linux/page-flags.h:686!
> invalid opcode: 0000 [#1] PREEMPT SMP KASAN
> CPU: 1 PID: 10694 Comm: syz-executor.0 Not tainted 5.13.0-rc3-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> RIP: 0010:PageDoubleMap include/linux/page-flags.h:686 [inline]
> RIP: 0010:__page_mapcount+0x2b3/0x2d0 mm/util.c:728
> Code: e8 72 25 cf ff 4c 89 ff 48 c7 c6 40 fb 39 8a e8 03 4c 04 00 0f 0b e8 5c 25 cf ff 4c 89 ff 48 c7 c6 40 fc 39 8a e8 ed 4b 04 00 <0f> 0b e8 46 25 cf ff 4c 89 ff 48 c7 c6 80 fc 39 8a e8 d7 4b 04 00
> RSP: 0018:ffffc90001ff7460 EFLAGS: 00010246
> RAX: e8070b6faabf8b00 RBX: 00fff0000008001d RCX: ffff888047280000
> RDX: 0000000000000000 RSI: 000000000000ffff RDI: 000000000000ffff
> RBP: 0000000000000000 R08: ffffffff81ce2584 R09: ffffed1017363f24
> R10: ffffed1017363f24 R11: 0000000000000000 R12: 1ffffd4000265001
> R13: 00000000ffffffff R14: dffffc0000000000 R15: ffffea0001328000
> FS: 00007f6e83636700(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000568000 CR3: 000000002b559000 CR4: 00000000001506e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
> page_mapcount include/linux/mm.h:873 [inline]

This is page_mapcount():

870 static inline int page_mapcount(struct page *page)
871 {
872 	if (unlikely(PageCompound(page)))
873 		return __page_mapcount(page);    <===========================
874 	return atomic_read(&page->_mapcount) + 1;
875 }

So the page that page_mapcount() was called on is/was marked as
PageCompound() at that time, but then, in __page_mapcount we crash:

714 /* Slow path of page_mapcount() for compound pages */
715 int __page_mapcount(struct page *page)
716 {
717 	int ret;
718
719 	ret = atomic_read(&page->_mapcount) + 1;
720 	/*
721 	 * For file THP page->_mapcount contains total number of mapping
722 	 * of the page: no need to look into compound_mapcount.
723 	 */
724 	if (!PageAnon(page) && !PageHuge(page))
725 		return ret;
726 	page = compound_head(page);
727 	ret += atomic_read(compound_mapcount_ptr(page)) + 1;
728 	if (PageDoubleMap(page))    <===========================
729 		ret--;
730 	return ret;
731 }

which means the supposed compound head page is not a compound page (anymore).

> smaps_account+0x79d/0x980 fs/proc/task_mmu.c:467
> smaps_pte_entry fs/proc/task_mmu.c:533 [inline]
> smaps_pte_range+0x6ed/0xfc0 fs/proc/task_mmu.c:596

In here we're holding the PTE lock, so either the page of interest has
a non-zero (non-compound) mapcount, *OR* there is a migration PTE for
the page.

split_huge_page_to_list() takes a locked page and unmaps it from all
pagetables with unmap_page(), but because TTU_SPLIT_FREEZE is set,
try_to_unmap_one() will not simply clear the PTE but instead replace
it with a migration entry, so the /proc/*/smaps code can still observe
the page after that point.

Matthew Wilcox

Jun 7, 2021, 2:04:24 PM6/7/21
to Jann Horn, Linux-MM, Zi Yan, Peter Xu, Kirill A. Shutemov, Konstantin Khlebnikov, Andrew Morton, chinwe...@mediatek.com, kernel list, syzkaller-bugs, Vlastimil Babka, Michel Lespinasse, syzbot
On Mon, Jun 07, 2021 at 07:27:23PM +0200, Jann Horn wrote:
> === Short summary ===
> I believe the issue here is a race between /proc/*/smaps and
> split_huge_page_to_list():
>
> The codepath for /proc/*/smaps walks the pagetables and (e.g. in
> smaps_account()) calls page_mapcount() not just on pages from normal
> PTEs but also on migration entries (since commit b1d4d9e0cbd0a
> "proc/smaps: carefully handle migration entries", from Linux v3.5).
> page_mapcount() expects compound pages to be stable.
>
> The split_huge_page_to_list() path first protects the compound page by
> locking it and replacing all its PTEs with migration entries (since
> the THP rewrite in v4.5, I think?), then does the actual splitting
> using __split_huge_page().
>
> So there's a mismatch of expectations here:
> The smaps code expects that migration entries point to stable compound
> pages, while the THP code expects that it's okay to split a compound
> page while it has migration entries.

Will it be a colossal performance penalty if we always get the page
refcount after looking it up? That will cause split_huge_page() to
fail to split the page if it hits this race.

Jann Horn

Jun 7, 2021, 3:55:37 PM6/7/21
to Matthew Wilcox, Linux-MM, Zi Yan, Peter Xu, Kirill A. Shutemov, Konstantin Khlebnikov, Andrew Morton, chinwe...@mediatek.com, kernel list, syzkaller-bugs, Vlastimil Babka, Michel Lespinasse, syzbot
Hmm - but with that approach I'm not sure you could even easily take a
refcount on a page whose refcount may be frozen and which may be in
the middle of being shattered? get_page_unless_zero() is wrong because
you can't take references on tail pages, right? (Or can you?) And
try_get_page() is wrong because it bugs out if the refcount is zero -
and even if it didn't do that, you might end up holding a reference on
the head page while the page you're actually interested in is a tail
page?

I guess if it was really necessary, it'd be possible to do some kind
of retry thing that grabs a reference on the compound head, then
checks that the tail page is still associated with the compound head,
and if not, drops the compound head and tries again?

Matthew Wilcox

Jun 7, 2021, 4:21:51 PM6/7/21
to Jann Horn, Linux-MM, Zi Yan, Peter Xu, Kirill A. Shutemov, Konstantin Khlebnikov, Andrew Morton, chinwe...@mediatek.com, kernel list, syzkaller-bugs, Vlastimil Babka, Michel Lespinasse, syzbot
Right; that's how get_user_pages_fast() works -- see
try_get_compound_head(). If it can't get the reference, it just fails.
I suspect for smaps, we can just choose to not count the page. It'll be
an inaccuracy in the stats, but I don't think that's a big deal.
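
For illustration only, here is a rough sketch (untested, not proposed
code; the helper name is made up) of that take-a-reference-then-recheck
pattern, in the spirit of try_get_compound_head() in mm/gup.c:

static struct page *smaps_try_get_head(struct page *page)
{
	struct page *head = compound_head(page);

	/* Refcount frozen (or page gone): a split is in progress, give up. */
	if (!get_page_unless_zero(head))
		return NULL;

	/* Re-check that the tail still belongs to this head after the get. */
	if (unlikely(compound_head(page) != head)) {
		put_page(head);
		return NULL;
	}
	return head;
}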

Kirill A. Shutemov

Jun 7, 2021, 4:49:30 PM6/7/21
to Jann Horn, Matthew Wilcox, Linux-MM, Zi Yan, Peter Xu, Kirill A. Shutemov, Konstantin Khlebnikov, Andrew Morton, chinwe...@mediatek.com, kernel list, syzkaller-bugs, Vlastimil Babka, Michel Lespinasse, syzbot
On Mon, Jun 07, 2021 at 07:27:23PM +0200, Jann Horn wrote:
> 2. let smaps assume that the mapcount is 1 for all migration entries?

I believe that's effectively what we already do for migration entries
pointing to non-compound pages:

	for (i = 0; i < nr; i++, page++) {
		int mapcount = page_mapcount(page);
		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
		if (mapcount >= 2)
			pss /= mapcount;
		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
				      mapcount < 2);
	}

For non-compound pages with page_count(page) != 1 (== 1 handled
separately) we would have nr == 1 and will look into mapcount, which for
pages under migration is 0. The code above will handle mapcount == 0 as
mapcount == 1. I think it would not be a stretch to do the same for
compound pages here.

I guess we should add an additional argument to smaps_account() which would
indicate that we are dealing with a migration entry and handle it as
mapcount == 1.

Hm. Do we need the same for device-private entries?

--
Kirill A. Shutemov

syzbot

Dec 21, 2021, 12:24:21 PM12/21/21
to ak...@linux-foundation.org, apo...@nvidia.com, chinwe...@mediatek.com, fghee...@gmail.com, ja...@google.com, khleb...@yandex-team.ru, kirill....@linux.intel.com, kir...@shutemov.name, linux-...@vger.kernel.org, linux-...@vger.kernel.org, linu...@kvack.org, pet...@redhat.com, pet...@infradead.org, syzkall...@googlegroups.com, tonymaris...@yandex.com, vba...@suse.cz, wal...@google.com, wi...@infradead.org, z...@nvidia.com
syzbot has found a reproducer for the following issue on:

HEAD commit: 6e0567b73052 Merge tag 'for-linus' of git://git.kernel.org..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=14c192b3b00000
kernel config: https://syzkaller.appspot.com/x/.config?x=ae22d1ee4fbca18
dashboard link: https://syzkaller.appspot.com/bug?extid=1f52b3a18d5633fa7f82
compiler: gcc (Debian 10.2.1-6) 10.2.1 20210110, GNU ld (GNU Binutils for Debian) 2.35.2
syz repro: https://syzkaller.appspot.com/x/repro.syz?x=133200fdb00000
C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17c3102db00000

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+1f52b3...@syzkaller.appspotmail.com

__mmput+0x122/0x4b0 kernel/fork.c:1113
mmput+0x56/0x60 kernel/fork.c:1134
exit_mm kernel/exit.c:507 [inline]
do_exit+0xb27/0x2b40 kernel/exit.c:819
do_group_exit+0x125/0x310 kernel/exit.c:929
get_signal+0x47d/0x2220 kernel/signal.c:2852
arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868
handle_signal_work kernel/entry/common.c:148 [inline]
exit_to_user_mode_loop kernel/entry/common.c:172 [inline]
exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207
__syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline]
syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300
do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86
entry_SYSCALL_64_after_hwframe+0x44/0xae
------------[ cut here ]------------
kernel BUG at include/linux/page-flags.h:785!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 4392 Comm: syz-executor560 Not tainted 5.16.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
page_mapcount include/linux/mm.h:837 [inline]
smaps_account+0x470/0xb10 fs/proc/task_mmu.c:466
smaps_pte_entry fs/proc/task_mmu.c:538 [inline]
smaps_pte_range+0x611/0x1250 fs/proc/task_mmu.c:601
walk_pmd_range mm/pagewalk.c:128 [inline]
walk_pud_range mm/pagewalk.c:205 [inline]
walk_p4d_range mm/pagewalk.c:240 [inline]
walk_pgd_range mm/pagewalk.c:277 [inline]
__walk_page_range+0xe23/0x1ea0 mm/pagewalk.c:379
walk_page_vma+0x277/0x350 mm/pagewalk.c:530
smap_gather_stats.part.0+0x148/0x260 fs/proc/task_mmu.c:768
smap_gather_stats fs/proc/task_mmu.c:741 [inline]
show_smap+0xc6/0x440 fs/proc/task_mmu.c:822
seq_read_iter+0xbb0/0x1240 fs/seq_file.c:272
seq_read+0x3e0/0x5b0 fs/seq_file.c:162
vfs_read+0x1b5/0x600 fs/read_write.c:479
ksys_read+0x12d/0x250 fs/read_write.c:619
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7faa2af6c969
Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 11 15 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007faa2aefd288 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
RAX: ffffffffffffffda RBX: 00007faa2aff4418 RCX: 00007faa2af6c969
RDX: 0000000000002025 RSI: 0000000020000100 RDI: 0000000000000003
RBP: 00007faa2aff4410 R08: 00007faa2aefd700 R09: 0000000000000000
R10: 00007faa2aefd700 R11: 0000000000000246 R12: 00007faa2afc20ac
R13: 00007fff7e6632bf R14: 00007faa2aefd400 R15: 0000000000022000
</TASK>
Modules linked in:
---[ end trace 24ec93ff95e4ac3d ]---
RIP: 0010:PageDoubleMap include/linux/page-flags.h:785 [inline]
RIP: 0010:__page_mapcount+0x2d2/0x350 mm/util.c:744
Code: e8 d3 16 d1 ff 48 c7 c6 c0 00 b6 89 48 89 ef e8 94 4e 04 00 0f 0b e8 bd 16 d1 ff 48 c7 c6 60 01 b6 89 48 89 ef e8 7e 4e 04 00 <0f> 0b e8 a7 16 d1 ff 48 c7 c6 a0 01 b6 89 4c 89 f7 e8 68 4e 04 00
RSP: 0018:ffffc90002b6f7b8 EFLAGS: 00010293
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: ffff888019619d00 RSI: ffffffff81a68c12 RDI: 0000000000000003
RBP: ffffea0001bdc2c0 R08: 0000000000000029 R09: 00000000ffffffff
R10: ffffffff8903e29f R11: 00000000ffffffff R12: 00000000ffffffff
R13: 00000000ffffea00 R14: ffffc90002b6fb30 R15: ffffea0001bd8001
FS: 00007faa2aefd700(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff7e663318 CR3: 0000000018c6e000 CR4: 00000000003506e0

Yang Shi

Dec 21, 2021, 1:24:40 PM12/21/21
to syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, wal...@google.com, Matthew Wilcox, Zi Yan
It seems the THP is split during smaps walk. The reproducer does call
MADV_FREE on partial THP which may split the huge page.

The below fix (untested) should be able to fix it.

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad667dbc96f5..97feb15a2448 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -433,6 +433,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
+	struct page *head = compound_head(page);
 
 	/*
 	 * First accumulate quantities that depend only on |size| and the type
@@ -462,6 +463,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 			      locked, true);
 		return;
 	}
+
+	/* Lost the race with THP split */
+	if (!get_page_unless_zero(head))
+		return;
+
 	for (i = 0; i < nr; i++, page++) {
 		int mapcount = page_mapcount(page);
 		unsigned long pss = PAGE_SIZE << PSS_SHIFT;
@@ -470,6 +476,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 		smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked,
 				      mapcount < 2);
 	}
+
+	put_page(head);

Matthew Wilcox

Dec 21, 2021, 1:40:35 PM12/21/21
to Yang Shi, syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, wal...@google.com, Zi Yan
On Tue, Dec 21, 2021 at 10:24:27AM -0800, Yang Shi wrote:
> It seems the THP is split during smaps walk. The reproducer does call
> MADV_FREE on partial THP which may split the huge page.
>
> The below fix (untested) should be able to fix it.

Did you read the rest of the thread on this? If the page is being
migrated, we should still account it ... also, you've changed the
refcount, so this:

	if (page_count(page) == 1) {
		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
			locked, true);
		return;
	}

will never trigger.

Yang Shi

Dec 21, 2021, 2:07:16 PM12/21/21
to Matthew Wilcox, syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, wal...@google.com, Zi Yan
On Tue, Dec 21, 2021 at 10:40 AM Matthew Wilcox <wi...@infradead.org> wrote:
>
> On Tue, Dec 21, 2021 at 10:24:27AM -0800, Yang Shi wrote:
> > It seems the THP is split during smaps walk. The reproducer does call
> > MADV_FREE on partial THP which may split the huge page.
> >
> > The below fix (untested) should be able to fix it.
>
> Did you read the rest of the thread on this? If the page is being
> migrated, we should still account it ... also, you've changed the

Yes, pages that are being migrated may be skipped. We should be able to add
a new flag to smaps_account() to indicate this is a migration entry, and
then not elevate the page count.

> refcount, so this:
>
> if (page_count(page) == 1) {
> smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
> locked, true);
> return;
> }
>
> will never trigger.

The get_page_unless_zero() is called after this block.

Yang Shi

Dec 21, 2021, 8:43:03 PM12/21/21
to Matthew Wilcox, syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, Zi Yan
On Tue, Dec 21, 2021 at 11:07 AM Yang Shi <shy8...@gmail.com> wrote:
>
> On Tue, Dec 21, 2021 at 10:40 AM Matthew Wilcox <wi...@infradead.org> wrote:
> >
> > On Tue, Dec 21, 2021 at 10:24:27AM -0800, Yang Shi wrote:
> > > It seems the THP is split during smaps walk. The reproducer does call
> > > MADV_FREE on partial THP which may split the huge page.
> > >
> > > The below fix (untested) should be able to fix it.
> >
> > Did you read the rest of the thread on this? If the page is being
> > migrated, we should still account it ... also, you've changed the
>
> Yes, the being migrated pages may be skipped. We should be able to add
> a new flag to smaps_account() to indicate this is a migration entry
> then don't elevate the page count.

It seems it is not that straightforward. THP split converts PTEs to
migration entries too, so we can't tell whether it is a real migration
or just the middle of a THP split.

We just need to serialize against THP split for PTE-mapped subpages.
So in a real-life workload it might be OK to skip accounting migration
pages? Typically migration is a transient state, so the under-accounting
should be transient too. Or account migration pages separately, just
like swap entries?

I may revisit this after the holiday. If you have any better ideas,
please feel free to propose them.

Yang Shi

Jan 5, 2022, 2:05:57 PM1/5/22
to Matthew Wilcox, syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, wal...@google.com, Zi Yan
On Tue, Dec 21, 2021 at 10:40 AM Matthew Wilcox <wi...@infradead.org> wrote:
>
> On Tue, Dec 21, 2021 at 10:24:27AM -0800, Yang Shi wrote:
> > It seems the THP is split during smaps walk. The reproducer does call
> > MADV_FREE on partial THP which may split the huge page.
> >
> > The below fix (untested) should be able to fix it.
>
> Did you read the rest of the thread on this? If the page is being

I just revisited this. Now I see what you mean about "the rest of the
thread". My gmail client doesn't put them in the same thread, sigh...

Yeah, try_get_compound_head() seems like the right way.

Or we could simply treat migration entries as mapcount == 1 as Kirill
suggested, or just skip migration entries since they are transient, or
show migration entries separately.

Yang Shi

Jan 11, 2022, 6:14:25 PM1/11/22
to Matthew Wilcox, syzbot, Andrew Morton, Alistair Popple, chinwe...@mediatek.com, fghee...@gmail.com, Jann Horn, Konstantin Khlebnikov, Kirill A. Shutemov, Kirill A. Shutemov, Linux FS-devel Mailing List, Linux Kernel Mailing List, Linux MM, Peter Xu, Peter Zijlstra, syzkall...@googlegroups.com, tonymaris...@yandex.com, Vlastimil Babka, Zi Yan
On Wed, Jan 5, 2022 at 11:05 AM Yang Shi <shy8...@gmail.com> wrote:
>
> On Tue, Dec 21, 2021 at 10:40 AM Matthew Wilcox <wi...@infradead.org> wrote:
> >
> > On Tue, Dec 21, 2021 at 10:24:27AM -0800, Yang Shi wrote:
> > > It seems the THP is split during smaps walk. The reproducer does call
> > > MADV_FREE on partial THP which may split the huge page.
> > >
> > > The below fix (untested) should be able to fix it.
> >
> > Did you read the rest of the thread on this? If the page is being
>
> I just revisited this. Now I see what you mean about "the rest of the
> thread". My gmail client doesn't put them in the same thread, sigh...
>
> Yeah, try_get_compound_head() seems like the right way.
>
> Or we could simply treat migration entries as mapcount == 1 as Kirill
> suggested, or just skip migration entries since they are transient, or
> show migration entries separately.

I think Kirill's suggestion makes some sense. The migration entry's
mapcount is 0, so "pss /= mapcount" is not called at all, and the
migration entry is effectively treated like mapcount == 1. This doesn't
change the behavior. Unlike a swap entry, we actually can't tell how
many references there are to a migration entry.

But we should handle device-private entries differently since their
mapcount is inc'ed when they are shared between processes. A regular
migration entry can easily be identified by is_migration_entry().
Using try_get_compound_head() seems like overkill IMHO.

I just came up with the patch below (it builds, and running the
reproducer didn't trigger the bug for me so far). If it looks fine, I
will submit it as a formal patch with more comments.

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad667dbc96f5..6a48bbb51efa 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -429,7 +429,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
 }
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		bool compound, bool young, bool dirty, bool locked)
+		bool compound, bool young, bool dirty, bool locked,
+		bool migration)
 {
 	int i, nr = compound ? compound_nr(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
@@ -457,7 +458,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	 * If any subpage of the compound page mapped with PTE it would elevate
 	 * page_count().
 	 */
-	if (page_count(page) == 1) {
+	if ((page_count(page) == 1) || migration) {
 		smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
 			locked, true);
 		return;
@@ -506,6 +507,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pte_present(*pte)) {
 		page = vm_normal_page(vma, addr, *pte);
@@ -525,8 +527,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			} else {
 				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
 			}
-		} else if (is_pfn_swap_entry(swpent))
+		} else if (is_pfn_swap_entry(swpent)) {
+			if (is_migration_entry(swpent))
+				migration = true;
 			page = pfn_swap_entry_to_page(swpent);
+		}
 	} else {
 		smaps_pte_hole_lookup(addr, walk);
 		return;
@@ -535,7 +540,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 	if (!page)
 		return;
 
-	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+		      locked, migration);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -546,6 +552,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	struct vm_area_struct *vma = walk->vma;
 	bool locked = !!(vma->vm_flags & VM_LOCKED);
 	struct page *page = NULL;
+	bool migration = false;
 
 	if (pmd_present(*pmd)) {
 		/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -553,8 +560,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
-		if (is_migration_entry(entry))
+		if (is_migration_entry(entry)) {
+			migration = true;
 			page = pfn_swap_entry_to_page(entry);
+		}
 	}
 	if (IS_ERR_OR_NULL(page))
 		return;
@@ -566,7 +575,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		/* pass */;
 	else
 		mss->file_thp += HPAGE_PMD_SIZE;
-	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+		      locked, migration);
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,