[syzbot] [perf?] KCSAN: data-race in perf_event_set_state / perf_mmap_rb

0 views
Skip to first unread message

syzbot

unread,
2:36 AM (7 hours ago) 2:36 AM
to ac...@kernel.org, adrian...@intel.com, alexander...@linux.intel.com, iro...@google.com, james...@linaro.org, jo...@kernel.org, linux-...@vger.kernel.org, linux-pe...@vger.kernel.org, mark.r...@arm.com, mi...@redhat.com, namh...@kernel.org, pet...@infradead.org, syzkall...@googlegroups.com
Hello,

syzbot found the following issue on:

HEAD commit: c537e12daeec Merge tag 'bpf-fixes' of git://git.kernel.org..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1133a5fc580000
kernel config: https://syzkaller.appspot.com/x/.config?x=c160236e1ef1e401
dashboard link: https://syzkaller.appspot.com/bug?extid=5334e6bdc43f6d1dcb7d
compiler: Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/036ac5d12a14/disk-c537e12d.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/07ddd15f46f8/vmlinux-c537e12d.xz
kernel image: https://storage.googleapis.com/syzbot-assets/7866e67b7a58/bzImage-c537e12d.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+5334e6...@syzkaller.appspotmail.com

==================================================================
BUG: KCSAN: data-race in perf_event_set_state / perf_mmap_rb

write to 0xffff88812279f1a0 of 8 bytes by task 12011 on cpu 1:
perf_event_update_time kernel/events/core.c:737 [inline]
perf_mmap_rb+0x71c/0x910 kernel/events/core.c:7037
perf_mmap+0x1ce/0x2f0 kernel/events/core.c:7164
vfs_mmap include/linux/fs.h:2053 [inline]
mmap_file mm/internal.h:167 [inline]
__mmap_new_file_vma mm/vma.c:2421 [inline]
__mmap_new_vma mm/vma.c:2484 [inline]
__mmap_region mm/vma.c:2708 [inline]
mmap_region+0x1045/0x1410 mm/vma.c:2786
do_mmap+0x9b3/0xbe0 mm/mmap.c:558
vm_mmap_pgoff+0x17a/0x2e0 mm/util.c:581
ksys_mmap_pgoff+0x268/0x310 mm/mmap.c:604
x64_sys_call+0x16bb/0x3000 arch/x86/include/generated/asm/syscalls_64.h:10
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xca/0x2b0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

read to 0xffff88812279f1a0 of 8 bytes by task 12005 on cpu 0:
__perf_update_times kernel/events/core.c:720 [inline]
perf_event_update_time kernel/events/core.c:735 [inline]
perf_event_set_state+0x153/0x440 kernel/events/core.c:754
event_sched_out+0x2d4/0x4d0 kernel/events/core.c:2391
group_sched_out kernel/events/core.c:2415 [inline]
__pmu_ctx_sched_out+0x3e7/0x530 kernel/events/core.c:3458
ctx_sched_out+0x273/0x2d0 kernel/events/core.c:3539
task_ctx_sched_out+0x4d/0x70 kernel/events/core.c:2859
perf_event_context_sched_out kernel/events/core.c:3746 [inline]
__perf_event_task_sched_out+0x286/0x370 kernel/events/core.c:3846
perf_event_task_sched_out include/linux/perf_event.h:1654 [inline]
prepare_task_switch kernel/sched/core.c:5045 [inline]
context_switch kernel/sched/core.c:5201 [inline]
__schedule+0xbf0/0xcd0 kernel/sched/core.c:6863
__schedule_loop kernel/sched/core.c:6945 [inline]
schedule+0x5f/0xd0 kernel/sched/core.c:6960
schedule_preempt_disabled+0x10/0x20 kernel/sched/core.c:7017
__mutex_lock_common kernel/locking/mutex.c:692 [inline]
__mutex_lock+0x4ff/0xe20 kernel/locking/mutex.c:776
__mutex_lock_slowpath+0xa/0x10 kernel/locking/mutex.c:1065
mutex_lock+0x89/0x90 kernel/locking/mutex.c:290
perf_poll+0x180/0x1f0 kernel/events/core.c:6150
vfs_poll include/linux/poll.h:82 [inline]
select_poll_one fs/select.c:480 [inline]
do_select+0x8f1/0xf40 fs/select.c:536
core_sys_select+0x3dc/0x6e0 fs/select.c:677
do_pselect fs/select.c:759 [inline]
__do_sys_pselect6 fs/select.c:798 [inline]
__se_sys_pselect6+0x213/0x280 fs/select.c:789
__x64_sys_pselect6+0x78/0x90 fs/select.c:789
x64_sys_call+0x2e98/0x3000 arch/x86/include/generated/asm/syscalls_64.h:271
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0xca/0x2b0 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f

value changed: 0x000000000038c145 -> 0x00000000003929d3

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 12005 Comm: syz.4.2772 Tainted: G W syzkaller #0 PREEMPT(voluntary)
Tainted: [W]=WARN
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzk...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

Dmitry Vyukov

unread,
2:38 AM (7 hours ago) 2:38 AM
to syzbot, ac...@kernel.org, adrian...@intel.com, alexander...@linux.intel.com, iro...@google.com, james...@linaro.org, jo...@kernel.org, linux-...@vger.kernel.org, linux-pe...@vger.kernel.org, mark.r...@arm.com, mi...@redhat.com, namh...@kernel.org, pet...@infradead.org, syzkall...@googlegroups.com
LLM concluded this is a harmful race:

======

Because `perf_mmap_rb()` does not hold the `perf_event_context` lock
(`ctx->lock`), which is the intended protection for these timing
fields, it races with the `event_sched_out()` path (which does hold
`ctx->lock`).

The race on `total_time_enabled` and `total_time_running` involves
non-atomic read-modify-write operations. If both threads read the same
old value of `total_time_enabled` before either writes back the
updated value, one of the updates (representing a chunk of time the
event was enabled) will be lost. Additionally, the race on
`event->tstamp` can lead to inconsistent state where the timestamp and
the total time counters are out of sync, causing further errors in
subsequent time calculations.

### Conclusion

This data race is **harmful**. While it does not lead to an immediate
kernel crash or memory corruption, it causes permanent inaccuracy in
the performance counters. For a subsystem dedicated to high-precision
performance monitoring, lost timing updates constitute a significant
functional bug. The race is also highly reproducible in scenarios
where one task maps an event while another task using the same event
is forced to schedule out (e.g., by blocking on the same
`mmap_mutex`).

Peter Zijlstra

unread,
4:18 AM (5 hours ago) 4:18 AM
to Dmitry Vyukov, syzbot, ac...@kernel.org, adrian...@intel.com, alexander...@linux.intel.com, iro...@google.com, james...@linaro.org, jo...@kernel.org, linux-...@vger.kernel.org, linux-pe...@vger.kernel.org, mark.r...@arm.com, mi...@redhat.com, namh...@kernel.org, syzkall...@googlegroups.com
Yeah, fair enough. Let me go stare at that.

Peter Zijlstra

unread,
5:33 AM (4 hours ago) 5:33 AM
to Dmitry Vyukov, syzbot, ac...@kernel.org, adrian...@intel.com, alexander...@linux.intel.com, iro...@google.com, james...@linaro.org, jo...@kernel.org, linux-...@vger.kernel.org, linux-pe...@vger.kernel.org, mark.r...@arm.com, mi...@redhat.com, namh...@kernel.org, syzkall...@googlegroups.com
On Fri, Feb 06, 2026 at 10:18:38AM +0100, Peter Zijlstra wrote:

> > Because `perf_mmap_rb()` does not hold the `perf_event_context` lock
> > (`ctx->lock`), which is the intended protection for these timing
> > fields, it races with the `event_sched_out()` path (which does hold
> > `ctx->lock`).
> >
> > The race on `total_time_enabled` and `total_time_running` involves
> > non-atomic read-modify-write operations. If both threads read the same
> > old value of `total_time_enabled` before either writes back the
> > updated value, one of the updates (representing a chunk of time the
> > event was enabled) will be lost. Additionally, the race on
> > `event->tstamp` can lead to inconsistent state where the timestamp and
> > the total time counters are out of sync, causing further errors in
> > subsequent time calculations.
>
> Yeah, fair enough. Let me go stare at that.

I ended up with the below. It boots and passes 'perf test' with lockdep
on. No further testing was done.

Can you throw this at the robot?

---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..a5b724cb6b42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1356,7 +1356,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* mmap_lock
* perf_event::mmap_mutex
+ * perf_buffer::event_lock
* perf_buffer::aux_mutex
+ * perf_event_context::lock
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -1582,6 +1584,8 @@ static u64 perf_event_time(struct perf_event *event)
if (unlikely(!ctx))
return 0;

+ lockdep_assert_held(&ctx->lock);
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);

@@ -6157,9 +6161,15 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)

static void _perf_event_reset(struct perf_event *event)
{
+ /*
+ * Must disable PMU to stop the event from triggering during
+ * perf_event_update_userpage().
+ */
+ perf_pmu_disable(event->pmu);
(void)perf_event_read(event, false);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
+ perf_pmu_enable(event->pmu);
}

/* Assume it's not an event with inherit set. */
@@ -6504,15 +6514,9 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}

-static void perf_event_init_userpage(struct perf_event *event)
+static void perf_event_init_userpage(struct perf_event *event, struct perf_buffer *rb)
{
struct perf_event_mmap_page *userpg;
- struct perf_buffer *rb;
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;

userpg = rb->user_page;

@@ -6521,9 +6525,6 @@ static void perf_event_init_userpage(struct perf_event *event)
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
userpg->data_offset = PAGE_SIZE;
userpg->data_size = perf_data_size(rb);
-
-unlock:
- rcu_read_unlock();
}

void __weak arch_perf_update_userpage(
@@ -6536,17 +6537,11 @@ void __weak arch_perf_update_userpage(
* the seqlock logic goes bad. We can not serialize this because the arch
* code calls this from NMI context.
*/
-void perf_event_update_userpage(struct perf_event *event)
+static void __perf_event_update_userpage(struct perf_event *event, struct perf_buffer *rb)
{
struct perf_event_mmap_page *userpg;
- struct perf_buffer *rb;
u64 enabled, running, now;

- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
@@ -6582,7 +6577,16 @@ void perf_event_update_userpage(struct perf_event *event)
barrier();
++userpg->lock;
preempt_enable();
-unlock:
+}
+
+void perf_event_update_userpage(struct perf_event *event)
+{
+ struct perf_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb)
+ __perf_event_update_userpage(event, rb);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
@@ -6978,6 +6982,7 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
+ struct perf_event_context *ctx = event->ctx;
long extra = 0, user_extra = nr_pages;
struct perf_buffer *rb;
int rb_flags = 0;
@@ -7032,11 +7037,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;

- ring_buffer_attach(event, rb);
+ scoped_guard (raw_spinlock_irq, &ctx->lock) {
+ ctx_time_update_event(ctx, event);
+ perf_event_update_time(event);
+ }

- perf_event_update_time(event);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
+ /*
+ * Initialize before setting event->rb to ensure it cannot nest
+ * if the event is already active.
+ */
+ perf_event_init_userpage(event, rb);
+ __perf_event_update_userpage(event, rb);
+
+ ring_buffer_attach(event, rb);

perf_mmap_account(vma, user_extra, extra);
refcount_set(&event->mmap_count, 1);
Reply all
Reply to author
Forward
0 new messages