On Fri, Feb 06, 2026 at 10:18:38AM +0100, Peter Zijlstra wrote:
> > Because `perf_mmap_rb()` does not hold the `perf_event_context` lock
> > (`ctx->lock`), which is the intended protection for these timing
> > fields, it races with the `event_sched_out()` path (which does hold
> > `ctx->lock`).
> >
> > The race on `total_time_enabled` and `total_time_running` involves
> > non-atomic read-modify-write operations. If both threads read the same
> > old value of `total_time_enabled` before either writes back the
> > updated value, one of the updates (representing a chunk of time the
> > event was enabled) will be lost. Additionally, the race on
> > `event->tstamp` can leave the timestamp and the total time counters
> > out of sync, causing further errors in subsequent time calculations.
>
> Yeah, fair enough. Let me go stare at that.
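
To make the lost update concrete, here is a stand-alone user-space sketch
of the interleaving (plain pthreads, made-up names, not the kernel code;
the real fields live in struct perf_event and are supposed to be
serialized by ctx->lock):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for perf_event::total_time_enabled, normally under ctx->lock */
static uint64_t total_time_enabled;

/* each "CPU" folds its delta into the counter without taking the lock */
static void *fold_delta(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000000; i++) {
		/* non-atomic read-modify-write, like perf_event_update_time() */
		uint64_t old = total_time_enabled;
		total_time_enabled = old + 1;	/* can overwrite the other thread's store */
	}
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, fold_delta, NULL);
	pthread_create(&b, NULL, fold_delta, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* anything below 2000000 is enabled time that went missing */
	printf("total_time_enabled = %llu (expected 2000000)\n",
	       (unsigned long long)total_time_enabled);
	return 0;
}

Same shape here: perf_event_update_time() does that read-modify-write from
both the mmap path and event_sched_out(), and only the latter holds
ctx->lock.
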
I ended up with the below. It boots and passes 'perf test' with lockdep
on. No further testing was done.

Can you throw this at the robot?
---
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5b5cb620499e..a5b724cb6b42 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1356,7 +1356,9 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::lock
* mmap_lock
* perf_event::mmap_mutex
+ * perf_buffer::event_lock
* perf_buffer::aux_mutex
+ * perf_event_context::lock
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -1582,6 +1584,8 @@ static u64 perf_event_time(struct perf_event *event)
if (unlikely(!ctx))
return 0;
+ lockdep_assert_held(&ctx->lock);
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
@@ -6157,9 +6161,15 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
static void _perf_event_reset(struct perf_event *event)
{
+ /*
+ * Must disable PMU to stop the event from triggering during
+ * perf_event_update_userpage().
+ */
+ perf_pmu_disable(event->pmu);
(void)perf_event_read(event, false);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
+ perf_pmu_enable(event->pmu);
}
/* Assume it's not an event with inherit set. */
@@ -6504,15 +6514,9 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void perf_event_init_userpage(struct perf_event *event)
+static void perf_event_init_userpage(struct perf_event *event, struct perf_buffer *rb)
{
struct perf_event_mmap_page *userpg;
- struct perf_buffer *rb;
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
userpg = rb->user_page;
@@ -6521,9 +6525,6 @@ static void perf_event_init_userpage(struct perf_event *event)
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
userpg->data_offset = PAGE_SIZE;
userpg->data_size = perf_data_size(rb);
-
-unlock:
- rcu_read_unlock();
}
void __weak arch_perf_update_userpage(
@@ -6536,17 +6537,11 @@ void __weak arch_perf_update_userpage(
* the seqlock logic goes bad. We can not serialize this because the arch
* code calls this from NMI context.
*/
-void perf_event_update_userpage(struct perf_event *event)
+static void __perf_event_update_userpage(struct perf_event *event, struct perf_buffer *rb)
{
struct perf_event_mmap_page *userpg;
- struct perf_buffer *rb;
u64 enabled, running, now;
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
@@ -6582,7 +6577,16 @@ void perf_event_update_userpage(struct perf_event *event)
barrier();
++userpg->lock;
preempt_enable();
-unlock:
+}
+
+void perf_event_update_userpage(struct perf_event *event)
+{
+ struct perf_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (rb)
+ __perf_event_update_userpage(event, rb);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
@@ -6978,6 +6982,7 @@ static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
+ struct perf_event_context *ctx = event->ctx;
long extra = 0, user_extra = nr_pages;
struct perf_buffer *rb;
int rb_flags = 0;
@@ -7032,11 +7037,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
- ring_buffer_attach(event, rb);
+ scoped_guard (raw_spinlock_irq, &ctx->lock) {
+ ctx_time_update_event(ctx, event);
+ perf_event_update_time(event);
+ }
- perf_event_update_time(event);
- perf_event_init_userpage(event);
- perf_event_update_userpage(event);
+ /*
+ * Initialize before setting event->rb to ensure it cannot nest
+ * if the event is already active.
+ */
+ perf_event_init_userpage(event, rb);
+ __perf_event_update_userpage(event, rb);
+
+ ring_buffer_attach(event, rb);
perf_mmap_account(vma, user_extra, extra);
refcount_set(&event->mmap_count, 1);