[syzbot] [kernel?] KCSAN: data-race in next_expiry_recalc / update_process_times (2)


syzbot

Aug 13, 2024, 4:40:27 PM
to anna-...@linutronix.de, fred...@kernel.org, linux-...@vger.kernel.org, syzkall...@googlegroups.com, tg...@linutronix.de
Hello,

syzbot found the following issue on:

HEAD commit: 6b4aa469f049 Merge tag '6.11-rc3-ksmbd-fixes' of git://git..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=157bd96b980000
kernel config: https://syzkaller.appspot.com/x/.config?x=31ece081c16313f0
dashboard link: https://syzkaller.appspot.com/bug?extid=bf285fcc0a048e028118
compiler: Debian clang version 15.0.6, GNU ld (GNU Binutils for Debian) 2.40

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/cf019ab0b1a3/disk-6b4aa469.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/b052d8a52fbd/vmlinux-6b4aa469.xz
kernel image: https://storage.googleapis.com/syzbot-assets/07bf313382f0/bzImage-6b4aa469.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+bf285f...@syzkaller.appspotmail.com

==================================================================
BUG: KCSAN: data-race in next_expiry_recalc / update_process_times

write to 0xffff888237c1de58 of 8 bytes by interrupt on cpu 1:
next_expiry_recalc+0x187/0x1e0 kernel/time/timer.c:1967
__run_timers kernel/time/timer.c:2414 [inline]
__run_timer_base+0x2ee/0x640 kernel/time/timer.c:2428
timer_expire_remote+0x2f/0x40 kernel/time/timer.c:2180
tmigr_handle_remote_cpu kernel/time/timer_migration.c:930 [inline]
tmigr_handle_remote_up kernel/time/timer_migration.c:1021 [inline]
__walk_groups kernel/time/timer_migration.c:533 [inline]
tmigr_handle_remote+0x4f6/0x940 kernel/time/timer_migration.c:1080
run_timer_softirq+0x5f/0x70 kernel/time/timer.c:2451
handle_softirqs+0xc3/0x280 kernel/softirq.c:554
__do_softirq kernel/softirq.c:588 [inline]
invoke_softirq kernel/softirq.c:428 [inline]
__irq_exit_rcu kernel/softirq.c:637 [inline]
irq_exit_rcu+0x3e/0x90 kernel/softirq.c:649
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline]
sysvec_apic_timer_interrupt+0x73/0x80 arch/x86/kernel/apic/apic.c:1043
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
syscall_enter_from_user_mode_work include/linux/entry-common.h:165 [inline]
syscall_enter_from_user_mode include/linux/entry-common.h:198 [inline]
do_syscall_64+0x9a/0x1c0 arch/x86/entry/common.c:79
entry_SYSCALL_64_after_hwframe+0x77/0x7f

read to 0xffff888237c1de58 of 8 bytes by interrupt on cpu 0:
run_local_timers kernel/time/timer.c:2466 [inline]
update_process_times+0x8a/0x180 kernel/time/timer.c:2484
tick_sched_handle kernel/time/tick-sched.c:276 [inline]
tick_nohz_handler+0x250/0x2d0 kernel/time/tick-sched.c:297
__run_hrtimer kernel/time/hrtimer.c:1689 [inline]
__hrtimer_run_queues+0x20d/0x5e0 kernel/time/hrtimer.c:1753
hrtimer_interrupt+0x210/0x7b0 kernel/time/hrtimer.c:1815
local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1032 [inline]
__sysvec_apic_timer_interrupt+0x5c/0x1a0 arch/x86/kernel/apic/apic.c:1049
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline]
sysvec_apic_timer_interrupt+0x6e/0x80 arch/x86/kernel/apic/apic.c:1043
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
native_safe_halt arch/x86/include/asm/irqflags.h:48 [inline]
arch_safe_halt arch/x86/include/asm/irqflags.h:106 [inline]
acpi_safe_halt+0x21/0x30 drivers/acpi/processor_idle.c:111
acpi_idle_do_entry+0x1d/0x30 drivers/acpi/processor_idle.c:568
acpi_idle_enter+0x96/0xb0 drivers/acpi/processor_idle.c:702
cpuidle_enter_state+0xcf/0x270 drivers/cpuidle/cpuidle.c:267
cpuidle_enter+0x40/0x70 drivers/cpuidle/cpuidle.c:388
call_cpuidle kernel/sched/idle.c:155 [inline]
cpuidle_idle_call kernel/sched/idle.c:230 [inline]
do_idle+0x195/0x230 kernel/sched/idle.c:326
cpu_startup_entry+0x25/0x30 kernel/sched/idle.c:424
rest_init+0xef/0xf0 init/main.c:747
start_kernel+0x581/0x5e0 init/main.c:1103
x86_64_start_reservations+0x2a/0x30 arch/x86/kernel/head64.c:507
x86_64_start_kernel+0x9a/0xa0 arch/x86/kernel/head64.c:488
common_startup_64+0x12c/0x137

value changed: 0x00000000fffff045 -> 0x00000000fffff048

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.11.0-rc3-syzkaller-00010-g6b4aa469f049 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 06/27/2024
==================================================================


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzk...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

Anna-Maria Behnsen

Aug 29, 2024, 12:15:24 PM
to linux-...@vger.kernel.org, Frederic Weisbecker, Thomas Gleixner, syzkall...@googlegroups.com
Global timers could be expired remotely when the target CPU is idle. After
a remote timer expiry, the remote timer_base->next_expiry value is updated
while holding the timer_base->lock. When the formerly idle CPU becomes
active at the same time and checks whether timers need to expire, this
check is done locklessly as it is on the local CPU. This could lead to a data
race, which was reported by syzbot:

https://lore.kernel.org/r/00000000000091...@google.com

When the value is read locklessly but changed by the remote CPU, only two
non-critical scenarios can happen:

1) The already updated value is read -> everything is perfect

2) The old value is read -> a superfluous timer soft interrupt is raised

The same situation can occur when a remote CPU enqueues a new first pinned
timer, again with only non-critical scenarios:

1) The already updated value is read -> everything is perfect

2) The old value is read -> if the CPU is idle, an IPI is executed
nevertheless; if the CPU isn't idle, the updated value becomes visible on
the next tick and the timer might be late by one jiffie.

As this is very unlikely to happen, always doing the check under the lock
would cost way more than a superfluous timer soft interrupt or a possible
one jiffie delay of the timer.

Document and annotate this non-critical behavior in the code by using a
READ_ONCE()/WRITE_ONCE() pair when accessing timer_base->next_expiry.

Reported-by: syzbot+bf285f...@syzkaller.appspotmail.com
Signed-off-by: Anna-Maria Behnsen <anna-...@linutronix.de>
Closes: https://lore.kernel.org/lkml/00000000000091...@google.com
---
kernel/time/timer.c | 41 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 18aa759c3cae..71b96a9bf6e8 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* Set the next expiry time and kick the CPU so it
* can reevaluate the wheel:
*/
- base->next_expiry = bucket_expiry;
+ WRITE_ONCE(base->next_expiry, bucket_expiry);
base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
@@ -1964,7 +1964,7 @@ static void next_expiry_recalc(struct timer_base *base)
clk += adj;
}

- base->next_expiry = next;
+ WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}
@@ -2018,7 +2018,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
+ WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);

return base->next_expiry;
}
@@ -2462,8 +2462,39 @@ static void run_local_timers(void)
hrtimer_run_queues();

for (int i = 0; i < NR_BASES; i++, base++) {
- /* Raise the softirq only if required. */
- if (time_after_eq(jiffies, base->next_expiry) ||
+ /*
+ * Raise the softirq only if required.
+ *
+ * timer_base::next_expiry can be written by a remote CPU while
+ * holding the lock. If this write happens at the same time as
+ * the lockless local read, the sanity checker could complain
+ * about data corruption.
+ *
+ * There are two possible situations where
+ * timer_base::next_expiry is written by a remote CPU:
+ *
+ * 1. Remote CPU expires global timers of this CPU and updates
+ * timer_base::next_expiry of BASE_LOCAL afterwards in
+ * next_timer_interrupt() or timer_recalc_next_expiry(). The
+ * worst outcome is a superfluous raise of the timer softirq
+ * when the not yet updated value is read.
+ *
+ * 2. A new first pinned timer is enqueued by a remote CPU and
+ * therefore timer_base::next_expiry of BASE_GLOBAL is
+ * updated. When this update is missed, this isn't a problem, as
+ * an IPI is executed nevertheless when the CPU was idle
+ * before. When the CPU wasn't idle but the update is missed,
+ * then the timer would expire one jiffie late - bad luck.
+ *
+ * Those unlikely corner cases, where the worst outcome is only
+ * a one jiffie delay or a superfluous raise of the softirq, are
+ * less expensive than always doing the check while holding the
+ * lock.
+ *
+ * Possible remote writers use WRITE_ONCE(), so the local reader
+ * uses READ_ONCE().
+ */
+ if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
(i == BASE_DEF && tmigr_requires_handle_remote())) {
raise_softirq(TIMER_SOFTIRQ);
return;
--
2.39.2

Frederic Weisbecker

Sep 1, 2024, 6:21:12 PM
to Anna-Maria Behnsen, linux-...@vger.kernel.org, Thomas Gleixner, syzkall...@googlegroups.com
Reviewed-by: Frederic Weisbecker <fred...@kernel.org>

Just a few nits:

> + * 1. Remote CPU expires global timers of this CPU and updates
> + * timer_base::next_expiry of BASE_LOCAL afterwards in

BASE_GLOBAL ?

> + * next_timer_interrupt() or timer_recalc_next_expiry(). The
> + * worst outcome is a superfluous raise of the timer softirq
> + * when the not yet updated value is read.
> + *
> + * 2. A new first pinned timer is enqueued by a remote CPU and
> + * therefore timer_base::next_expiry of BASE_GLOBAL is

BASE_LOCAL ?

Thanks.

Anna-Maria Behnsen

Sep 3, 2024, 2:55:54 AM
to Frederic Weisbecker, linux-...@vger.kernel.org, Thomas Gleixner, syzkall...@googlegroups.com
Thanks for the review. Yes, you are right, those base names should be
switched...

> Thanks.

Thanks,

Anna-Maria

Anna-Maria Behnsen

Sep 4, 2024, 5:13:57 AM
to linux-...@vger.kernel.org, Frederic Weisbecker, Thomas Gleixner, syzkall...@googlegroups.com
Global timers could be expired remotely when the target CPU is idle. After
a remote timer expiry, the remote timer_base->next_expiry value is updated
while holding the timer_base->lock. When the formerly idle CPU becomes
active at the same time and checks whether timers need to expire, this
check is done locklessly as it is on the local CPU. This could lead to a data
race, which was reported by syzbot:

https://lore.kernel.org/r/00000000000091...@google.com

When the value is read locklessly but changed by the remote CPU, only two
non-critical scenarios can happen:

1) The already updated value is read -> everything is perfect

2) The old value is read -> a superfluous timer soft interrupt is raised

The same situation can occur when a remote CPU enqueues a new first pinned
timer, again with only non-critical scenarios:

1) The already updated value is read -> everything is perfect

2) The old value is read -> if the CPU is idle, an IPI is executed
nevertheless; if the CPU isn't idle, the updated value becomes visible on
the next tick and the timer might be late by one jiffie.

As this is very unlikely to happen, always doing the check under the lock
would cost way more than a superfluous timer soft interrupt or a possible
one jiffie delay of the timer.

Document and annotate this non-critical behavior in the code by using a
READ_ONCE()/WRITE_ONCE() pair when accessing timer_base->next_expiry.

Reported-by: syzbot+bf285f...@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/00000000000091...@google.com
Signed-off-by: Anna-Maria Behnsen <anna-...@linutronix.de>
Reviewed-by: Frederic Weisbecker <fred...@kernel.org>
---
kernel/time/timer.c | 41 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 36 insertions(+), 5 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d8eb368a5a5b..c74d78aa5543 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -672,7 +672,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
* Set the next expiry time and kick the CPU so it
* can reevaluate the wheel:
*/
- base->next_expiry = bucket_expiry;
+ WRITE_ONCE(base->next_expiry, bucket_expiry);
base->timers_pending = true;
base->next_expiry_recalc = false;
trigger_dyntick_cpu(base, timer);
@@ -1966,7 +1966,7 @@ static void next_expiry_recalc(struct timer_base *base)
clk += adj;
}

- base->next_expiry = next;
+ WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}
@@ -2020,7 +2020,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- base->next_expiry = basej + NEXT_TIMER_MAX_DELTA;
+ WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);

return base->next_expiry;
}
@@ -2464,8 +2464,39 @@ static void run_local_timers(void)
hrtimer_run_queues();

for (int i = 0; i < NR_BASES; i++, base++) {
- /* Raise the softirq only if required. */
- if (time_after_eq(jiffies, base->next_expiry) ||
+ /*
+ * Raise the softirq only if required.
+ *
+ * timer_base::next_expiry can be written by a remote CPU while
+ * holding the lock. If this write happens at the same time as
+ * the lockless local read, the sanity checker could complain
+ * about data corruption.
+ *
+ * There are two possible situations where
+ * timer_base::next_expiry is written by a remote CPU:
+ *
+ * 1. Remote CPU expires global timers of this CPU and updates
+ * timer_base::next_expiry of BASE_GLOBAL afterwards in
+ * next_timer_interrupt() or timer_recalc_next_expiry(). The
+ * worst outcome is a superfluous raise of the timer softirq
+ * when the not yet updated value is read.
+ *
+ * 2. A new first pinned timer is enqueued by a remote CPU and
+ * therefore timer_base::next_expiry of BASE_LOCAL is