[PATCH v2 3/5] riscv: mm: Rename new_vmalloc into new_valid_map_cpus

0 views
Skip to first unread message

Vivian Wang

unread,
Mar 3, 2026, 12:30:51 AMMar 3
to Paul Walmsley, Palmer Dabbelt, Alexandre Ghiti, Alexander Potapenko, Marco Elver, Dmitry Vyukov, Yunhui Cui, linux...@lists.infradead.org, linux-...@vger.kernel.org, kasa...@googlegroups.com, Palmer Dabbelt, Vivian Wang
Since this mechanism is now used for the kfence pool, which comes from
the linear mapping and not vmalloc, rename new_vmalloc into
new_valid_map_cpus to avoid misleading readers.

No functional change intended.

Signed-off-by: Vivian Wang <wangr...@iscas.ac.cn>
---
arch/riscv/include/asm/cacheflush.h | 6 +++---
arch/riscv/kernel/entry.S | 38 ++++++++++++++++++-------------------
arch/riscv/mm/init.c | 2 +-
3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index b1a2ac665792..8c7a0ef2635a 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -41,7 +41,7 @@ do { \
} while (0)

#ifdef CONFIG_64BIT
-extern u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+extern u64 new_valid_map_cpus[NR_CPUS / sizeof(u64) + 1];
extern char _end[];
static inline void mark_new_valid_map(void)
{
@@ -52,8 +52,8 @@ static inline void mark_new_valid_map(void)
* the only place this can happen is in handle_exception() where
* an sfence.vma is emitted.
*/
- for (i = 0; i < ARRAY_SIZE(new_vmalloc); ++i)
- new_vmalloc[i] = -1ULL;
+ for (i = 0; i < ARRAY_SIZE(new_valid_map_cpus); ++i)
+ new_valid_map_cpus[i] = -1ULL;
}
#define flush_cache_vmap flush_cache_vmap
static inline void flush_cache_vmap(unsigned long start, unsigned long end)
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index ced7a2b160ce..9c6acfd09141 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -20,44 +20,44 @@

.section .irqentry.text, "ax"

-.macro new_vmalloc_check
+.macro new_valid_map_cpus_check
REG_S a0, TASK_TI_A0(tp)
csrr a0, CSR_CAUSE
/* Exclude IRQs */
- blt a0, zero, .Lnew_vmalloc_restore_context_a0
+ blt a0, zero, .Lnew_valid_map_cpus_restore_context_a0

REG_S a1, TASK_TI_A1(tp)
- /* Only check new_vmalloc if we are in page/protection fault */
+ /* Only check new_valid_map_cpus if we are in page/protection fault */
li a1, EXC_LOAD_PAGE_FAULT
- beq a0, a1, .Lnew_vmalloc_kernel_address
+ beq a0, a1, .Lnew_valid_map_cpus_kernel_address
li a1, EXC_STORE_PAGE_FAULT
- beq a0, a1, .Lnew_vmalloc_kernel_address
+ beq a0, a1, .Lnew_valid_map_cpus_kernel_address
li a1, EXC_INST_PAGE_FAULT
- bne a0, a1, .Lnew_vmalloc_restore_context_a1
+ bne a0, a1, .Lnew_valid_map_cpus_restore_context_a1

-.Lnew_vmalloc_kernel_address:
+.Lnew_valid_map_cpus_kernel_address:
/* Is it a kernel address? */
csrr a0, CSR_TVAL
- bge a0, zero, .Lnew_vmalloc_restore_context_a1
+ bge a0, zero, .Lnew_valid_map_cpus_restore_context_a1

/* Check if a new vmalloc mapping appeared that could explain the trap */
REG_S a2, TASK_TI_A2(tp)
/*
* Computes:
- * a0 = &new_vmalloc[BIT_WORD(cpu)]
+ * a0 = &new_valid_map_cpus[BIT_WORD(cpu)]
* a1 = BIT_MASK(cpu)
*/
lw a2, TASK_TI_CPU(tp)
/*
- * Compute the new_vmalloc element position:
+ * Compute the new_valid_map_cpus element position:
* (cpu / 64) * 8 = (cpu >> 6) << 3
*/
srli a1, a2, 6
slli a1, a1, 3
- la a0, new_vmalloc
+ la a0, new_valid_map_cpus
add a0, a0, a1
/*
- * Compute the bit position in the new_vmalloc element:
+ * Compute the bit position in the new_valid_map_cpus element:
* bit_pos = cpu % 64 = cpu - (cpu / 64) * 64 = cpu - (cpu >> 6) << 6
* = cpu - ((cpu >> 6) << 3) << 3
*/
@@ -67,12 +67,12 @@
li a2, 1
sll a1, a2, a1

- /* Check the value of new_vmalloc for this cpu */
+ /* Check the value of new_valid_map_cpus for this cpu */
REG_L a2, 0(a0)
and a2, a2, a1
- beq a2, zero, .Lnew_vmalloc_restore_context
+ beq a2, zero, .Lnew_valid_map_cpus_restore_context

- /* Atomically reset the current cpu bit in new_vmalloc */
+ /* Atomically reset the current cpu bit in new_valid_map_cpus */
amoxor.d a0, a1, (a0)

/* Only emit a sfence.vma if the uarch caches invalid entries */
@@ -84,11 +84,11 @@
csrw CSR_SCRATCH, x0
sret

-.Lnew_vmalloc_restore_context:
+.Lnew_valid_map_cpus_restore_context:
REG_L a2, TASK_TI_A2(tp)
-.Lnew_vmalloc_restore_context_a1:
+.Lnew_valid_map_cpus_restore_context_a1:
REG_L a1, TASK_TI_A1(tp)
-.Lnew_vmalloc_restore_context_a0:
+.Lnew_valid_map_cpus_restore_context_a0:
REG_L a0, TASK_TI_A0(tp)
.endm

@@ -146,7 +146,7 @@ SYM_CODE_START(handle_exception)
* could "miss" the new mapping and traps: in that case, we only need
* to retry the access, no sfence.vma is required.
*/
- new_vmalloc_check
+ new_valid_map_cpus_check
#endif

REG_S sp, TASK_TI_KERNEL_SP(tp)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 811e03786c56..9922c22a2a5f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -37,7 +37,7 @@

#include "../kernel/head.h"

-u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+u64 new_valid_map_cpus[NR_CPUS / sizeof(u64) + 1];

struct kernel_mapping kernel_map __ro_after_init;
EXPORT_SYMBOL(kernel_map);

--
2.53.0

Vivian Wang

unread,
Mar 3, 2026, 12:30:51 AMMar 3
to Paul Walmsley, Palmer Dabbelt, Alexandre Ghiti, Alexander Potapenko, Marco Elver, Dmitry Vyukov, Yunhui Cui, linux...@lists.infradead.org, linux-...@vger.kernel.org, kasa...@googlegroups.com, Palmer Dabbelt, sta...@vger.kernel.org, Vivian Wang, Yanko Kaneti
kfence_unprotect() on RISC-V doesn't flush TLBs, because we can't send
IPIs in some contexts where kfence objects are allocated. This leads to
spurious faults and kfence false positives.

Avoid these spurious faults using the same "new_vmalloc" mechanism,
which I have renamed to new_valid_map_cpus to avoid confusion, since the
kfence pool comes from the linear mapping, not vmalloc.

Commit b3431a8bb336 ("riscv: Fix IPIs usage in kfence_protect_page()")
only seemed to consider false negatives, which are indeed tolerable.
False positives on the other hand are not okay since they waste
developer time (or just my time somehow?) and spam kmsg making
diagnosing other problems difficult.

Patch 2 is the implementation to poke (what was called) new_vmalloc upon
kfence_unprotect(). Patch 1 is some refactoring that patch 2 depends on.
Patch 3 through 5 are some additional refactoring and minor fixes.

How this was found
------------------

This came up after a user reported some nonsensical kfence
use-after-free reports relating to k1_emac on SpacemiT K1, like this:

[ 64.160199] ==================================================================
[ 64.164773] BUG: KFENCE: use-after-free read in sk_skb_reason_drop+0x22/0x1e8
[ 64.164773]
[ 64.173365] Use-after-free read at 0xffffffd77fecc0cc (in kfence-#101):
[ 64.179962] sk_skb_reason_drop+0x22/0x1e8
[ 64.179972] dev_kfree_skb_any_reason+0x32/0x3c

[...]

[ 64.181440] kfence-#101: 0xffffffd77fecc000-0xffffffd77fecc0cf, size=208, cache=skbuff_head_cache
[ 64.181440]
[ 64.181450] allocated by task 142 on cpu 1 at 63.665866s (0.515583s ago):
[ 64.181476] __alloc_skb+0x66/0x244
[ 64.181484] alloc_skb_with_frags+0x3a/0x1ac

[...]

[ 64.182917] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 7.0.0-rc1-dirty #34 PREEMPTLAZY
[ 64.182926] Hardware name: Banana Pi BPI-F3 (DT)
[ 64.183111] ==================================================================

In particular, these supposed use-after-free accesses:

- Were never reported by KASAN despite being rather easy to reproduce
- Never contain a "freed by task" section
- Never happen on the same CPU as the "allocated by task" info
- And, most importantly, were not found to have been caused by the
object being freed by anyone at that point

An interesting corollary of this observation is that the SpacemiT X60
CPU *does* cache invalid PTEs, and for a significant amount of time, or
at least long enough to be observable in practice. Or maybe only in a
wfi, given how most of these reports I've seen had the faulting CPU in
an IRQ?

---
Changes in v2:
- Reordered patches 1 through 3 to minimize what needs to be backported
- (New patch 4) Change the bitmap to use DECLARE_BITMAP (Alexander)
- (New patch 5) Additional fix
- Link to v1: https://lore.kernel.org/r/20260302-handle-kfence-protect-...@iscas.ac.cn

---
Vivian Wang (5):
riscv: mm: Extract helper mark_new_valid_map()
riscv: kfence: Call mark_new_valid_map() for kfence_unprotect()
riscv: mm: Rename new_vmalloc into new_valid_map_cpus
riscv: mm: Use the bitmap API for new_valid_map_cpus
riscv: mm: Unconditionally sfence.vma for spurious fault

arch/riscv/include/asm/cacheflush.h | 25 +++++++++---------
arch/riscv/include/asm/kfence.h | 7 +++--
arch/riscv/kernel/entry.S | 51 ++++++++++++++++++++-----------------
arch/riscv/mm/init.c | 2 +-
4 files changed, 47 insertions(+), 38 deletions(-)
---
base-commit: 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f
change-id: 20260228-handle-kfence-protect-spurious-fault-62100afb9734

Best regards,
--
Vivian "dramforever" Wang

Vivian Wang

unread,
Mar 3, 2026, 12:30:52 AMMar 3
to Paul Walmsley, Palmer Dabbelt, Alexandre Ghiti, Alexander Potapenko, Marco Elver, Dmitry Vyukov, Yunhui Cui, linux...@lists.infradead.org, linux-...@vger.kernel.org, kasa...@googlegroups.com, Palmer Dabbelt, Vivian Wang
The bitmap was defined with incorrect size. Fix it by using the proper
bitmap API in C code. The corresponding assembly code is still okay and
remains unchanged.

Signed-off-by: Vivian Wang <wangr...@iscas.ac.cn>
---
arch/riscv/include/asm/cacheflush.h | 8 +++-----
arch/riscv/mm/init.c | 2 +-
2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 8c7a0ef2635a..8cfe59483a8f 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -41,19 +41,17 @@ do { \
} while (0)

#ifdef CONFIG_64BIT
-extern u64 new_valid_map_cpus[NR_CPUS / sizeof(u64) + 1];
+/* This is accessed in assembly code. cpumask_var_t would be too complex. */
+extern DECLARE_BITMAP(new_valid_map_cpus, NR_CPUS);
extern char _end[];
static inline void mark_new_valid_map(void)
{
- int i;
-
/*
* We don't care if concurrently a cpu resets this value since
* the only place this can happen is in handle_exception() where
* an sfence.vma is emitted.
*/
- for (i = 0; i < ARRAY_SIZE(new_valid_map_cpus); ++i)
- new_valid_map_cpus[i] = -1ULL;
+ bitmap_fill(new_valid_map_cpus, NR_CPUS);
}
#define flush_cache_vmap flush_cache_vmap
static inline void flush_cache_vmap(unsigned long start, unsigned long end)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 9922c22a2a5f..a2fc70f72269 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -37,7 +37,7 @@

#include "../kernel/head.h"

-u64 new_valid_map_cpus[NR_CPUS / sizeof(u64) + 1];
+DECLARE_BITMAP(new_valid_map_cpus, NR_CPUS);

Vivian Wang

unread,
Mar 3, 2026, 12:30:54 AMMar 3
to Paul Walmsley, Palmer Dabbelt, Alexandre Ghiti, Alexander Potapenko, Marco Elver, Dmitry Vyukov, Yunhui Cui, linux...@lists.infradead.org, linux-...@vger.kernel.org, kasa...@googlegroups.com, Palmer Dabbelt, sta...@vger.kernel.org, Vivian Wang
Svvptc does not guarantee that it's safe to just return here. Since we
have already cleared our bit, if, theoretically, the accessed page still
hasn't become valid within the bounded timeframe by the time we sret, we
could fault again and actually crash.

Hopefully, these spurious faults should be rare enough that this is an
acceptable slowdown.

Cc: <sta...@vger.kernel.org>
Fixes: 503638e0babf ("riscv: Stop emitting preventive sfence.vma for new vmalloc mappings")
Signed-off-by: Vivian Wang <wangr...@iscas.ac.cn>
---
arch/riscv/kernel/entry.S | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 9c6acfd09141..34717bd1fa91 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -75,8 +75,11 @@
/* Atomically reset the current cpu bit in new_valid_map_cpus */
amoxor.d a0, a1, (a0)

- /* Only emit a sfence.vma if the uarch caches invalid entries */
- ALTERNATIVE("sfence.vma", "nop", 0, RISCV_ISA_EXT_SVVPTC, 1)
+ /*
+ * A sfence.vma is required here. Even if we had Svvptc, there's no
+ * guarantee that after returning we wouldn't just fault again.
+ */
+ sfence.vma

REG_L a0, TASK_TI_A0(tp)
REG_L a1, TASK_TI_A1(tp)

--
2.53.0

Reply all
Reply to author
Forward
0 new messages