[PATCH] mm: kfence: allocate kfence_metadata at runtime

0 views
Skip to first unread message

Peng Zhang

unread,
Jul 9, 2023, 11:27:35 PM7/9/23
to gli...@google.com, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
kfence_metadata is currently a static array. For the purpose of
allocating scalable __kfence_pool, we first change it to runtime
allocation of metadata. Since the size of an object of kfence_metadata
is 1160 bytes, we can save at least 72 pages (with default 256 objects)
without enabling kfence.

Below is the numbers obtained in qemu (with default 256 objects).
before: Memory: 8134692K/8388080K available (3668K bss)
after: Memory: 8136740K/8388080K available (1620K bss)
More than expected, it saves 2MB memory.

Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
---
mm/kfence/core.c | 102 ++++++++++++++++++++++++++++++++-------------
mm/kfence/kfence.h | 5 ++-
2 files changed, 78 insertions(+), 29 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index dad3c0eb70a0..b9fec1c46e3d 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -116,7 +116,7 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
* backing pages (in __kfence_pool).
*/
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
-struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+struct kfence_metadata *kfence_metadata;

/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
@@ -643,13 +643,56 @@ static unsigned long kfence_init_pool(void)
return addr;
}

+static int kfence_alloc_metadata(void)
+{
+ unsigned long nr_pages = KFENCE_METADATA_SIZE / PAGE_SIZE;
+
+#ifdef CONFIG_CONTIG_ALLOC
+ struct page *pages;
+
+ pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node,
+ NULL);
+ if (pages)
+ kfence_metadata = page_to_virt(pages);
+#else
+ if (nr_pages > MAX_ORDER_NR_PAGES) {
+ pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
+ return -EINVAL;
+ }
+ kfence_metadata = alloc_pages_exact(KFENCE_METADATA_SIZE,
+ GFP_KERNEL);
+#endif
+
+ if (!kfence_metadata)
+ return -ENOMEM;
+
+ memset(kfence_metadata, 0, KFENCE_METADATA_SIZE);
+ return 0;
+}
+
+static void kfence_free_metadata(void)
+{
+ if (WARN_ON(!kfence_metadata))
+ return;
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata)),
+ KFENCE_METADATA_SIZE / PAGE_SIZE);
+#else
+ free_pages_exact((void *)kfence_metadata, KFENCE_METADATA_SIZE);
+#endif
+ kfence_metadata = NULL;
+}
+
static bool __init kfence_init_pool_early(void)
{
- unsigned long addr;
+ unsigned long addr = (unsigned long)__kfence_pool;

if (!__kfence_pool)
return false;

+ if (!kfence_alloc_metadata())
+ goto free_pool;
+
addr = kfence_init_pool();

if (!addr) {
@@ -663,6 +706,7 @@ static bool __init kfence_init_pool_early(void)
return true;
}

+ kfence_free_metadata();
/*
* Only release unprotected pages, and do not try to go back and change
* page attributes due to risk of failing to do so as well. If changing
@@ -670,31 +714,12 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
+free_pool:
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
}

-static bool kfence_init_pool_late(void)
-{
- unsigned long addr, free_size;
-
- addr = kfence_init_pool();
-
- if (!addr)
- return true;
-
- /* Same as above. */
- free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
-#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
-#else
- free_pages_exact((void *)addr, free_size);
-#endif
- __kfence_pool = NULL;
- return false;
-}
-
/* === DebugFS Interface ==================================================== */

static int stats_show(struct seq_file *seq, void *v)
@@ -896,6 +921,10 @@ void __init kfence_init(void)
static int kfence_init_late(void)
{
const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+ unsigned long addr = (unsigned long)__kfence_pool;
+ unsigned long free_size = KFENCE_POOL_SIZE;
+ int ret;
+
#ifdef CONFIG_CONTIG_ALLOC
struct page *pages;

@@ -913,15 +942,29 @@ static int kfence_init_late(void)
return -ENOMEM;
#endif

- if (!kfence_init_pool_late()) {
- pr_err("%s failed\n", __func__);
- return -EBUSY;
+ ret = kfence_alloc_metadata();
+ if (!ret)
+ goto free_pool;
+
+ addr = kfence_init_pool();
+ if (!addr) {
+ kfence_init_enable();
+ kfence_debugfs_init();
+ return 0;
}

- kfence_init_enable();
- kfence_debugfs_init();
+ pr_err("%s failed\n", __func__);
+ kfence_free_metadata();
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+ ret = -EBUSY;

- return 0;
+free_pool:
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)addr, free_size);
+#endif
+ return ret;
}

static int kfence_enable_late(void)
@@ -941,6 +984,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
struct kfence_metadata *meta;
int i;

+ if (!__kfence_pool)
+ return;
+
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
bool in_use;

diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 392fb273e7bd..f46fbb03062b 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -102,7 +102,10 @@ struct kfence_metadata {
#endif
};

-extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
+ CONFIG_KFENCE_NUM_OBJECTS)
+
+extern struct kfence_metadata *kfence_metadata;

static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
--
2.20.1

Marco Elver

unread,
Jul 10, 2023, 6:20:29 AM7/10/23
to Peng Zhang, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev
On Mon, 10 Jul 2023 at 05:27, 'Peng Zhang' via kasan-dev
<kasa...@googlegroups.com> wrote:
>
> kfence_metadata is currently a static array. For the purpose of
> allocating scalable __kfence_pool, we first change it to runtime
> allocation of metadata. Since the size of an object of kfence_metadata
> is 1160 bytes, we can save at least 72 pages (with default 256 objects)
> without enabling kfence.
>
> Below is the numbers obtained in qemu (with default 256 objects).
> before: Memory: 8134692K/8388080K available (3668K bss)
> after: Memory: 8136740K/8388080K available (1620K bss)
> More than expected, it saves 2MB memory.
>
> Signed-off-by: Peng Zhang <zhangp...@bytedance.com>

Seems like a reasonable optimization, but see comments below.

Also with this patch applied on top of v6.5-rc1, KFENCE just doesn't
init at all anymore (early init). Please fix.
Does this mean that KFENCE won't work at all if we can't allocate the
metadata? I.e. it won't work either in early nor late init modes?

I know we already have this limitation for _late init_ of the KFENCE pool.

So I have one major question: when doing _early init_, what is the
maximum size of the KFENCE pool (#objects) with this change?

> + return -EINVAL;
> + }
> + kfence_metadata = alloc_pages_exact(KFENCE_METADATA_SIZE,
> + GFP_KERNEL);
> +#endif
> +
> + if (!kfence_metadata)
> + return -ENOMEM;
> +
> + memset(kfence_metadata, 0, KFENCE_METADATA_SIZE);

memzero_explicit, or pass __GFP_ZERO to alloc_pages?
You moved this from kfence_init_pool_late - that did "__kfence_pool =
NULL" which is missing now.

Marco Elver

unread,
Jul 10, 2023, 6:21:38 AM7/10/23
to Peng Zhang, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev
On Mon, Jul 10, 2023 at 12:19PM +0200, Marco Elver wrote:
> On Mon, 10 Jul 2023 at 05:27, 'Peng Zhang' via kasan-dev
> <kasa...@googlegroups.com> wrote:
> >
> > kfence_metadata is currently a static array. For the purpose of
> > allocating scalable __kfence_pool, we first change it to runtime
> > allocation of metadata. Since the size of an object of kfence_metadata
> > is 1160 bytes, we can save at least 72 pages (with default 256 objects)
> > without enabling kfence.
> >
> > Below is the numbers obtained in qemu (with default 256 objects).
> > before: Memory: 8134692K/8388080K available (3668K bss)
> > after: Memory: 8136740K/8388080K available (1620K bss)
> > More than expected, it saves 2MB memory.
> >
> > Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
>
> Seems like a reasonable optimization, but see comments below.
>
> Also with this patch applied on top of v6.5-rc1, KFENCE just doesn't
> init at all anymore (early init). Please fix.

Forgot to attach .config -- attached config.

All I see is:

[ 0.303465] rcu: RCU calculated value of scheduler-enlistment delay is 100 jiffies.
[ 0.304783] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=8
[ 0.316800] NR_IRQS: 4352, nr_irqs: 488, preallocated irqs: 16
[ 0.318140] rcu: srcu_init: Setting srcu_struct sizes based on contention.
[ 0.320001] kfence: kfence_init failed
[ 0.326880] Console: colour VGA+ 80x25
[ 0.327585] printk: console [ttyS0] enabled
[ 0.327585] printk: console [ttyS0] enabled

around KFENCE initialization.
.config

Alexander Potapenko

unread,
Jul 10, 2023, 6:38:21 AM7/10/23
to Peng Zhang, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev
On Mon, Jul 10, 2023 at 5:27 AM 'Peng Zhang' via kasan-dev
<kasa...@googlegroups.com> wrote:
>
> kfence_metadata is currently a static array. For the purpose of
> allocating scalable __kfence_pool, we first change it to runtime
> allocation of metadata. Since the size of an object of kfence_metadata
> is 1160 bytes, we can save at least 72 pages (with default 256 objects)
> without enabling kfence.
>
> Below is the numbers obtained in qemu (with default 256 objects).
> before: Memory: 8134692K/8388080K available (3668K bss)
> after: Memory: 8136740K/8388080K available (1620K bss)
> More than expected, it saves 2MB memory.

Do you have an understanding of where these 2MB come from?
According to your calculations (which seem valid) the gain should be
290K, so either 2MB is irrelevant to your change (then these numbers
should be omitted), or there's some hidden cost that we do not know
about.

Peng Zhang

unread,
Jul 10, 2023, 6:52:34 AM7/10/23
to Marco Elver, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
Thanks for your review and testing, I'll take a look at the issues later.

Peng Zhang

unread,
Jul 10, 2023, 6:58:42 AM7/10/23
to Alexander Potapenko, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
I don't know why the 2MB memory was saved, but it looks like it has to
do with the .bss section, maybe removing this array affected the linker?

Peng Zhang

unread,
Jul 12, 2023, 4:16:38 AM7/12/23
to gli...@google.com, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
kfence_metadata is currently a static array. For the purpose of
allocating scalable __kfence_pool, we first change it to runtime
allocation of metadata. Since the size of an object of kfence_metadata
is 1160 bytes, we can save at least 72 pages (with default 256 objects)
without enabling kfence.

Below is the numbers obtained in qemu (with default 256 objects).
before: Memory: 8134692K/8388080K available (3668K bss)
after: Memory: 8136740K/8388080K available (1620K bss)
More than expected, it saves 2MB memory. It can be seen that the size
of the .bss section has changed, possibly because it affects the linker.

Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
---
Changes since v1:
- Fix a stupid problem of not being able to initialize kfence. The problem is
that I slightly modified the patch before sending it out, but it has not been
tested. I'm extremely sorry.
- Drop kfence_alloc_metadata() and kfence_free_metadata() because they are no
longer reused.
- Allocate metadata from memblock during early initialization. Fixed the issue
of allocating metadata size that cannot exceed the limit of the buddy system
during early initialization.
- Fix potential UAF in kfence_shutdown_cache().

v1: https://lore.kernel.org/lkml/20230710032714.26...@bytedance.com/

include/linux/kfence.h | 5 +-
mm/kfence/core.c | 124 ++++++++++++++++++++++++++++-------------
mm/kfence/kfence.h | 5 +-
mm/mm_init.c | 2 +-
4 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 726857a4b680..68e71562bfa7 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -59,9 +59,10 @@ static __always_inline bool is_kfence_address(const void *addr)
}

/**
- * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE
+ * metadata via memblock
*/
-void __init kfence_alloc_pool(void);
+void __init kfence_alloc_pool_and_metadata(void);

/**
* kfence_init() - perform KFENCE initialization at boot time
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index dad3c0eb70a0..ed0424950cf1 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -116,7 +116,16 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
* backing pages (in __kfence_pool).
*/
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
-struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+struct kfence_metadata *kfence_metadata;
+
+/*
+ * When kfence_metadata is not NULL, it may be that kfence is being initialized
+ * at this time, and it may be used by kfence_shutdown_cache() during
+ * initialization. If the initialization fails, kfence_metadata will be released,
+ * causing UAF. So it is necessary to add kfence_metadata_init for initialization,
+ * and kfence_metadata will be visible only when initialization is successful.
+ */
+static struct kfence_metadata *kfence_metadata_init;

/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
@@ -591,7 +600,7 @@ static unsigned long kfence_init_pool(void)

__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
MEMCG_DATA_OBJCGS;
#endif
}
@@ -610,7 +619,7 @@ static unsigned long kfence_init_pool(void)
}

for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
- struct kfence_metadata *meta = &kfence_metadata[i];
+ struct kfence_metadata *meta = &kfence_metadata_init[i];

/* Initialize metadata. */
INIT_LIST_HEAD(&meta->list);
@@ -626,6 +635,12 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}

+ /*
+ * Make kfence_metadata visible only when initialization is successful.
+ * Otherwise, if the initialization fails and kfence_metadata is
+ * freed, it may cause UAF in kfence_shutdown_cache().
+ */
+ kfence_metadata = kfence_metadata_init;
return 0;

reset_slab:
@@ -672,26 +687,10 @@ static bool __init kfence_init_pool_early(void)
*/
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
- return false;
-}
-
-static bool kfence_init_pool_late(void)
-{
- unsigned long addr, free_size;

- addr = kfence_init_pool();
-
- if (!addr)
- return true;
+ memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+ kfence_metadata_init = NULL;

- /* Same as above. */
- free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
-#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
-#else
- free_pages_exact((void *)addr, free_size);
-#endif
- __kfence_pool = NULL;
return false;
}

@@ -841,19 +840,30 @@ static void toggle_allocation_gate(struct work_struct *work)

/* === Public interface ===================================================== */

-void __init kfence_alloc_pool(void)
+void __init kfence_alloc_pool_and_metadata(void)
{
if (!kfence_sample_interval)
return;

- /* if the pool has already been initialized by arch, skip the below. */
- if (__kfence_pool)
- return;
-
- __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
-
+ /*
+ * If the pool has already been initialized by arch, there is no need to
+ * re-allocate the memory pool.
+ */
if (!__kfence_pool)
+ __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+ if (!__kfence_pool) {
pr_err("failed to allocate pool\n");
+ return;
+ }
+
+ /* The memory allocated by memblock has been zeroed out. */
+ kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE);
+ if (!kfence_metadata_init) {
+ pr_err("failed to allocate metadata\n");
+ memblock_free(__kfence_pool, KFENCE_POOL_SIZE);
+ __kfence_pool = NULL;
+ }
}

static void kfence_init_enable(void)
@@ -895,33 +905,68 @@ void __init kfence_init(void)

static int kfence_init_late(void)
{
- const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE;
+ const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE;
+ unsigned long addr = (unsigned long)__kfence_pool;
+ unsigned long free_size = KFENCE_POOL_SIZE;
+ int err = -ENOMEM;
+
#ifdef CONFIG_CONTIG_ALLOC
struct page *pages;
-
- pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
+ pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node,
+ NULL);
if (!pages)
return -ENOMEM;
+
__kfence_pool = page_to_virt(pages);
+ pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node,
+ NULL);
+ if (pages)
+ kfence_metadata_init = page_to_virt(pages);
#else
- if (nr_pages > MAX_ORDER_NR_PAGES) {
+ if (nr_pages_pool > MAX_ORDER_NR_PAGES ||
+ nr_pages_meta > MAX_ORDER_NR_PAGES) {
pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
return -EINVAL;
}
+
__kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
if (!__kfence_pool)
return -ENOMEM;
+
+ kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL);
#endif

- if (!kfence_init_pool_late()) {
- pr_err("%s failed\n", __func__);
- return -EBUSY;
+ if (!kfence_metadata_init)
+ goto free_pool;
+
+ memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE);
+ addr = kfence_init_pool();
+ if (!addr) {
+ kfence_init_enable();
+ kfence_debugfs_init();
+ return 0;
}

- kfence_init_enable();
- kfence_debugfs_init();
+ pr_err("%s failed\n", __func__);
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+ err = -EBUSY;

- return 0;
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)),
+ nr_pages_meta);
+free_pool:
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)),
+ free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE);
+free_pool:
+ free_pages_exact((void *)addr, free_size);
+#endif
+
+ kfence_metadata_init = NULL;
+ __kfence_pool = NULL;
+ return err;
}

static int kfence_enable_late(void)
@@ -941,6 +986,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
struct kfence_metadata *meta;
int i;

+ if (!kfence_metadata)
+ return;
+
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
bool in_use;

diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 392fb273e7bd..f46fbb03062b 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -102,7 +102,10 @@ struct kfence_metadata {
#endif
};

-extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
+ CONFIG_KFENCE_NUM_OBJECTS)
+
+extern struct kfence_metadata *kfence_metadata;

static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..86b26d013f4b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2778,7 +2778,7 @@ void __init mm_core_init(void)
*/
page_ext_init_flatmem();
mem_debugging_and_hardening_init();
- kfence_alloc_pool();
+ kfence_alloc_pool_and_metadata();
report_meminit();
kmsan_init_shadow();
stack_depot_early_init();
--
2.20.1

Peng Zhang

unread,
Jul 12, 2023, 4:28:40 AM7/12/23
to Marco Elver, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang


在 2023/7/10 18:19, Marco Elver 写道:
> On Mon, 10 Jul 2023 at 05:27, 'Peng Zhang' via kasan-dev
> <kasa...@googlegroups.com> wrote:
>>
>> kfence_metadata is currently a static array. For the purpose of
>> allocating scalable __kfence_pool, we first change it to runtime
>> allocation of metadata. Since the size of an object of kfence_metadata
>> is 1160 bytes, we can save at least 72 pages (with default 256 objects)
>> without enabling kfence.
>>
>> Below is the numbers obtained in qemu (with default 256 objects).
>> before: Memory: 8134692K/8388080K available (3668K bss)
>> after: Memory: 8136740K/8388080K available (1620K bss)
>> More than expected, it saves 2MB memory.
>>
>> Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
>
> Seems like a reasonable optimization, but see comments below.
>
> Also with this patch applied on top of v6.5-rc1, KFENCE just doesn't
> init at all anymore (early init). Please fix.
I'm very sorry because I made a slight modification before sending the
patch but it has not been tested, which caused it to not work properly.
I fixed some of the issues you mentioned in v2[1].

[1]
https://lore.kernel.org/lkml/20230712081616.45...@bytedance.com/
It will be limited to 2^10/sizeof(struct kfence_metadata) by buddy
system, so I used memblock to allocate kfence_metadata in v2.
>
>> + return -EINVAL;
>> + }
>> + kfence_metadata = alloc_pages_exact(KFENCE_METADATA_SIZE,
>> + GFP_KERNEL);
>> +#endif
>> +
>> + if (!kfence_metadata)
>> + return -ENOMEM;
>> +
>> + memset(kfence_metadata, 0, KFENCE_METADATA_SIZE);
>
> memzero_explicit, or pass __GFP_ZERO to alloc_pages?
Unfortunately, __GFP_ZERO does not work successfully in
alloc_contig_pages(), so I used memzero_explicit() in v2.
Even though I don't know if memzero_explicit() is necessary
(it just uses the barrier).
Thanks for spotting this, I added it in v2.

Peng Zhang

unread,
Jul 12, 2023, 4:37:17 AM7/12/23
to Peng Zhang, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Marco Elver
Sorry, 2^10*PAGE_SIZE/sizeof(struct kfence_metadata)

Alexander Potapenko

unread,
Jul 12, 2023, 5:30:49 AM7/12/23
to Peng Zhang, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev
> Below is the numbers obtained in qemu (with default 256 objects).
> before: Memory: 8134692K/8388080K available (3668K bss)
> after: Memory: 8136740K/8388080K available (1620K bss)
> More than expected, it saves 2MB memory. It can be seen that the size
> of the .bss section has changed, possibly because it affects the linker.

The size of .bss should only change by ~288K. Perhaps it has crossed
the alignment boundary for .bss, but this effect cannot be guaranteed
and does not depend exclusively on this patch.
I suggest that you omit these lines from the patch description, as
they may confuse the readers.

Peng Zhang

unread,
Jul 12, 2023, 5:36:27 AM7/12/23
to Alexander Potapenko, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
Ok, I'll revise it to avoid confusion.

Peng Zhang

unread,
Jul 14, 2023, 8:42:50 AM7/14/23
to gli...@google.com, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
Hi all,

Are there any other comments here?
Welcome any comments.

Thanks,
Peng

Marco Elver

unread,
Jul 17, 2023, 6:08:39 AM7/17/23
to Peng Zhang, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev
You've renamed this, but not the stub later in the file. So this
currently breaks with !CONFIG_KFENCE.

Also, there's a reference in comments to kfence_alloc_pool(), please
update as well.

> /**
> * kfence_init() - perform KFENCE initialization at boot time
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index dad3c0eb70a0..ed0424950cf1 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -116,7 +116,16 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> * backing pages (in __kfence_pool).
> */
> static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
> -struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> +struct kfence_metadata *kfence_metadata;

Add __read_mostly, like for __kfence_pool.

> +/*
> + * When kfence_metadata is not NULL, it may be that kfence is being initialized
> + * at this time, and it may be used by kfence_shutdown_cache() during
> + * initialization. If the initialization fails, kfence_metadata will be released,
> + * causing UAF. So it is necessary to add kfence_metadata_init for initialization,
> + * and kfence_metadata will be visible only when initialization is successful.
> + */
> +static struct kfence_metadata *kfence_metadata_init;

Also add __read_mostly.

> /* Freelist with available objects. */
> static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
> @@ -591,7 +600,7 @@ static unsigned long kfence_init_pool(void)
>
> __folio_set_slab(slab_folio(slab));
> #ifdef CONFIG_MEMCG
> - slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
> + slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
> MEMCG_DATA_OBJCGS;
> #endif
> }
> @@ -610,7 +619,7 @@ static unsigned long kfence_init_pool(void)
> }
>
> for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
> - struct kfence_metadata *meta = &kfence_metadata[i];
> + struct kfence_metadata *meta = &kfence_metadata_init[i];
>
> /* Initialize metadata. */
> INIT_LIST_HEAD(&meta->list);
> @@ -626,6 +635,12 @@ static unsigned long kfence_init_pool(void)
> addr += 2 * PAGE_SIZE;
> }
>
> + /*
> + * Make kfence_metadata visible only when initialization is successful.
> + * Otherwise, if the initialization fails and kfence_metadata is
> + * freed, it may cause UAF in kfence_shutdown_cache().
> + */
> + kfence_metadata = kfence_metadata_init;

May cause _concurrent_ UAF, right? I assume so, at least with late init.

So in that case, this should be smp_store_release().
And this should be smp_load_acquire(&kfence_metadata), so that all the
metadata initialization is actually seen by concurrent
kfence_shutdown_cache / kfence_init_pool.

And add a comment something like:

/* Pairs with release in kfence_init_pool(). */
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20230712081616.45177-1-zhangpeng.00%40bytedance.com.

Peng Zhang

unread,
Jul 18, 2023, 3:30:32 AM7/18/23
to gli...@google.com, el...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
kfence_metadata is currently a static array. For the purpose of allocating
scalable __kfence_pool, we first change it to runtime allocation of
metadata. Since the size of an object of kfence_metadata is 1160 bytes, we
can save at least 72 pages (with default 256 objects) without enabling
kfence.

Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
---
Changes since v2:
- Fix missing renaming of kfence_alloc_pool.
- Add __read_mostly for kfence_metadata and kfence_metadata_init.
- Use smp_store_release() and smp_load_acquire() to access kfence_metadata.
- Some tweaks to comments and git log.

v1: https://lore.kernel.org/lkml/20230710032714.26...@bytedance.com/
v2: https://lore.kernel.org/lkml/20230712081616.45...@bytedance.com/

include/linux/kfence.h | 11 ++--
mm/kfence/core.c | 124 ++++++++++++++++++++++++++++-------------
mm/kfence/kfence.h | 5 +-
mm/mm_init.c | 2 +-
4 files changed, 97 insertions(+), 45 deletions(-)

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index 726857a4b680..401af4757514 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr)
}

/**
- * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE
+ * metadata via memblock
*/
-void __init kfence_alloc_pool(void);
+void __init kfence_alloc_pool_and_metadata(void);

/**
* kfence_init() - perform KFENCE initialization at boot time
*
- * Requires that kfence_alloc_pool() was called before. This sets up the
- * allocation gate timer, and requires that workqueues are available.
+ * Requires that kfence_alloc_pool_and_metadata() was called before. This sets
+ * up the allocation gate timer, and requires that workqueues are available.
*/
void __init kfence_init(void);

@@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
#else /* CONFIG_KFENCE */

static inline bool is_kfence_address(const void *addr) { return false; }
-static inline void kfence_alloc_pool(void) { }
+static inline void kfence_alloc_pool_and_metadata(void) { }
static inline void kfence_init(void) { }
static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index dad3c0eb70a0..6b526435886c 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -116,7 +116,15 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
* backing pages (in __kfence_pool).
*/
static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
-struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+struct kfence_metadata *kfence_metadata __read_mostly;
+
+/*
+ * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache().
+ * So introduce kfence_metadata_init to initialize metadata, and then make
+ * kfence_metadata visible after initialization is successful. This prevents
+ * potential UAF or access to uninitialized metadata.
+ */
+static struct kfence_metadata *kfence_metadata_init __read_mostly;

/* Freelist with available objects. */
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
@@ -591,7 +599,7 @@ static unsigned long kfence_init_pool(void)

__folio_set_slab(slab_folio(slab));
#ifdef CONFIG_MEMCG
- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
+ slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
MEMCG_DATA_OBJCGS;
#endif
}
@@ -610,7 +618,7 @@ static unsigned long kfence_init_pool(void)
}

for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
- struct kfence_metadata *meta = &kfence_metadata[i];
+ struct kfence_metadata *meta = &kfence_metadata_init[i];

/* Initialize metadata. */
INIT_LIST_HEAD(&meta->list);
@@ -626,6 +634,12 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}

+ /*
+ * Make kfence_metadata visible only when initialization is successful.
+ * Otherwise, if the initialization fails and kfence_metadata is freed,
+ * it may cause UAF in kfence_shutdown_cache().
+ */
+ smp_store_release(&kfence_metadata, kfence_metadata_init);
return 0;

reset_slab:
@@ -672,26 +686,10 @@ static bool __init kfence_init_pool_early(void)
*/
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
- return false;
-}
-
-static bool kfence_init_pool_late(void)
-{
- unsigned long addr, free_size;

- addr = kfence_init_pool();
-
- if (!addr)
- return true;
+ memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE);
+ kfence_metadata_init = NULL;

- /* Same as above. */
- free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
-#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
-#else
- free_pages_exact((void *)addr, free_size);
-#endif
- __kfence_pool = NULL;
return false;
}

@@ -841,19 +839,30 @@ static void toggle_allocation_gate(struct work_struct *work)
@@ -895,33 +904,68 @@ void __init kfence_init(void)
@@ -941,6 +985,10 @@ void kfence_shutdown_cache(struct kmem_cache *s)
struct kfence_metadata *meta;
int i;

+ /* Pairs with release in kfence_init_pool(). */
+ if (!smp_load_acquire(&kfence_metadata))
+ return;
+
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
bool in_use;

diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index 392fb273e7bd..f46fbb03062b 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -102,7 +102,10 @@ struct kfence_metadata {
#endif
};

-extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \
+ CONFIG_KFENCE_NUM_OBJECTS)
+
+extern struct kfence_metadata *kfence_metadata;

static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
{
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7f7f9c677854..3d0a63c75829 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2721,7 +2721,7 @@ void __init mm_core_init(void)

Peng Zhang

unread,
Jul 18, 2023, 3:35:08 AM7/18/23
to Marco Elver, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Peng Zhang
Thank you for your review, I have updated v3[1] with all
the changes you mentioned.

[1]
https://lore.kernel.org/lkml/20230718073019.52...@bytedance.com/

Thanks,
Peng

Marco Elver

unread,
Jul 18, 2023, 8:40:20 AM7/18/23
to Peng Zhang, gli...@google.com, dvy...@google.com, ak...@linux-foundation.org, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Kefeng Wang
On Tue, 18 Jul 2023 at 09:30, Peng Zhang <zhangp...@bytedance.com> wrote:
>
> kfence_metadata is currently a static array. For the purpose of allocating
> scalable __kfence_pool, we first change it to runtime allocation of
> metadata. Since the size of an object of kfence_metadata is 1160 bytes, we
> can save at least 72 pages (with default 256 objects) without enabling
> kfence.
>
> Signed-off-by: Peng Zhang <zhangp...@bytedance.com>

This looks good (minor nit below).

Reviewed-by: Marco Elver <el...@google.com>

Thanks!
Unnecessary blank line removal (it looks worse now).

Peng Zhang

unread,
Jul 18, 2023, 9:38:28 AM7/18/23
to Marco Elver, ak...@linux-foundation.org, gli...@google.com, dvy...@google.com, kasa...@googlegroups.com, linu...@kvack.org, linux-...@vger.kernel.org, muchu...@linux.dev, Kefeng Wang, Peng Zhang


在 2023/7/18 20:39, Marco Elver 写道:
> On Tue, 18 Jul 2023 at 09:30, Peng Zhang <zhangp...@bytedance.com> wrote:
>>
>> kfence_metadata is currently a static array. For the purpose of allocating
>> scalable __kfence_pool, we first change it to runtime allocation of
>> metadata. Since the size of an object of kfence_metadata is 1160 bytes, we
>> can save at least 72 pages (with default 256 objects) without enabling
>> kfence.
>>
>> Signed-off-by: Peng Zhang <zhangp...@bytedance.com>
>
> This looks good (minor nit below).
Andrew, if there is no need to update, can you help to add the
deleted blank line below? Thanks.
>
> Reviewed-by: Marco Elver <el...@google.com>
Marco, Thank you for your review!
Reply all
Reply to author
Forward
0 new messages