[PATCH] [2/4] SLAB: Set up the l3 lists for the memory of freshly added memory

Andi Kleen

unread,

Feb 3, 2010, 4:40:03 PM2/3/10

to

So kmalloc_node() works even if no CPU is up yet on the new node.

Requires previous refactor patch.

Signed-off-by: Andi Kleen <a...@linux.intel.com>

---
mm/slab.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c
@@ -115,6 +115,7 @@
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
+#include <linux/memory.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -1560,6 +1561,20 @@ void __init kmem_cache_init(void)
g_cpucache_up = EARLY;
}

+static int slab_memory_callback(struct notifier_block *self,
+ unsigned long action, void *arg)
+{
+ struct memory_notify *mn = (struct memory_notify *)arg;
+
+ /*
+ * When a node goes online allocate l3s early. This way
+ * kmalloc_node() works for it.
+ */
+ if (action == MEM_ONLINE && mn->status_change_nid >= 0)
+ slab_node_prepare(mn->status_change_nid);
+ return NOTIFY_OK;
+}
+
void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
@@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
*/
register_cpu_notifier(&cpucache_notifier);

+ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+
/*
* The reap timers are started later, with a module init call: That part
* of the kernel is not yet operational.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Andi Kleen

unread,

Feb 3, 2010, 4:40:02 PM2/3/10

to

No functional changes.

Needed for next patch.

Signed-off-by: Andi Kleen <a...@linux.intel.com>

---
mm/slab.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)

Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c

@@ -1171,19 +1171,9 @@ free_array_cache:
}
}

-static int __cpuinit cpuup_prepare(long cpu)
+static int slab_node_prepare(int node)
{
struct kmem_cache *cachep;
- struct kmem_list3 *l3 = NULL;
- int node = cpu_to_node(cpu);
- const int memsize = sizeof(struct kmem_list3);
-
- /*
- * We need to do this right in the beginning since
- * alloc_arraycache's are going to use this list.
- * kmalloc_node allows us to add the slab to the right
- * kmem_list3 and not this cpu's kmem_list3
- */

list_for_each_entry(cachep, &cache_chain, next) {
/*
@@ -1192,9 +1182,10 @@ static int __cpuinit cpuup_prepare(long
* node has not already allocated this
*/
if (!cachep->nodelists[node]) {
- l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+ struct kmem_list3 *l3;
+ l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
if (!l3)
- goto bad;
+ return -1;
kmem_list3_init(l3);
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
@@ -1213,6 +1204,23 @@ static int __cpuinit cpuup_prepare(long
cachep->batchcount + cachep->num;
spin_unlock_irq(&cachep->nodelists[node]->list_lock);
}
+ return 0;
+}
+
+static int __cpuinit cpuup_prepare(long cpu)
+{
+ struct kmem_cache *cachep;
+ struct kmem_list3 *l3 = NULL;
+ int node = cpu_to_node(cpu);
+
+ /*
+ * We need to do this right in the beginning since
+ * alloc_arraycache's are going to use this list.
+ * kmalloc_node allows us to add the slab to the right
+ * kmem_list3 and not this cpu's kmem_list3
+ */
+ if (slab_node_prepare(node) < 0)
+ goto bad;

/*
* Now we can go ahead with allocating the shared arrays and

Andi Kleen

unread,

Feb 3, 2010, 4:40:04 PM2/3/10

to

When fallback_alloc() runs the node of the CPU might not be initialized yet.
Handle this case by allocating in another node.

Signed-off-by: Andi Kleen <a...@linux.intel.com>

---
mm/slab.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)

Index: linux-2.6.33-rc3-ak/mm/slab.c
===================================================================
--- linux-2.6.33-rc3-ak.orig/mm/slab.c
+++ linux-2.6.33-rc3-ak/mm/slab.c

@@ -3210,7 +3210,24 @@ retry:
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());
+
+ /*
+ * Node not set up yet? Try one that the cache has been set up
+ * for.
+ */
+ nid = numa_node_id();
+ if (cache->nodelists[nid] == NULL) {
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ nid = zone_to_nid(zone);
+ if (cache->nodelists[nid])
+ break;
+ }
+ if (!cache->nodelists[nid])
+ return NULL;
+ }
+
+
+ obj = kmem_getpages(cache, local_flags, nid);
if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {

Christoph Lameter

unread,

Feb 5, 2010, 2:20:02 PM2/5/10

to

Out of sequence. Last patch already uses a function introduced by this
one.

Acked-by: Christoph Lameter <c...@linux-foundation.org>

Christoph Lameter

unread,

Feb 5, 2010, 2:20:03 PM2/5/10

to

On Wed, 3 Feb 2010, Andi Kleen wrote:

> Requires previous refactor patch.

Not in this series?

> + if (action == MEM_ONLINE && mn->status_change_nid >= 0)
> + slab_node_prepare(mn->status_change_nid);
> + return NOTIFY_OK;

Never seen a slab_node_prepare function before.

David Rientjes

unread,

Feb 5, 2010, 4:10:02 PM2/5/10

to

On Wed, 3 Feb 2010, Andi Kleen wrote:

> When fallback_alloc() runs the node of the CPU might not be initialized yet.
> Handle this case by allocating in another node.
>

That other node must be allowed by current's cpuset, otherwise
kmem_getpages() will fail when get_page_from_freelist() iterates only over
unallowed nodes.

> Signed-off-by: Andi Kleen <a...@linux.intel.com>
>
> ---
> mm/slab.c | 19 ++++++++++++++++++-
> 1 file changed, 18 insertions(+), 1 deletion(-)
>
> Index: linux-2.6.33-rc3-ak/mm/slab.c
> ===================================================================
> --- linux-2.6.33-rc3-ak.orig/mm/slab.c
> +++ linux-2.6.33-rc3-ak/mm/slab.c
> @@ -3210,7 +3210,24 @@ retry:
> if (local_flags & __GFP_WAIT)
> local_irq_enable();
> kmem_flagcheck(cache, flags);
> - obj = kmem_getpages(cache, local_flags, numa_node_id());
> +
> + /*
> + * Node not set up yet? Try one that the cache has been set up
> + * for.
> + */
> + nid = numa_node_id();
> + if (cache->nodelists[nid] == NULL) {
> + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> + nid = zone_to_nid(zone);
> + if (cache->nodelists[nid])
> + break;

If you set a bit in a nodemask_t everytime ____cache_alloc_node() fails in
the previous for_each_zone_zonelist() iteration, you could just iterate
that nodemask here without duplicating the zone_to_nid() and
cache->nodelists[nid] != NULL check.

nid = numa_node_id();

if (!cache->nodelists[nid])

for_each_node_mask(nid, allowed_nodes) {

obj = kmem_getpages(cache, local_flags, nid);

if (obj)
break;
}
else

obj = kmem_getpages(cache, local_flags, nid);

This way you can try all allowed nodes for memory instead of just one when
cache->nodelists[numa_node_id()] == NULL.

David Rientjes

unread,

Feb 5, 2010, 4:20:02 PM2/5/10

to

On Wed, 3 Feb 2010, Andi Kleen wrote:

> Index: linux-2.6.33-rc3-ak/mm/slab.c
> ===================================================================
> --- linux-2.6.33-rc3-ak.orig/mm/slab.c
> +++ linux-2.6.33-rc3-ak/mm/slab.c
> @@ -115,6 +115,7 @@
> #include <linux/reciprocal_div.h>
> #include <linux/debugobjects.h>
> #include <linux/kmemcheck.h>
> +#include <linux/memory.h>
>
> #include <asm/cacheflush.h>
> #include <asm/tlbflush.h>
> @@ -1560,6 +1561,20 @@ void __init kmem_cache_init(void)
> g_cpucache_up = EARLY;
> }
>
> +static int slab_memory_callback(struct notifier_block *self,
> + unsigned long action, void *arg)
> +{
> + struct memory_notify *mn = (struct memory_notify *)arg;

No cast necessary.

> +
> + /*
> + * When a node goes online allocate l3s early. This way
> + * kmalloc_node() works for it.
> + */
> + if (action == MEM_ONLINE && mn->status_change_nid >= 0)
> + slab_node_prepare(mn->status_change_nid);
> + return NOTIFY_OK;
> +}
> +
> void __init kmem_cache_init_late(void)
> {
> struct kmem_cache *cachep;
> @@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
> */
> register_cpu_notifier(&cpucache_notifier);
>
> + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

Only needed for CONFIG_NUMA.

David Rientjes

unread,

Feb 5, 2010, 4:30:01 PM2/5/10

to

On Wed, 3 Feb 2010, Andi Kleen wrote:

As Christoph mentioned, this patch is out of order with the previous one
in the series; slab_node_prepare() is called in that previous patch by a
memory hotplug callback without holding cache_chain_mutex (it's taken by
the cpu hotplug callback prior to calling cpuup_prepare() currently). So
slab_node_prepare() should note that we require the mutex and the memory
hotplug callback should take it in the previous patch.

Andi Kleen

unread,

Feb 6, 2010, 2:30:02 AM2/6/10

to

On Fri, Feb 05, 2010 at 01:17:56PM -0800, David Rientjes wrote:
> > +static int slab_memory_callback(struct notifier_block *self,
> > + unsigned long action, void *arg)
> > +{
> > + struct memory_notify *mn = (struct memory_notify *)arg;
>
> No cast necessary.

It's standard practice to cast void *.

> > void __init kmem_cache_init_late(void)
> > {
> > struct kmem_cache *cachep;
> > @@ -1583,6 +1598,8 @@ void __init kmem_cache_init_late(void)
> > */
> > register_cpu_notifier(&cpucache_notifier);
> >
> > + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
>
> Only needed for CONFIG_NUMA.

Ok.

-Andi

--
a...@linux.intel.com -- Speaking for myself only.

Andi Kleen

unread,

Feb 6, 2010, 2:30:02 AM2/6/10

to

On Fri, Feb 05, 2010 at 01:06:56PM -0800, David Rientjes wrote:
> On Wed, 3 Feb 2010, Andi Kleen wrote:
>
> > When fallback_alloc() runs the node of the CPU might not be initialized yet.
> > Handle this case by allocating in another node.
> >
>
> That other node must be allowed by current's cpuset, otherwise
> kmem_getpages() will fail when get_page_from_freelist() iterates only over
> unallowed nodes.

All theses cases are really only interesting in the memory hotplug path
itself (afterwards the slab is working anyways and memory is there)
and if someone sets funny cpusets for those he gets what he deserves ...

-Andi

Andi Kleen

unread,

Feb 6, 2010, 2:30:03 AM2/6/10

to

> As Christoph mentioned, this patch is out of order with the previous one

Ok.

> in the series; slab_node_prepare() is called in that previous patch by a
> memory hotplug callback without holding cache_chain_mutex (it's taken by
> the cpu hotplug callback prior to calling cpuup_prepare() currently). So
> slab_node_prepare() should note that we require the mutex and the memory
> hotplug callback should take it in the previous patch.

AFAIK the code is correct. If you feel the need for additional
documentation feel free to send patches yourself.

-Andi

David Rientjes

unread,

Feb 6, 2010, 4:50:01 AM2/6/10

to

On Sat, 6 Feb 2010, Andi Kleen wrote:

> > > +static int slab_memory_callback(struct notifier_block *self,
> > > + unsigned long action, void *arg)
> > > +{
> > > + struct memory_notify *mn = (struct memory_notify *)arg;
> >
> > No cast necessary.
>
> It's standard practice to cast void *.
>

$ grep -r "struct memory_notify.*=" *
arch/powerpc/platforms/pseries/cmm.c: struct memory_notify *marg = arg;
drivers/base/node.c: struct memory_notify *mnb = arg;
drivers/net/ehea/ehea_main.c: struct memory_notify *arg = data;
mm/ksm.c: struct memory_notify *mn = arg;
mm/slub.c: struct memory_notify *marg = arg;
mm/slub.c: struct memory_notify *marg = arg;
mm/page_cgroup.c: struct memory_notify *mn = arg;

David Rientjes

unread,

Feb 6, 2010, 5:00:01 AM2/6/10

to

On Sat, 6 Feb 2010, Andi Kleen wrote:

> > That other node must be allowed by current's cpuset, otherwise
> > kmem_getpages() will fail when get_page_from_freelist() iterates only over
> > unallowed nodes.
>
> All theses cases are really only interesting in the memory hotplug path
> itself (afterwards the slab is working anyways and memory is there)
> and if someone sets funny cpusets for those he gets what he deserves ...
>

If a hot-added node has not been initialized for the cache, your code is
picking an existing one in zonelist order which may be excluded by
current's cpuset. Thus, your code has a very real chance of having
kmem_getpages() return NULL because get_page_from_freelist() will reject
non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
scenario that requires a "funny cpuset," it just has to not allow whatever
initialized node comes first in the zonelist.

My suggested alternative does not pick a single initialized node, rather
it tries all nodes that actually have a chance of having kmem_getpages()
succeed which increases the probability that your patch actually has an
effect for cpuset users.

David Rientjes

unread,

Feb 6, 2010, 5:00:02 AM2/6/10

to

On Sat, 6 Feb 2010, Andi Kleen wrote:

> > in the series; slab_node_prepare() is called in that previous patch by a
> > memory hotplug callback without holding cache_chain_mutex (it's taken by
> > the cpu hotplug callback prior to calling cpuup_prepare() currently). So
> > slab_node_prepare() should note that we require the mutex and the memory
> > hotplug callback should take it in the previous patch.
>
> AFAIK the code is correct. If you feel the need for additional
> documentation feel free to send patches yourself.
>

Documentation? You're required to take cache_chain_mutex before calling
slab_node_prepare() in your memory hotplug notifier, it iterates
cache_chain. Please look again.

Andi Kleen

unread,

Feb 6, 2010, 11:00:02 AM2/6/10

to

> If a hot-added node has not been initialized for the cache, your code is
> picking an existing one in zonelist order which may be excluded by
> current's cpuset. Thus, your code has a very real chance of having
> kmem_getpages() return NULL because get_page_from_freelist() will reject
> non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
> scenario that requires a "funny cpuset," it just has to not allow whatever
> initialized node comes first in the zonelist.

The point was that you would need to run whoever triggers the memory
hotadd in a cpuset with limitations. That would be a clear
don't do that if hurts(tm)

> My suggested alternative does not pick a single initialized node, rather
> it tries all nodes that actually have a chance of having kmem_getpages()
> succeed which increases the probability that your patch actually has an
> effect for cpuset users.

cpuset users are unlikely to trigger memory hotadds from inside limiting
cpusets. Typically that's done from udev etc.

-Andi

--
a...@linux.intel.com -- Speaking for myself only.

David Rientjes

unread,

Feb 6, 2010, 5:40:02 PM2/6/10

to

On Sat, 6 Feb 2010, Andi Kleen wrote:

> > If a hot-added node has not been initialized for the cache, your code is
> > picking an existing one in zonelist order which may be excluded by
> > current's cpuset. Thus, your code has a very real chance of having
> > kmem_getpages() return NULL because get_page_from_freelist() will reject
> > non-atomic ALLOC_CPUSET allocations for prohibited nodes. That isn't a
> > scenario that requires a "funny cpuset," it just has to not allow whatever
> > initialized node comes first in the zonelist.
>
> The point was that you would need to run whoever triggers the memory
> hotadd in a cpuset with limitations. That would be a clear
> don't do that if hurts(tm)
>

With a subset of memory nodes, yes. What else prohibits that except for
your new code?

There's a second issue with this approach that I eluded to above: you're
picking the first initialized node for the cache based solely on whether
it is allocated or not. kmem_getpages() may still return NULL when it
would return new slab for any other initialized node, so you're better off
trying them all.

In other words, my earlier (untested) suggestion:

diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3172,6 +3172,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
gfp_t local_flags;
struct zoneref *z;
struct zone *zone;
+ nodemask_t allowed_nodes = NODE_MASK_NONE;
enum zone_type high_zoneidx = gfp_zone(flags);
void *obj = NULL;
int nid;
@@ -3197,6 +3198,7 @@ retry:
flags | GFP_THISNODE, nid);
if (obj)
break;
+ node_set(nid, allowed_nodes);
}
}

@@ -3210,7 +3212,15 @@ retry:

if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- obj = kmem_getpages(cache, local_flags, numa_node_id());

+ nid = numa_node_id();
+ if (cache->nodelists[nid])

+ obj = kmem_getpages(cache, local_flags, nid);

+ else
+ for_each_node_mask(nid, allowed_nodes) {

+ obj = kmem_getpages(cache, local_flags, nid);

+ if (obj)
+ break;
+ }

if (local_flags & __GFP_WAIT)
local_irq_disable();
if (obj) {

Anyway, I'll leave these otherwise unnecessary limitations to Pekka.
Thanks.