percpu#for-next is for the most part a stable tree and 1-12 are
already in the tree. Can you please send the changes based on the
current percpu#for-next?
Thanks.
--
tejun
Hi Christoph,
Btw, my name is "Math-ieu" ;) I'm not offended by you fat-fingering my
name, it's just rather funny. :)
Please see comments below,
> Instead of using a cmpxchg we keep a state variable in the per cpu structure
> that is incremented when we enter the hot path. We can then detect that
> a thread is in the fastpath and fall back to alternate allocation / free
> technique that bypasses fastpath caching.
>
> A disadvantage is that we have to disable preemption. But if preemption is disabled
> (like on most kernels that I run) then the hotpath becomes very efficient.
>
> Cc: Mathieu Desnoyers <mathieu....@polymtl.ca>
> Cc: Pekka Enberg <pen...@cs.helsinki.fi>
> Signed-off-by: Christoph Lameter <c...@linux-foundation.org>
>
>
> ---
> include/linux/slub_def.h | 1
> mm/slub.c | 91 +++++++++++++++++++++++++++++++++++++----------
> 2 files changed, 74 insertions(+), 18 deletions(-)
>
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h 2009-10-01 15:53:15.000000000 -0500
> +++ linux-2.6/include/linux/slub_def.h 2009-10-01 15:53:15.000000000 -0500
> @@ -38,6 +38,7 @@ struct kmem_cache_cpu {
> void **freelist; /* Pointer to first free per cpu object */
> struct page *page; /* The slab from which we are allocating */
> int node; /* The node of the page (or -1 for debug) */
> + int active; /* Active fastpaths */
> #ifdef CONFIG_SLUB_STATS
> unsigned stat[NR_SLUB_STAT_ITEMS];
> #endif
> Index: linux-2.6/mm/slub.c
> ===================================================================
> --- linux-2.6.orig/mm/slub.c 2009-10-01 15:53:15.000000000 -0500
> +++ linux-2.6/mm/slub.c 2009-10-01 15:53:15.000000000 -0500
> @@ -1606,7 +1606,14 @@ static void *__slab_alloc(struct kmem_ca
> unsigned long addr)
> {
> void **object;
> - struct page *page = __this_cpu_read(s->cpu_slab->page);
> + struct page *page;
> + unsigned long flags;
> + int hotpath;
> +
> + local_irq_save(flags);
> + preempt_enable(); /* Get rid of count */
Ugh? Is that legit?
Checking for preemption in an irq-off context might have weird side-effects
on scheduler real-time behavior (at least). You end up leaving a preempt-off
section while in irq-off mode, so the resched check is skipped, and later
you only re-enable interrupts, which ensures scheduler timings only if a
timer interrupt is still waiting on the interrupt line. But if the timer
interrupt arrived while you were in the preempt-off section, you're doomed:
the scheduler will not be woken up when interrupts are re-enabled.
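In other words, my reading of the ordering this patch ends up with (just a
sketch of the hunks above and of slab_alloc() below):

	preempt_disable();		/* slab_alloc() fastpath entry */
	...
	local_irq_save(flags);		/* __slab_alloc() slowpath */
	preempt_enable();		/* resched check runs with irqs still off */
	...
	local_irq_restore(flags);	/* no resched check here */

so a reschedule that became pending during the preempt-off window has no
guaranteed scheduling point until the next timer tick.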
> + hotpath = __this_cpu_read(s->cpu_slab->active) != 0;
> + page = __this_cpu_read(s->cpu_slab->page);
>
> /* We handle __GFP_ZERO in the caller */
> gfpflags &= ~__GFP_ZERO;
> @@ -1626,13 +1633,21 @@ load_freelist:
> goto another_slab;
> if (unlikely(SLABDEBUG && PageSlubDebug(page)))
> goto debug;
> -
> - __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
> - page->inuse = page->objects;
> - page->freelist = NULL;
> - __this_cpu_write(s->cpu_slab->node, page_to_nid(page));
> + if (unlikely(hotpath)) {
> + /* Object on second free list available and hotpath busy */
> + page->inuse++;
> + page->freelist = get_freepointer(s, object);
> + } else {
> + /* Prepare new list of objects for hotpath */
> + __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
> + page->inuse = page->objects;
> + page->freelist = NULL;
> + __this_cpu_write(s->cpu_slab->node, page_to_nid(page));
> + }
> unlock_out:
> + __this_cpu_dec(s->cpu_slab->active);
> slab_unlock(page);
> + local_irq_restore(flags);
> stat(s, ALLOC_SLOWPATH);
> return object;
>
> @@ -1642,8 +1657,12 @@ another_slab:
> new_slab:
> page = get_partial(s, gfpflags, node);
> if (page) {
> - __this_cpu_write(s->cpu_slab->page, page);
> stat(s, ALLOC_FROM_PARTIAL);
> +
> + if (hotpath)
> + goto hot_lock;
> +
> + __this_cpu_write(s->cpu_slab->page, page);
> goto load_freelist;
> }
>
> @@ -1657,6 +1676,10 @@ new_slab:
>
> if (page) {
> stat(s, ALLOC_SLAB);
> +
> + if (hotpath)
> + goto hot_no_lock;
> +
> if (__this_cpu_read(s->cpu_slab->page))
> flush_slab(s, __this_cpu_ptr(s->cpu_slab));
> slab_lock(page);
> @@ -1664,6 +1687,10 @@ new_slab:
> __this_cpu_write(s->cpu_slab->page, page);
> goto load_freelist;
> }
> +
> + __this_cpu_dec(s->cpu_slab->active);
> + local_irq_restore(flags);
> +
> if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
> slab_out_of_memory(s, gfpflags, node);
> return NULL;
> @@ -1675,6 +1702,19 @@ debug:
> page->freelist = get_freepointer(s, object);
> __this_cpu_write(s->cpu_slab->node, -1);
> goto unlock_out;
> +
> + /*
> + * Hotpath is busy and we need to avoid touching
> + * hotpath variables
> + */
> +hot_no_lock:
> + slab_lock(page);
> +hot_lock:
> + __ClearPageSlubFrozen(page);
> + if (get_freepointer(s, page->freelist))
> + /* Cannot put page into the hotpath. Instead back to partial */
> + add_partial(get_node(s, page_to_nid(page)), page, 0);
> + goto load_freelist;
> }
>
> /*
> @@ -1691,7 +1731,6 @@ static __always_inline void *slab_alloc(
> gfp_t gfpflags, int node, unsigned long addr)
> {
> void **object;
> - unsigned long flags;
>
> gfpflags &= gfp_allowed_mask;
>
> @@ -1701,19 +1740,21 @@ static __always_inline void *slab_alloc(
> if (should_failslab(s->objsize, gfpflags))
> return NULL;
>
> - local_irq_save(flags);
> + preempt_disable();
> + irqsafe_cpu_inc(s->cpu_slab->active);
> object = __this_cpu_read(s->cpu_slab->freelist);
> - if (unlikely(!object || !node_match(s, node)))
> + if (unlikely(!object || !node_match(s, node) ||
> + __this_cpu_read(s->cpu_slab->active)))
Interesting approach! I just wonder about the impact of the
irq-off / preempt-enable dance.
Mathieu
>
> object = __slab_alloc(s, gfpflags, node, addr);
>
> else {
> __this_cpu_write(s->cpu_slab->freelist,
> get_freepointer(s, object));
> + irqsafe_cpu_dec(s->cpu_slab->active);
> + preempt_enable();
> stat(s, ALLOC_FASTPATH);
> }
> - local_irq_restore(flags);
> -
> if (unlikely((gfpflags & __GFP_ZERO) && object))
> memset(object, 0, s->objsize);
>
> @@ -1777,6 +1818,11 @@ static void __slab_free(struct kmem_cach
> {
> void *prior;
> void **object = (void *)x;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + preempt_enable(); /* Fix up count */
> + __this_cpu_dec(s->cpu_slab->active);
>
> stat(s, FREE_SLOWPATH);
> slab_lock(page);
> @@ -1809,6 +1855,7 @@ checks_ok:
>
> out_unlock:
> slab_unlock(page);
> + local_irq_restore(flags);
> return;
>
> slab_empty:
> @@ -1820,6 +1867,7 @@ slab_empty:
> stat(s, FREE_REMOVE_PARTIAL);
> }
> slab_unlock(page);
> + local_irq_restore(flags);
> stat(s, FREE_SLAB);
> discard_slab(s, page);
> return;
> @@ -1845,24 +1893,26 @@ static __always_inline void slab_free(st
> struct page *page, void *x, unsigned long addr)
> {
> void **object = (void *)x;
> - unsigned long flags;
>
> kmemleak_free_recursive(x, s->flags);
> - local_irq_save(flags);
> kmemcheck_slab_free(s, object, s->objsize);
> debug_check_no_locks_freed(object, s->objsize);
> if (!(s->flags & SLAB_DEBUG_OBJECTS))
> debug_check_no_obj_freed(object, s->objsize);
>
> + preempt_disable();
> + irqsafe_cpu_inc(s->cpu_slab->active);
> if (likely(page == __this_cpu_read(s->cpu_slab->page) &&
> - __this_cpu_read(s->cpu_slab->node) >= 0)) {
> - set_freepointer(s, object, __this_cpu_read(s->cpu_slab->freelist));
> + __this_cpu_read(s->cpu_slab->node) >= 0) &&
> + !__this_cpu_read(s->cpu_slab->active)) {
> + set_freepointer(s, object,
> + __this_cpu_read(s->cpu_slab->freelist));
> __this_cpu_write(s->cpu_slab->freelist, object);
> + irqsafe_cpu_dec(s->cpu_slab->active);
> + preempt_enable();
> stat(s, FREE_FASTPATH);
> } else
> __slab_free(s, page, x, addr);
> -
> - local_irq_restore(flags);
> }
>
> void kmem_cache_free(struct kmem_cache *s, void *x)
> @@ -2064,6 +2114,8 @@ static DEFINE_PER_CPU(struct kmem_cache_
>
> static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> {
> + int cpu;
> +
> if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
> /*
> * Boot time creation of the kmalloc array. Use static per cpu data
> @@ -2073,6 +2125,9 @@ static inline int alloc_kmem_cache_cpus(
> else
> s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
>
> + for_each_possible_cpu(cpu)
> + per_cpu_ptr(s->cpu_slab, cpu)->active = -1;
> +
> if (!s->cpu_slab)
> return 0;
>
>
> --
Yeah, it reads rather awkward, and the comment doesn't make it any
better, but what he's doing is:
slab_alloc()
preempt_disable();
__slab_alloc()
local_irq_save(flags);
preempt_enable();
Yes, I understood this is what he was doing, but I wonder about the
impact on the scheduler. If we have:
* Jiffy 1 -- timer interrupt
* preempt disable
* Jiffy 2 -- timer interrupt
-> here, the scheduler is disabled, so the timer interrupt is skipped.
The scheduler depends on preempt_check_resched() at preempt_enable()
to execute in a bounded amount of time.
* local_irq_save
* preempt_enable
-> interrupts are disabled, scheduler execution is skipped.
* local_irq_restore
-> the interrupt line is low. The scheduler won't be called. There is
no preempt_check_resched() call.
* Jiffy 3 -- timer interrupt
-> Scheduler finally gets executed, missing a whole jiffy.
At the very least, I think an explicit preempt_check_resched should be
added after local_irq_restore().
Also, preempt_enable here should be replaced with
preempt_enable_no_resched().
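I.e., roughly (a sketch of what I mean, not against any particular tree):

	local_irq_save(flags);
	preempt_enable_no_resched();	/* drop the count, no resched check yet */
	... slow path ...
	local_irq_restore(flags);
	preempt_check_resched();	/* explicit scheduling point, irqs back on */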
Mathieu
That would be an issue with all irq disable sections, so I don't think
this is actually true.
I tried to find the actual code, but frigging paravirt crap obfuscated
the code enough that I actually gave up.
AFAIK, irq disable sections rely on the fact that if you get a timer
interrupt during this section, the timer interrupt line stays triggered
for the duration of the irqoff section. Therefore, when interrupts are
re-enabled, the interrupt kicks in, so does the scheduler.
This is not the case with the preempt/irqoff dance proposed by
Christoph.
> I tried to find the actual code, but frigging paravirt crap obfuscated
> the code enough that I actually gave up.
You're probably looking for:
arch/x86/include/asm/irqflags.h:
static inline void native_irq_enable(void)
{
asm volatile("sti": : :"memory");
}
and
static inline void native_restore_fl(unsigned long flags)
{
asm volatile("push %0 ; popf"
: /* no output */
:"g" (flags)
:"memory", "cc");
}
static inline void raw_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}
Which are as simple as it gets.
Thanks,
Mathieu
> On Wed, 2009-10-07 at 08:46 -0400, Mathieu Desnoyers wrote:
> > * local_irq_restore
> > -> the interrupt line is low. The scheduler won't be called. There is
> > no preempt_check_resched() call.
>
> That would be an issue with all irq disable sections, so I don't think
> this is actually true.
This was not true in the past... News to me that an irq enable would not
be a scheduling point.
> c...@linux-foundation.org wrote:
> > This patch introduces two things: First this_cpu_ptr and then per cpu
> > atomic operations.
>
> percpu#for-next is for the most part a stable tree and 1-12 are
> already in the tree. Can you please send the changes based on the
> current percpu#for-next?
ok should get to it by the afternoon.
Ah, you're quite right indeed.
Yes. All you say here is true. I'm concerned about the _impact_ of this
along with the preempt/irqoff dance you propose. Trimming the following
key points from my execution scenario indeed skips the problem altogether.
Usually, when preemption is disabled, the scheduler refrains from
executing. *Now the important point*: what bounds the maximum amount of
time before the scheduler re-checks for pending preemption is the moment
preempt_enable() re-activates preemption.
But because you run preempt_enable() with interrupts off, the scheduler
check is not done. And it's not done when interrupts are re-activated
either.
Please go back to my complete execution scenario, you'll probably see
the light. ;)
Thanks,
Mathieu
>
> > Also, preempt_enable here should be replaced with
> > preempt_enable_no_resched().
>
> Used to have that in earlier incarnations but I saw a lot of these being
> removed lately.
> Yes, I understood this is what he was doing, but I wonder about the
> impact on the scheduler. If we have:
>
> * Jiffy 1 -- timer interrupt
>
> * preempt disable
> * Jiffy 2 -- timer interrupt
> -> here, the scheduler is disabled, so the timer interrupt is skipped.
> The scheduler depends on preempt_check_resched() at preempt_enable()
> to execute in a bounded amount of time.
preempt_disable() does not disable interrupts. The timer interrupt will
still occur. The scheduler just may not reschedule another task on this
processor when the timer interrupt calls scheduler_tick(), and it may not
do load balancing.
> Also, preempt_enable here should be replaced with
> preempt_enable_no_resched().
Used to have that in earlier incarnations but I saw a lot of these being
removed lately.
--
Which we like to keep as low as possible, I agree.
> > But because you run preempt_enable with interrupts off, the scheduler
> > check is not done. And it's not done when interrupts are re-activated
> > neither.
>
> Ok so we should be moving the preempt_enable after the irq enable. Then we
> will call into the scheduler at the end of the slow path. This may add
> significantly more overhead than we had before when we simply disabled
> and enabled interrupts...
>
You are already calling the scheduler when ending the _fast_ path. I
don't see the problem with calling it when you end the slow path
execution.
Mathieu
> Usually, when preemption is disabled, the scheduler refrains from
> executing. *Now the important point*: what bounds the maximum amount of
> time before the scheduler re-checks for pending preemption is the moment
> preempt_enable() re-activates preemption.
Which creates additional overhead in the allocator.
> But because you run preempt_enable() with interrupts off, the scheduler
> check is not done. And it's not done when interrupts are re-activated
> either.
Ok so we should be moving the preempt_enable after the irq enable. Then we
will call into the scheduler at the end of the slow path. This may add
significantly more overhead than we had before when we simply disabled
and enabled interrupts...
> You are already calling the scheduler when ending the _fast_ path. I
> don't see the problem with calling it when you end the slow path
> execution.
Well, yes, that gives rise to the thought of using
preempt_enable_no_resched()
at the end of the fastpath as well. Constant calls into the scheduler
could be a major performance issue. I don't notice it here since I usually
cannot afford the preempt overhead and build kernels without support for
it.
> Btw, my name is "Math-ieu" ;) I'm not offended by you fat-fingering my
> name, it's just rather funny. :)
Sorry. But I got this right further down below...
thanks!
--
tejun
... but the opposite is a major RT behavior killer.
preempt_check_resched() is basically:
  a test of TIF_NEED_RESCHED;
  if true, a call to preempt_schedule().
So it's not a call into the scheduler at each and every preempt_enable().
It happens _only_ if an interrupt came in during the preempt-off section
and the scheduler decided that it should reschedule soon.
I really don't see what's bothering you here. Testing a thread flag is
incredibly cheap. That's what is typically added to your fast path.
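From memory, the definition in include/linux/preempt.h is roughly:

#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)

so the common case is a single thread-flag test.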
So, correct behavior would be:
preempt disable()
fast path attempt
if (fast path already taken) {
local_irq_save();
slow path.
local_irq_restore();
}
preempt_enable()
Mathieu
> . I don't notice it here since I usually
> cannot afford the preempt overhead and build kernels without support for
> it.
>
>
What you say applies to preempt_enable(). I was describing
preempt_check_resched() above, which involves neither a compiler barrier
nor an increment.
By the way, the barrier() you are talking about is in
preempt_enable_no_resched(), the very primitive you are considering
using to save these precious cycles.
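For reference, the relevant macros look roughly like this (from memory,
CONFIG_PREEMPT case):

#define preempt_disable() \
do { \
	inc_preempt_count(); \
	barrier(); \
} while (0)

#define preempt_enable_no_resched() \
do { \
	barrier(); \
	dec_preempt_count(); \
} while (0)

#define preempt_enable() \
do { \
	preempt_enable_no_resched(); \
	barrier(); \
	preempt_check_resched(); \
} while (0)

So the count manipulation and the compiler barrier are there in both
flavors; only the need_resched() test differs.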
> > I really don't see what's bothering you here. Testing a thread flag is
> > incredibly cheap. That's what is typically added to your fast path.
>
> I am trying to get rid of all unnecessary overhead. These "incredibly
> cheap" tricks en masse have caused lots of regressions. And the allocator
> hotpaths are overloaded with these "incredibly cheap" checks already.
>
> > So, correct behavior would be:
> >
> > preempt disable()
> > fast path attempt
> > if (fast path already taken) {
> > local_irq_save();
> > slow path.
> > local_irq_restore();
> > }
> > preempt_enable()
>
> Ok. If you have to use preempt then you have to suffer I guess..
>
Yes. A user enabling full preemption should be aware that it has a
performance footprint.
By the way, from what I remember of the slub allocator, you might find
the following more suited for your needs. I remember that the slow path
sometimes needs to re-enable interrupts, so:
preempt disable()
fast path attempt
if (fast path already taken) {
local_irq_save();
preempt_enable_no_resched();
slow path {
if (!(flags & GFP_ATOMIC)) {
local_irq_enable();
preempt_check_resched();
...
local_irq_disable();
}
}
local_irq_restore();
preempt_check_resched();
return;
}
preempt_enable()
This should work, be efficient and manage to ensure scheduler RT
correctness.
Mathieu
> preempt_check_resched() is basically:
>
>   a test of TIF_NEED_RESCHED;
>   if true, a call to preempt_schedule().
You did not mention the effect of incrementing the preempt counter and
the barrier(). That adds an additional cacheline touch to a very hot OS
path, and possibly has register effects.
> I really don't see what's bothering you here. Testing a thread flag is
> incredibly cheap. That's what is typically added to your fast path.
I am trying to get rid of all unnecessary overhead. These "incredibly
cheap" tricks en masse have caused lots of regressions. And the allocator
hotpaths are overloaded with these "incredibly cheap" checks already.
> So, correct behavior would be:
>
> preempt disable()
> fast path attempt
> if (fast path already taken) {
> local_irq_save();
> slow path.
> local_irq_restore();
> }
> preempt_enable()
Ok. If you have to use preempt then you have to suffer I guess..
--
> @@ -2387,8 +2364,11 @@ static int kmem_cache_open(struct kmem_c
> if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
> goto error;
>
> - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
> + if (!alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
> +
> + if (s->cpu_slab)
> return 1;
> +
> free_kmem_cache_nodes(s);
Argh. I goofed while fixing a diff problem shortly before release.
The following patch fixes the patch:
---
mm/slub.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2009-10-07 18:00:06.000000000 -0500
+++ linux-2.6/mm/slub.c 2009-10-07 18:03:05.000000000 -0500
@@ -2364,9 +2364,7 @@ static int kmem_cache_open(struct kmem_c
if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
goto error;
- if (!alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
-
- if (s->cpu_slab)
+ if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
return 1;
free_kmem_cache_nodes(s);
NACK, delaying the reschedule is not an option
No, the thing you proposed:
> preempt disable()
> fast path attempt
> if (fast path already taken) {
> local_irq_save();
> preempt_enable_no_resched();
> slow path {
> if (!(flags & GFP_ATOMIC)) {
> local_irq_enable();
> preempt_check_resched();
> ...
> local_irq_disable();
> }
> }
> local_irq_restore();
> preempt_check_resched();
> return;
> }
> preempt_enable()
Seems ok.
I just don't get why Christoph is getting all upset about the
need_resched() check in preempt_enable(), it's still cheaper than poking
at the interrupt flags.
Even if it is only done with interrupts off, and a resched check is made
after each irq enable following this critical section? I'd like to
understand the reason behind your rejection for this specific case.
Thanks,
Mathieu
> > preempt_enable_no_resched()
>
> NACK, delaying the reschedule is not an option
I ended up just doing a preempt_disable() at the beginning and a
preempt_enable() at the end. That should be easily reviewable.
> I just don't get why Christoph is getting all upset about the
> need_resched() check in preempt_enable(), it's still cheaper than poking
> at the interrupt flags.
Preempt causes too many performance issues. I keep on thinking that I
could make it easier on people to use it. Guess I better leave it simple
then.
I agree with you. The need_resched() check is incredibly cheap. And if
Christoph still complains about the compiler barrier in
preempt_enable_no_resched()/preempt_disable(), then I think he should
consider the fact that
the compiler does not perform cross-function optimizations, and consider
putting the preempt disable/enable statements close to function
boundaries. Therefore, the impact in terms of compiler optimization
restrictions should be minimal.
The scheme I proposed above should be OK in terms of scheduler effect
and permit to deal with re-enabling preemption in the slow path
appropriately.
Mathieu
Then how do you re-enable preemption in the slow path when you need to
block?
Mathieu
> Then how do you re-enable preemption in the slow path when you need to
> block?
preempt_enable();
before the page allocator call and
preempt_disable();
afterwards.
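I.e. around the new_slab() call in __slab_alloc(), roughly (this is what
the full patch below does):

	if (gfpflags & __GFP_WAIT)
		local_irq_enable();
	preempt_enable();
	page = new_slab(s, gfpflags, node);
	preempt_disable();
	if (gfpflags & __GFP_WAIT)
		local_irq_disable();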
OK, sounds good,
Mathieu
> OK, sounds good,
Then here the full patch for review (vs. percpu-next):
From: Christoph Lameter <c...@linux-foundation.org>
Subject: SLUB: Experimental new fastpath w/o interrupt disable
This is a bit of a different tack on things than the last version provided
by Mathieu.
Instead of using a cmpxchg we keep a state variable in the per cpu structure
that is incremented when we enter the hot path. We can then detect that
a thread is in the fastpath and fall back to alternate allocation / free
technique that bypasses fastpath caching.
V1->V2:
- Use safe preempt_enable / disable.
- Enable preemption before calling into the page allocator.
- Check for hotpath activity changes when returning from the page allocator.
- Add barriers.
Todo:
- Verify that this is really safe
- Is this a benefit?
Cc: Mathieu Desnoyers <mathieu....@polymtl.ca>
Cc: Pekka Enberg <pen...@cs.helsinki.fi>
Signed-off-by: Christoph Lameter <c...@linux-foundation.org>
---
include/linux/slub_def.h | 1
mm/slub.c | 112 +++++++++++++++++++++++++++++++++++++++--------
2 files changed, 96 insertions(+), 17 deletions(-)
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h 2009-10-08 11:35:59.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h 2009-10-08 11:35:59.000000000 -0500
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
void **freelist; /* Pointer to first free per cpu object */
struct page *page; /* The slab from which we are allocating */
int node; /* The node of the page (or -1 for debug) */
+ int active; /* Active fastpaths */
#ifdef CONFIG_SLUB_STATS
unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c 2009-10-08 11:35:59.000000000 -0500
+++ linux-2.6/mm/slub.c 2009-10-08 14:03:22.000000000 -0500
@@ -1606,7 +1606,14 @@ static void *__slab_alloc(struct kmem_ca
unsigned long addr)
{
void **object;
- struct page *page = __this_cpu_read(s->cpu_slab->page);
+ struct page *page;
+ unsigned long flags;
+ int hotpath;
+
+ local_irq_save(flags);
+ hotpath = __this_cpu_read(s->cpu_slab->active) != 0;
+ __this_cpu_dec(s->cpu_slab->active); /* Drop count from hotpath */
+ page = __this_cpu_read(s->cpu_slab->page);
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
@@ -1627,12 +1634,24 @@ load_freelist:
if (unlikely(SLABDEBUG && PageSlubDebug(page)))
goto debug;
- __this_cpu_write(s->cpu_slab->node, page_to_nid(page));
- __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
- page->inuse = page->objects;
- page->freelist = NULL;
+ if (unlikely(hotpath)) {
+ /* Object on page free list available and hotpath busy */
+ page->inuse++;
+ page->freelist = get_freepointer(s, object);
+
+ } else {
+
+ /* Prepare new list of objects for hotpath */
+ __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object));
+ page->inuse = page->objects;
+ page->freelist = NULL;
+ __this_cpu_write(s->cpu_slab->node, page_to_nid(page));
+
+ }
+
unlock_out:
slab_unlock(page);
+ local_irq_restore(flags);
stat(s, ALLOC_SLOWPATH);
return object;
@@ -1642,21 +1661,38 @@ another_slab:
new_slab:
page = get_partial(s, gfpflags, node);
if (page) {
- __this_cpu_write(s->cpu_slab->page, page);
stat(s, ALLOC_FROM_PARTIAL);
+
+ if (hotpath)
+ goto hot_lock;
+
+ __this_cpu_write(s->cpu_slab->page, page);
goto load_freelist;
}
if (gfpflags & __GFP_WAIT)
local_irq_enable();
+ preempt_enable();
page = new_slab(s, gfpflags, node);
+ preempt_disable();
+
+ /*
+ * We have already decremented our count. Someone else
+ * could be running right now or we were moved to a
+ * processor that is in the hotpath. So check against -1.
+ */
+ hotpath = __this_cpu_read(s->cpu_slab->active) != -1;
if (gfpflags & __GFP_WAIT)
local_irq_disable();
if (page) {
stat(s, ALLOC_SLAB);
+
+ if (hotpath)
+ goto hot_no_lock;
+
if (__this_cpu_read(s->cpu_slab->page))
flush_slab(s, __this_cpu_ptr(s->cpu_slab));
slab_lock(page);
@@ -1664,9 +1700,13 @@ new_slab:
__this_cpu_write(s->cpu_slab->page, page);
goto load_freelist;
}
+
+ local_irq_restore(flags);
+
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
return NULL;
+
debug:
if (!alloc_debug_processing(s, page, object, addr))
goto another_slab;
@@ -1675,6 +1715,20 @@ debug:
page->freelist = get_freepointer(s, object);
__this_cpu_write(s->cpu_slab->node, -1);
goto unlock_out;
+
+ /*
+ * Hotpath is busy and we need to avoid touching
+ * hotpath variables
+ */
+hot_no_lock:
+ slab_lock(page);
+
+hot_lock:
+ __ClearPageSlubFrozen(page);
+ if (get_freepointer(s, page->freelist))
+ /* Cannot put page into the hotpath. Instead back to partial */
+ add_partial(get_node(s, page_to_nid(page)), page, 0);
+ goto load_freelist;
}
/*
@@ -1691,7 +1745,6 @@ static __always_inline void *slab_alloc(
gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
- unsigned long flags;
gfpflags &= gfp_allowed_mask;
@@ -1701,18 +1754,27 @@ static __always_inline void *slab_alloc(
if (should_failslab(s->objsize, gfpflags))
return NULL;
- local_irq_save(flags);
+ preempt_disable();
+
+ irqsafe_cpu_inc(s->cpu_slab->active);
+ barrier();
object = __this_cpu_read(s->cpu_slab->freelist);
- if (unlikely(!object || !node_match(s, node)))
+ if (unlikely(!object || !node_match(s, node) ||
+ __this_cpu_read(s->cpu_slab->active)))
object = __slab_alloc(s, gfpflags, node, addr);
else {
+
__this_cpu_write(s->cpu_slab->freelist,
get_freepointer(s, object));
+ barrier();
+ irqsafe_cpu_dec(s->cpu_slab->active);
stat(s, ALLOC_FASTPATH);
+
}
- local_irq_restore(flags);
+
+ preempt_enable();
if (unlikely((gfpflags & __GFP_ZERO) && object))
memset(object, 0, s->objsize);
@@ -1777,7 +1839,9 @@ static void __slab_free(struct kmem_cach
{
void *prior;
void **object = (void *)x;
+ unsigned long flags;
+ local_irq_save(flags);
stat(s, FREE_SLOWPATH);
slab_lock(page);
@@ -1809,6 +1873,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
+ local_irq_restore(flags);
return;
slab_empty:
@@ -1820,6 +1885,7 @@ slab_empty:
stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
+ local_irq_restore(flags);
stat(s, FREE_SLAB);
discard_slab(s, page);
return;
@@ -1845,24 +1911,31 @@ static __always_inline void slab_free(st
struct page *page, void *x, unsigned long addr)
{
void **object = (void *)x;
- unsigned long flags;
kmemleak_free_recursive(x, s->flags);
- local_irq_save(flags);
+ preempt_disable();
kmemcheck_slab_free(s, object, s->objsize);
debug_check_no_locks_freed(object, s->objsize);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(object, s->objsize);
+ irqsafe_cpu_inc(s->cpu_slab->active);
+ barrier();
if (likely(page == __this_cpu_read(s->cpu_slab->page) &&
- __this_cpu_read(s->cpu_slab->node) >= 0)) {
- set_freepointer(s, object, __this_cpu_read(s->cpu_slab->freelist));
+ __this_cpu_read(s->cpu_slab->node) >= 0) &&
+ !__this_cpu_read(s->cpu_slab->active)) {
+ set_freepointer(s, object,
+ __this_cpu_read(s->cpu_slab->freelist));
__this_cpu_write(s->cpu_slab->freelist, object);
+ barrier();
+ irqsafe_cpu_dec(s->cpu_slab->active);
+ preempt_enable();
stat(s, FREE_FASTPATH);
- } else
+ } else {
+ irqsafe_cpu_dec(s->cpu_slab->active);
+ preempt_enable();
__slab_free(s, page, x, addr);
-
- local_irq_restore(flags);
+ }
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2064,6 +2137,8 @@ static DEFINE_PER_CPU(struct kmem_cache_
static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
{
+ int cpu;
+
if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
/*
* Boot time creation of the kmalloc array. Use static per cpu data
@@ -2073,6 +2148,9 @@ static inline int alloc_kmem_cache_cpus(
else
s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->active = -1;
+
if (!s->cpu_slab)
return 0;
(Recommend adding)
preempt_enable_no_resched();
The preempt_enable() right in the middle of a big function adds an
unnecessary barrier(), which will keep gcc from doing its optimizations.
This might hurt performance.
I still recommend the preempt_enable_no_resched() at the beginning of
__slab_alloc(), and simply putting a preempt_check_resched() here (which
saves us the odd compiler barrier in the middle of the function).
We could replace the above by:
if (gfpflags & __GFP_WAIT) {
local_irq_enable();
preempt_check_resched();
}
> page = new_slab(s, gfpflags, node);
> + preempt_disable();
> +
(remove the above)
Missing a barrier() here?
The idea is to let gcc know that "active" inc/dec and "freelist" reads
must never be reordered. Even when the decrement is done in the slow
path branch.
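For instance (purely illustrative, just to show where I mean the barrier
to go):

	if (unlikely(!object || !node_match(s, node) ||
			__this_cpu_read(s->cpu_slab->active))) {
		barrier();	/* don't let the reads above drift below this */
		object = __slab_alloc(s, gfpflags, node, addr);
	} else {
		...
	}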
>
> object = __slab_alloc(s, gfpflags, node, addr);
>
> else {
> +
> __this_cpu_write(s->cpu_slab->freelist,
> get_freepointer(s, object));
> + barrier();
> + irqsafe_cpu_dec(s->cpu_slab->active);
> stat(s, ALLOC_FASTPATH);
> +
> }
> - local_irq_restore(flags);
> +
> + preempt_enable();
Could move the preempt_enable() above to the else (fast path) branch.
Perhaps missing a barrier() in the else?
Thanks,
Mathieu
> + irqsafe_cpu_dec(s->cpu_slab->active);
> + preempt_enable();
> __slab_free(s, page, x, addr);
> -
> - local_irq_restore(flags);
> + }
> }
>
> void kmem_cache_free(struct kmem_cache *s, void *x)
> @@ -2064,6 +2137,8 @@ static DEFINE_PER_CPU(struct kmem_cache_
>
> static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
> {
> + int cpu;
> +
> if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
> /*
> * Boot time creation of the kmalloc array. Use static per cpu data
> @@ -2073,6 +2148,9 @@ static inline int alloc_kmem_cache_cpus(
> else
> s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
>
> + for_each_possible_cpu(cpu)
> + per_cpu_ptr(s->cpu_slab, cpu)->active = -1;
> +
> if (!s->cpu_slab)
> return 0;
>
> > Index: linux-2.6/mm/slub.c
> > ===================================================================
> > --- linux-2.6.orig/mm/slub.c 2009-10-08 11:35:59.000000000 -0500
> > +++ linux-2.6/mm/slub.c 2009-10-08 14:03:22.000000000 -0500
> > @@ -1606,7 +1606,14 @@ static void *__slab_alloc(struct kmem_ca
> > unsigned long addr)
> > {
> > void **object;
> > - struct page *page = __this_cpu_read(s->cpu_slab->page);
> > + struct page *page;
> > + unsigned long flags;
> > + int hotpath;
> > +
> > + local_irq_save(flags);
>
> (Recommend adding)
>
> preempt_enable_no_resched();
>
>
> The preempt_enable() right in the middle of a big function adds an
> unnecessary barrier(), which will keep gcc from doing its optimizations.
> This might hurt performance.
In the middle of the function we have determined that we have to go to the
page allocator to get more memory. There is not much the compiler can do
to speed that up.
> I still recommend the preempt_enable_no_resched() at the beginning of
> __slab_alloc(), and simply putting a preempt_check_resched() here (which
> saves us the odd compiler barrier in the middle of the function).
Then preemption would be unnecessarily disabled for the page allocator
call?
> > if (gfpflags & __GFP_WAIT)
> > local_irq_enable();
> >
> > + preempt_enable();
>
> We could replace the above by:
>
> if (gfpflags & __GFP_WAIT) {
> local_irq_enable();
> preempt_check_resched();
> }
Which would leave preempt off for the page allocator.
> > + irqsafe_cpu_inc(s->cpu_slab->active);
> > + barrier();
> > object = __this_cpu_read(s->cpu_slab->freelist);
> > - if (unlikely(!object || !node_match(s, node)))
> > + if (unlikely(!object || !node_match(s, node) ||
> > + __this_cpu_read(s->cpu_slab->active)))
>
> Missing a barrier() here?
The modifications of the s->cpu_slab->freelist in __slab_alloc() are only
done after interrupts have been disabled and after the slab has been locked.
> The idea is to let gcc know that "active" inc/dec and "freelist" reads
> must never be reordered. Even when the decrement is done in the slow
> path branch.
Right. How could that occur with this code?
> > + preempt_enable();
> > stat(s, FREE_FASTPATH);
> > - } else
> > + } else {
>
> Perhaps missing a barrier() in the else?
Not sure why that would be necessary. __slab_free() does not even touch
s->cpu_slab->freelist; otherwise the same reasons as in the alloc path apply.