The slowdown was tracked to lookup_memtype(), which acquires the
spinlock memtype_lock. This heavily contended lock was slowing down
vm_insert_pfn().
With the cmpxchg-on-page->flags method, both the 32-cpu and 256-cpu
cases complete in approximately 1.3 seconds.
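For readers unfamiliar with the pattern, here is a minimal userspace
analogue of the lock-free update (the names and bit values are
illustrative, not taken from the patch; the kernel version operates on
page->flags with its own cmpxchg()):

	#include <stdio.h>

	#define MT_MASK	0x3UL	/* stand-in for the two PAT page-flag bits */
	#define MT_WC	0x1UL	/* stand-in for _PGMT_WC */

	static unsigned long flags;	/* stand-in for page->flags */

	static void set_memtype_bits(unsigned long memtype_bits)
	{
		unsigned long old_flags;
		unsigned long new_flags;

		do {
			old_flags = flags;
			new_flags = (old_flags & ~MT_MASK) | memtype_bits;
			/* Retry if another CPU changed flags in between:
			 * the CAS only succeeds while flags == old_flags. */
		} while (__sync_val_compare_and_swap(&flags, old_flags,
						     new_flags) != old_flags);
	}

	int main(void)
	{
		set_memtype_bits(MT_WC);
		printf("flags = %#lx\n", flags);
		return 0;
	}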
To: Ingo Molnar <mi...@redhat.com>
To: H. Peter Anvin <h...@zytor.com>
To: Thomas Gleixner <tg...@linutronix.de>
Signed-off-by: Robin Holt <ho...@sgi.com>
Cc: Venkatesh Pallipadi <venkatesh...@intel.com>
Cc: Suresh Siddha <suresh....@intel.com>
Cc: Linux Kernel Mailing List <linux-...@vger.kernel.org>
Cc: x...@kernel.org
---
Changes since -V1:
1) Set and clear the page flags atomically instead of using the global
memtype_lock to protect page->flags.
2) This made it possible to convert the rwlock back into a spinlock
without affecting _MY_ test's performance, as all the pages my test
was using are tracked by struct pages.
3) Corrected the commit log. The timings were for 32 cpus, not 256.
arch/x86/include/asm/cacheflush.h | 43 +++++++++++++++++++++-----------------
arch/x86/mm/pat.c | 10 --------
2 files changed, 25 insertions(+), 28 deletions(-)
Index: memtype_lock_rwlock_V1/arch/x86/include/asm/cacheflush.h
===================================================================
--- memtype_lock_rwlock_V1.orig/arch/x86/include/asm/cacheflush.h 2010-03-11 10:15:18.000000000 -0600
+++ memtype_lock_rwlock_V1/arch/x86/include/asm/cacheflush.h 2010-03-11 10:15:20.000000000 -0600
@@ -44,9 +44,6 @@ static inline void copy_from_user_page(s
memcpy(dst, src, len);
}
-#define PG_WC PG_arch_1
-PAGEFLAG(WC, WC)
-
#ifdef CONFIG_X86_PAT
/*
* X86 PAT uses page flags WC and Uncached together to keep track of
@@ -55,16 +52,23 @@ PAGEFLAG(WC, WC)
* _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not
* been changed from its default (value of -1 used to denote this).
* Note we do not support _PAGE_CACHE_UC here.
- *
- * Caller must hold memtype_lock for atomicity.
*/
+
+#define _PGMT_DEFAULT 0
+#define _PGMT_WC (1UL << PG_arch_1)
+#define _PGMT_UC_MINUS (1UL << PG_uncached)
+#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK (~_PGMT_WB)
+
static inline unsigned long get_page_memtype(struct page *pg)
{
- if (!PageUncached(pg) && !PageWC(pg))
+ unsigned long pg_flags = pg->flags & (1UL << PG_uncached | 1UL << PG_arch_1);
+
+ if (pg_flags == _PGMT_DEFAULT)
return -1;
- else if (!PageUncached(pg) && PageWC(pg))
+ else if (pg_flags == _PGMT_WC)
return _PAGE_CACHE_WC;
- else if (PageUncached(pg) && !PageWC(pg))
+ else if (pg_flags == _PGMT_UC_MINUS)
return _PAGE_CACHE_UC_MINUS;
else
return _PAGE_CACHE_WB;
@@ -72,25 +76,26 @@ static inline unsigned long get_page_mem
static inline void set_page_memtype(struct page *pg, unsigned long memtype)
{
+ unsigned long memtype_flags = _PGMT_DEFAULT;
+ unsigned long old_flags;
+ unsigned long new_flags;
+
switch (memtype) {
case _PAGE_CACHE_WC:
- ClearPageUncached(pg);
- SetPageWC(pg);
+ memtype_flags = _PGMT_WC;
break;
case _PAGE_CACHE_UC_MINUS:
- SetPageUncached(pg);
- ClearPageWC(pg);
+ memtype_flags = _PGMT_UC_MINUS;
break;
case _PAGE_CACHE_WB:
- SetPageUncached(pg);
- SetPageWC(pg);
- break;
- default:
- case -1:
- ClearPageUncached(pg);
- ClearPageWC(pg);
+ memtype_flags = _PGMT_WB;
break;
}
+
+ do {
+ old_flags = pg->flags;
+ new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+ } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
#else
static inline unsigned long get_page_memtype(struct page *pg) { return -1; }
Index: memtype_lock_rwlock_V1/arch/x86/mm/pat.c
===================================================================
--- memtype_lock_rwlock_V1.orig/arch/x86/mm/pat.c 2010-03-11 10:15:18.000000000 -0600
+++ memtype_lock_rwlock_V1/arch/x86/mm/pat.c 2010-03-11 10:15:20.000000000 -0600
@@ -156,7 +156,7 @@ static char *cattr_name(unsigned long fl
* The data structure is a list that is also organized as an rbtree
* sorted on the start address of memtype range.
*
- * memtype_lock protects both the linear list and rbtree.
+ * memtype_lock protects the rbtree.
*/
struct memtype {
@@ -296,8 +296,6 @@ static int pat_pagerange_is_ram(unsigned
* Here we do two pass:
* - Find the memtype of all the pages in the range, look for any conflicts
* - In case of no conflicts, set the new memtype for pages in the range
- *
- * Caller must hold memtype_lock for atomicity.
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
@@ -404,9 +402,7 @@ int reserve_memtype(u64 start, u64 end,
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1) {
- spin_lock(&memtype_lock);
err = reserve_ram_pages_type(start, end, req_type, new_type);
- spin_unlock(&memtype_lock);
return err;
} else if (is_range_ram < 0) {
@@ -501,9 +497,7 @@ int free_memtype(u64 start, u64 end)
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1) {
- spin_lock(&memtype_lock);
err = free_ram_pages_type(start, end);
- spin_unlock(&memtype_lock);
return err;
} else if (is_range_ram < 0) {
@@ -583,10 +577,8 @@ static unsigned long lookup_memtype(u64
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
struct page *page;
- spin_lock(&memtype_lock);
page = pfn_to_page(paddr >> PAGE_SHIFT);
rettype = get_page_memtype(page);
- spin_unlock(&memtype_lock);
/*
* -1 from get_page_memtype() implies RAM page is in its
* default state and not reserved, and hence of type WB
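Note what the diff buys on the read side: get_page_memtype() now
samples page->flags once and decodes the two bits locklessly, so
lookup_memtype() no longer needs memtype_lock for RAM pages. A hedged
userspace analogue of that decoding, with placeholder bit values and
return codes standing in for PG_uncached/PG_arch_1 and the
_PAGE_CACHE_* constants:

	#include <stdio.h>

	/* Placeholder bits; the kernel uses 1UL << PG_arch_1 and
	 * 1UL << PG_uncached. */
	#define MT_DEFAULT	0x0UL
	#define MT_WC		0x1UL
	#define MT_UC_MINUS	0x2UL
	#define MT_WB		0x3UL
	#define MT_MASK		0x3UL

	/* Placeholder return codes for the _PAGE_CACHE_* values. */
	enum memtype { MEMTYPE_NONE = -1, MEMTYPE_WC, MEMTYPE_UC_MINUS,
		       MEMTYPE_WB };

	static enum memtype get_memtype(unsigned long flags)
	{
		/* One read of the flag word; the two bits encode all
		 * four states, so no lock is needed. */
		switch (flags & MT_MASK) {
		case MT_WC:
			return MEMTYPE_WC;
		case MT_UC_MINUS:
			return MEMTYPE_UC_MINUS;
		case MT_WB:
			return MEMTYPE_WB;
		default:	/* MT_DEFAULT: page never reserved */
			return MEMTYPE_NONE;
		}
	}

	int main(void)
	{
		printf("%d %d %d %d\n",
		       get_memtype(MT_DEFAULT), get_memtype(MT_WC),
		       get_memtype(MT_UC_MINUS), get_memtype(MT_WB));
		return 0;
	}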
Can you also include this spinlock-to-rwlock conversion, which can
still be used for non-RAM pages, as a second patch?

Also, this patch doesn't apply to tip/master because of recent rbtree
changes in tip. Can you please send an updated patch?

For the WB case it should be _PGMT_WB, and for the "-1" case it should
be _PGMT_DEFAULT: a free page is marked _PGMT_DEFAULT, and only an
explicit request to mark it WB sets _PGMT_WB.
Other than that it looks good to me.
thanks,
suresh
spinlock -> rwlock conversion? I hope you meant it the other way
round, as Robin said in #2 :)
And yes it should be a separate patch.
Thanks,
tglx
Thomas, no. I meant converting the existing memtype_lock, which is a
spinlock, into an rwlock. Robin had this in the first version of the
patch but dropped it in a later version, as he avoided the lock for
RAM pages.

Just wanted to make sure that we address the perf aspect even for
non-RAM pages, by converting memtype_lock into an rwlock before
someone else reports a similar issue for non-RAM pages.
thanks,
suresh
Care to explain why an rwlock is a good solution and which problem is
solved by converting a spinlock to an rwlock?
Thanks,
tglx
Thomas, this is the original posting from Robin:
http://marc.info/?l=linux-kernel&m=126720623119649&w=2
lookup_memtype() currently takes the memtype spinlock. Changing this to
a read lock will allow multiple readers to go through it in parallel.
thanks,
suresh
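A minimal sketch of the conversion Suresh is suggesting, assuming the
lock still only guards the rbtree used for non-RAM ranges;
memtype_rb_lookup() is a hypothetical stand-in for the actual rbtree
search:

	/* Hypothetical sketch; memtype_rb_lookup() stands in for the
	 * real rbtree search, and error handling is omitted. */
	static DEFINE_RWLOCK(memtype_lock);	/* was: DEFINE_SPINLOCK(memtype_lock) */

	static unsigned long lookup_memtype_nonram(u64 paddr)
	{
		unsigned long rettype;

		/* Multiple CPUs can hold the read side concurrently,
		 * so parallel lookup_memtype() callers no longer
		 * serialize on one spinlock. */
		read_lock(&memtype_lock);
		rettype = memtype_rb_lookup(paddr);
		read_unlock(&memtype_lock);

		return rettype;
	}

	/* reserve_memtype()/free_memtype() writers would take
	 * write_lock(&memtype_lock) and still exclude everyone. */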
This is easier to look at if you ignore the diff and just look at the
fixed up code:
static inline void set_page_memtype(struct page *pg, unsigned long memtype)
{
	unsigned long memtype_flags = _PGMT_DEFAULT;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_WB:
		memtype_flags = _PGMT_WB;
		break;
	}

	do {
		old_flags = pg->flags;
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
}
memtype_flags starts out cleared (_PGMT_DEFAULT) and only changes when
_WC, _UC_MINUS, or _WB is specified.
Did I miss anything there?
Thanks,
Robin
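As a sanity check on that loop: bits outside the two memtype bits must
survive the update, since page->flags carries many unrelated flags. A
userspace sketch with illustrative bit values (not the kernel's):

	#include <assert.h>
	#include <stdio.h>

	#define MT_MASK		0x3UL	/* stand-in for the two PAT bits */
	#define MT_CLEAR_MASK	(~MT_MASK)
	#define MT_WC		0x1UL
	#define MT_UC_MINUS	0x2UL

	/* Unrelated flag bits plus an old memtype. */
	static unsigned long flags = 0xf0UL | MT_WC;

	static void set_memtype_bits(unsigned long memtype_bits)
	{
		unsigned long old_flags;
		unsigned long new_flags;

		do {
			old_flags = flags;
			new_flags = (old_flags & MT_CLEAR_MASK) | memtype_bits;
		} while (__sync_val_compare_and_swap(&flags, old_flags,
						     new_flags) != old_flags);
	}

	int main(void)
	{
		set_memtype_bits(MT_UC_MINUS);
		assert((flags & MT_CLEAR_MASK) == 0xf0UL);	/* unrelated bits intact */
		assert((flags & MT_MASK) == MT_UC_MINUS);	/* memtype replaced */
		printf("ok: flags = %#lx\n", flags);
		return 0;
	}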