From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
H. Peter Anvin doesn't like a huge zero page that stays in memory forever
after the first allocation. Here's an implementation of lockless refcounting
for the huge zero page.
We have two basic primitives: {get,put}_huge_zero_page(). They
manipulate reference counter.
If counter is 0, get_huge_zero_page() allocates a new huge page and
takes two references: one for caller and one for shrinker. We free the
page only in shrinker callback if counter is 1 (only shrinker has the
reference).
put_huge_zero_page() only decrements the counter. The counter is never zero
in put_huge_zero_page() since the shrinker holds one reference.
Freeing huge zero page in shrinker callback helps to avoid frequent
allocate-free.
Refcounting has a cost. On a 4-socket machine I observe a ~1% slowdown on
parallel (40 processes) read page faulting compared to lazy huge page
allocation. I think that's pretty reasonable for a synthetic benchmark.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 52073c2..92a1b66 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,6 +18,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/shrinker.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -47,7 +48,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +160,74 @@ static int start_khugepaged(void)
return err;
}
-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- struct page *hpage;
- unsigned long pfn;
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
- hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!hpage)
- return -ENOMEM;
- pfn = page_to_pfn(hpage);
- if (cmpxchg(&huge_zero_pfn, 0, pfn))
- __free_page(hpage);
- return 0;
+ if (!zero_page)
+ return 0;
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
}
-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
{
- return huge_zero_pfn && pfn == huge_zero_pfn;
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}
-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- return is_huge_zero_pfn(pmd_pfn(pmd));
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
}
+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -698,10 +743,11 @@ static inline struct page *alloc_hugepage(int defrag)
#endif
static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
{
pmd_t entry;
- entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
@@ -724,15 +770,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
- if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
+ unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
spin_lock(&mm->page_table_lock);
- set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -801,7 +851,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
if (is_huge_zero_pmd(pmd)) {
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ unsigned long zero_pfn;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
ret = 0;
goto out_unlock;
}
@@ -908,6 +966,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
hzp_alloc is incremented every time a huge zero page is successfully
allocated. It includes allocations which were dropped due to a
race with another allocation. Note, it doesn't count every mapping
of the huge zero page, only its allocation.
hzp_alloc_failed is incremented if kernel fails to allocate huge zero
page and falls back to using small pages.
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 677a599..ec4e84e 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -197,6 +197,14 @@ thp_split is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.
+hzp_alloc is incremented every time a huge zero page is successfully
+ allocated. It includes allocations which where dropped due
+ race with other allocation. Note, it doesn't count every map
+ of the huge zero page, only its allocation.
+
+hzp_alloc_failed is incremented if kernel fails to allocate huge zero
+ page and falls back to using small pages.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in /proc/vmstat to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d31145..d7156fb 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
THP_SPLIT,
+ HZP_ALLOC,
+ HZP_ALLOC_FAILED,
#endif
NR_VM_EVENT_ITEMS
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92a1b66..492658a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,8 +183,11 @@ retry:
From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
On write access to the huge zero page we allocate a new huge page and clear it.
If that fails with ENOMEM, we fall back gracefully: we create a new pmd table
and set the pte covering the fault address to a newly allocated normal (4k)
page. All other ptes in the pmd are set to the normal zero page.
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ extern unsigned long zero_pfn;
+ return zero_pfn;
+}
+#endif
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0d903bf..d767a7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -824,6 +824,88 @@ out:
return ret;
}
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+}
+
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -930,19 +1012,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
-- 1.7.7.6
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2..677a599 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -276,7 +276,7 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==
Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
by just grepping for "pmd_offset" and adding split_huge_page_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +299,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;
From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
Hi,
Andrew, here's updated huge zero page patchset.
Please consider applying.
=================
During testing I noticed big (up to 2.5 times) memory consumption overhead
on some workloads (e.g. ft.A from NPB) if THP is enabled.
The main reason for that big difference is lacking zero page in THP case.
We have to allocate a real page on read page fault.
A program to demonstrate the issue:
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#define MB 1024*1024
int main(int argc, char **argv)
{
char *p;
int i;
posix_memalign((void **)&p, 2 * MB, 200 * MB);
for (i = 0; i < 200 * MB; i+= 4096)
assert(p[i] == 0);
pause();
return 0;
}
With thp-never RSS is about 400k, but with thp-always it's 200M.
After the patchset thp-always RSS is 400k too.
Design overview.
Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled with
zeros. The way we allocate it changes over the course of the patchset:
- [01/10] simplest way: hzp allocated on boot time in hugepage_init();
- [09/10] lazy allocation on first use;
- [10/10] lockless refcounting + shrinker-reclaimable hzp;
We setup it in do_huge_pmd_anonymous_page() if area around fault address
is suitable for THP and we've got read page fault.
If we fail to set up hzp (ENOMEM) we fall back to handle_pte_fault() as we
normally do in THP.
On wp fault to hzp we allocate real memory for the huge page and clear it.
If that fails with ENOMEM, we fall back gracefully: we create a new pmd table
and set the pte covering the fault address to a newly allocated normal (4k)
page. All other ptes in the pmd are set to the normal zero page.
We cannot split hzp (and it's a bug if we try), but we can split the pmd
which points to it. On splitting the pmd we create a table with all ptes
set to the normal zero page.
Patchset organized in bisect-friendly way:
Patches 01-07: prepare all code paths for hzp
Patch 08: all code paths are covered: safe to setup hzp
Patch 09: lazy allocation
Patch 10: lockless refcounting for hzp
v5:
- implement HZP_ALLOC and HZP_ALLOC_FAILED events;
v4:
- Rebase to v3.7-rc1;
- Update commit message;
v3:
- fix potential deadlock in refcounting code on preemptive kernel.
- do not mark huge zero page as movable.
- fix typo in comment.
- Reviewed-by tag from Andrea Arcangeli.
v2:
- Avoid find_vma() if we've already had vma on stack.
Suggested by Andrea Arcangeli.
- Implement refcounting for huge zero page.
At hpa's request I've tried an alternative approach to the hzp implementation
(see the Virtual huge zero page patchset): a pmd table with all entries set to
the zero page. This way should be more cache friendly, but it increases TLB
pressure.
The problem with virtual huge zero page: it requires per-arch enabling.
We need a way to mark that pmd table has all ptes set to zero page.
Some numbers to compare two implementations (on 4s Westmere-EX):
Microbenchmark1
==============
test:
posix_memalign((void **)&p, 2 * MB, 8 * GB);
for (i = 0; i < 100; i++) {
assert(memcmp(p, p + 4*GB, 4*GB) == 0);
asm volatile ("": : :"memory");
}
hzp:
Performance counter stats for './test_memcmp' (5 runs):
On Wed, 7 Nov 2012 17:00:52 +0200
"Kirill A. Shutemov" <kirill.shute...@linux.intel.com> wrote:
> Andrew, here's updated huge zero page patchset.
There is still a distinct lack of reviewed-by's and acked-by's on this
patchset.
On 13 Sep, Andrea did indicate that he "reviewed the whole patchset and
it looks fine to me". But that information failed to make it into the
changelogs, which is bad.
I grabbed the patchset. I might hold it over until 3.9 depending on
additional review/test feedback and upon whether Andrea can be
persuaded to take another look at it all.
I'm still a bit concerned over the possibility that some workloads will
cause a high-frequency free/alloc/memset cycle on that huge zero page. We'll see how it goes...
For this reason and for general ease-of-testing: can and should we add
a knob which will enable users to disable the feature at runtime? That
way if it causes problems or if we suspect it's causing problems, we
can easily verify the theory and offer users a temporary fix.
Such a knob could be a boot-time option, but a post-boot /proc thing
would be much nicer.
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 40f17c3..e5ce979 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -47,6 +47,7 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
> /* during fragmentation poll the hugepage allocator once every minute */
> static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
> static struct task_struct *khugepaged_thread __read_mostly;
> +static unsigned long huge_zero_pfn __read_mostly;
> static DEFINE_MUTEX(khugepaged_mutex);
> static DEFINE_SPINLOCK(khugepaged_mm_lock);
> static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
> @@ -159,6 +160,29 @@ static int start_khugepaged(void)
> return err;
> }
> +static int init_huge_zero_page(void)
Could be __init, but this gets switched over to lazy allocation later in the series so probably not worth it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
> We don't have a real page to zap in huge zero page case. Let's just
> clear pmd and remove it from tlb.
s/real/mapped/
> Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Acked-by: David Rientjes <rient...@google.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
You said in the introduction message in this series that you still allow splitting of the pmd, so why no check for pmd_trans_splitting() before this?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
> +#ifndef my_zero_pfn
> +static inline unsigned long my_zero_pfn(unsigned long addr)
> +{
> + extern unsigned long zero_pfn;
I don't think you should be declaring this inside an inlined function, you probably should be protecting the declarations of the variable and the function instead. Perhaps by CONFIG_MMU?
> + return zero_pfn;
> +}
> +#endif
> +
> /*
> * Multiple processes may "see" the same page. E.g. for untouched
> * mappings of /dev/null, all processes see the same page full of
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0d903bf..d767a7c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -824,6 +824,88 @@ out:
> return ret;
> }
> +/* no "address" argument so destroys page coloring of some arch */
> +pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
> +{
Umm, this is a copy and paste of pgtable_trans_huge_withdraw() from the generic page table handling. Why can't you reuse that and support (and/or modify) the s390 and sparc code?
This whole function is extremely similar to the implementation of do_huge_pmd_wp_page_fallback(), there really is no way to fold the two?
Typically in cases like this it's helpful to split out different logical segments of a function into smaller functions that would handle both
page and !page accordingly.
This could all use a minor restructuring to make it much cleaner, perhaps by extracting the page_mapcount(page) == 1 case to be a separate function that deals with non-copying writes?
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index d767a7c..05490b3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> pmd_t entry;
> entry = pmdp_get_and_clear(mm, addr, pmd);
> entry = pmd_modify(entry, newprot);
> + if (is_huge_zero_pmd(entry))
> + entry = pmd_wrprotect(entry);
> set_pmd_at(mm, addr, pmd, entry);
> spin_unlock(&vma->vm_mm->page_table_lock);
> ret = 1;
Nack, this should be handled in pmd_modify().
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
> I'm still a bit concerned over the possibility that some workloads will
> cause a high-frequency free/alloc/memset cycle on that huge zero page.
> We'll see how it goes...
That is easy enough to fix - we can delay the freeing by a random time or
until memory pressure is applied.
This entire function duplicates other code in mm/huge_memory.c, which gives even more incentive to break do_huge_pmd_wp_zero_page_fallback() into logical helper functions and reuse them for both page and !page.
Duplicating all this code throughout the thp code just becomes a maintenance nightmare down the road.
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
> index f734bb2..677a599 100644
> --- a/Documentation/vm/transhuge.txt
> +++ b/Documentation/vm/transhuge.txt
> @@ -276,7 +276,7 @@ unaffected. libhugetlbfs will also work fine as usual.
> == Graceful fallback ==
> Code walking pagetables but unware about huge pmds can simply call
> -split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
> +split_huge_page_pmd(vma, pmd, addr) where the pmd is the one returned by
> pmd_offset. It's trivial to make the code transparent hugepage aware
> by just grepping for "pmd_offset" and adding split_huge_page_pmd where
> missing after pmd_offset returns the pmd. Thanks to the graceful
> @@ -299,7 +299,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
> return NULL;
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
> Instead of allocating huge zero page on hugepage_init() we can postpone it
> until first huge zero page map. It saves memory if THP is not in use.
Is it worth the branch on every non-write pagefault after that? The unlikely() is not going to help on x86. If thp is enabled in your .config (which isn't the default), then I think it's better to just allocate the zero huge page once and avoid any branches after that to lazily allocate it. (Or do it only when thp is set to "madvise" or "always" if booting with transparent_hugepage=never.)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Alan Cox <a...@lxorguk.ukuu.org.uk> wrote:
> > I'm still a bit concerned over the possibility that some workloads will
> > cause a high-frequency free/alloc/memset cycle on that huge zero page.
> > We'll see how it goes...
> That is easy enough to fix - we can delay the freeing by a random time or
> until memory pressure is applied.
The current code does the latter, by freeing the page via a
"slab"-shrinker callback.
But I do suspect that with the right combination of use/unuse and
memory pressure, we could still get into the high-frequency scenario.
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
> H. Peter Anvin doesn't like huge zero page which sticks in memory forever
> after the first allocation. Here's implementation of lockless refcounting
> for huge zero page.
> We have two basic primitives: {get,put}_huge_zero_page(). They
> manipulate reference counter.
> If counter is 0, get_huge_zero_page() allocates a new huge page and
> takes two references: one for caller and one for shrinker. We free the
> page only in shrinker callback if counter is 1 (only shrinker has the
> reference).
> put_huge_zero_page() only decrements counter. Counter is never zero
> in put_huge_zero_page() since shrinker holds on reference.
> Freeing huge zero page in shrinker callback helps to avoid frequent
> allocate-free.
> Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
> parallel (40 processes) read page faulting comparing to lazy huge page
> allocation. I think it's pretty reasonable for synthetic benchmark.
Eek, it is disappointing that we need to check a refcount before referencing the zero huge page, and it obviously shows in your benchmark (I consider 1% to be significant, given that the alternative costs 2MB of memory on a system where thp is enabled). I think it would be much better to simply allocate and reference the zero huge page locklessly when thp is enabled to be either "madvise" or "always", i.e. allocate it when enabled.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/