Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH v5 10/11] thp: implement refcounting for huge zero page

74 views
Skip to first unread message

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:01 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

H. Peter Anvin doesn't like huge zero page which sticks in memory forever
after the first allocation. Here's implementation of lockless refcounting
for huge zero page.

We have two basic primitives: {get,put}_huge_zero_page(). They
manipulate reference counter.

If counter is 0, get_huge_zero_page() allocates a new huge page and
takes two references: one for caller and one for shrinker. We free the
page only in shrinker callback if counter is 1 (only shrinker has the
reference).

put_huge_zero_page() only decrements counter. Counter is never zero
in put_huge_zero_page() since shrinker holds on reference.

Freeing huge zero page in shrinker callback helps to avoid frequent
allocate-free.

Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
parallel (40 processes) read page faulting comparing to lazy huge page
allocation. I think it's pretty reasonable for synthetic benchmark.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 111 ++++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 52073c2..92a1b66 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,6 +18,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/shrinker.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -47,7 +48,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +160,74 @@ static int start_khugepaged(void)
return err;
}

-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- struct page *hpage;
- unsigned long pfn;
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);

- hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!hpage)
- return -ENOMEM;
- pfn = page_to_pfn(hpage);
- if (cmpxchg(&huge_zero_pfn, 0, pfn))
- __free_page(hpage);
- return 0;
+ if (!zero_page)
+ return 0;
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
}

-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
{
- return huge_zero_pfn && pfn == huge_zero_pfn;
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- return is_huge_zero_pfn(pmd_pfn(pmd));
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
}

+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +619,8 @@ static int __init hugepage_init(void)
goto out;
}

+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -698,10 +743,11 @@ static inline struct page *alloc_hugepage(int defrag)
#endif

static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
{
pmd_t entry;
- entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
@@ -724,15 +770,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
- if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
+ unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
spin_lock(&mm->page_table_lock);
- set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -801,7 +851,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
if (is_huge_zero_pmd(pmd)) {
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ unsigned long zero_pfn;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
ret = 0;
goto out_unlock;
}
@@ -908,6 +966,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();

mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

@@ -1109,8 +1168,10 @@ alloc:
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ }
if (page) {
VM_BUG_ON(!PageHead(page));
page_remove_rmap(page);
@@ -1188,6 +1249,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
@@ -2544,6 +2606,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(vma->vm_mm, pmd, pgtable);
+ put_huge_zero_page();
}

void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
--
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:01 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We don't have a real page to zap in huge zero page case. Let's just
clear pmd and remove it from tlb.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 21 +++++++++++++--------
1 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e5ce979..ff834ea 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1058,15 +1058,20 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
pgtable = pgtable_trans_huge_withdraw(tlb->mm);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
- page = pmd_page(orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
pte_free(tlb->mm, pgtable);
ret = 1;

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We want to get page fault on write attempt to huge zero page, so let's
keep it write-protected.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d767a7c..05490b3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ if (is_huge_zero_pmd(entry))
+ entry = pmd_wrprotect(entry);
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

hzp_alloc is incremented every time a huge zero page is successfully
allocated. It includes allocations which where dropped due
race with other allocation. Note, it doesn't count every map
of the huge zero page, only its allocation.

hzp_alloc_failed is incremented if kernel fails to allocate huge zero
page and falls back to using small pages.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
Documentation/vm/transhuge.txt | 8 ++++++++
include/linux/vm_event_item.h | 2 ++
mm/huge_memory.c | 5 ++++-
mm/vmstat.c | 2 ++
4 files changed, 16 insertions(+), 1 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 677a599..ec4e84e 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -197,6 +197,14 @@ thp_split is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.

+hzp_alloc is incremented every time a huge zero page is successfully
+ allocated. It includes allocations which where dropped due
+ race with other allocation. Note, it doesn't count every map
+ of the huge zero page, only its allocation.
+
+hzp_alloc_failed is incremented if kernel fails to allocate huge zero
+ page and falls back to using small pages.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in /proc/vmstat to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d31145..d7156fb 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
THP_SPLIT,
+ HZP_ALLOC,
+ HZP_ALLOC_FAILED,
#endif
NR_VM_EVENT_ITEMS
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 92a1b66..492658a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,8 +183,11 @@ retry:

zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!zero_page)
+ if (!zero_page) {
+ count_vm_event(HZP_ALLOC_FAILED);
return 0;
+ }
+ count_vm_event(HZP_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
preempt_enable();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c737057..cb8901c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -801,6 +801,8 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
+ "hzp_alloc",
+ "hzp_alloc_failed",
#endif

#endif /* CONFIG_VM_EVENTS_COUNTERS */

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

All code paths seems covered. Now we can map huge zero page on read page
fault.

We setup it in do_huge_pmd_anonymous_page() if area around fault address
is suitable for THP and we've got read page fault.

If we fail to setup huge zero page (ENOMEM) we fallback to
handle_pte_fault() as we normally do in THP.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f36bc7d..41f05f1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -726,6 +726,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE)) {
+ pgtable_t pgtable;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ goto out;
+ spin_lock(&mm->page_table_lock);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+ }
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We can't split huge zero page itself (and it's bug if we try), but we
can split the pmd which points to it.

On splitting the pmd we create a table with all ptes set to normal zero
page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 47 ++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 90e651c..f36bc7d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1611,6 +1611,7 @@ int split_huge_page(struct page *page)
struct anon_vma *anon_vma;
int ret = 1;

+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
@@ -2509,23 +2510,63 @@ static int khugepaged(void *none)
return 0;
}

+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(vma->vm_mm);
+ pmd_populate(vma->vm_mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(vma->vm_mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(vma->vm_mm, pmd, pgtable);
+}
+
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */

BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);

- spin_lock(&vma->vm_mm->page_table_lock);
+ mmun_start = haddr;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&vma->vm_mm->page_table_lock);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

split_huge_page(page);

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

It's easy to copy huge zero page. Just set destination pmd to huge zero
page.

It's safe to copy huge zero page since we have none yet :-p

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 17 +++++++++++++++++
1 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ff834ea..0d903bf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -701,6 +701,18 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif

+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+{
+ pmd_t entry;
+ entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+}
+
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
@@ -778,6 +790,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
goto out_unlock;
}
+ if (is_huge_zero_pmd(pmd)) {
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ ret = 0;
+ goto out_unlock;
+ }
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(&src_mm->page_table_lock);

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:03 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Instead of allocating huge zero page on hugepage_init() we can postpone it
until first huge zero page map. It saves memory if THP is not in use.

cmpxchg() is used to avoid race on huge_zero_pfn initialization.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 20 ++++++++++----------
1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 41f05f1..52073c2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -160,22 +160,24 @@ static int start_khugepaged(void)
return err;
}

-static int init_huge_zero_page(void)
+static int init_huge_zero_pfn(void)
{
struct page *hpage;
+ unsigned long pfn;

hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!hpage)
return -ENOMEM;
-
- huge_zero_pfn = page_to_pfn(hpage);
+ pfn = page_to_pfn(hpage);
+ if (cmpxchg(&huge_zero_pfn, 0, pfn))
+ __free_page(hpage);
return 0;
}

static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- return pfn == huge_zero_pfn;
+ return huge_zero_pfn && pfn == huge_zero_pfn;
}

static inline bool is_huge_zero_pmd(pmd_t pmd)
@@ -564,10 +566,6 @@ static int __init hugepage_init(void)
if (err)
return err;

- err = init_huge_zero_page();
- if (err)
- goto out;
-
err = khugepaged_slab_init();
if (err)
goto out;
@@ -590,8 +588,6 @@ static int __init hugepage_init(void)

return 0;
out:
- if (huge_zero_pfn)
- __free_page(pfn_to_page(huge_zero_pfn));
hugepage_exit_sysfs(hugepage_kobj);
return err;
}
@@ -728,6 +724,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
+ if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:03 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

On write access to huge zero page we alloc a new huge page and clear it.

If ENOMEM, graceful fallback: we create a new pmd table and set pte
around fault address to newly allocated normal (4k) page. All other ptes
in the pmd set to normal zero page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/mm.h | 8 +++
mm/huge_memory.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++------
mm/memory.c | 7 ---
3 files changed, 122 insertions(+), 22 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa06804..fe329da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
}
#endif

+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ extern unsigned long zero_pfn;
+ return zero_pfn;
+}
+#endif
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0d903bf..d767a7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -824,6 +824,88 @@ out:
return ret;
}

+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+}
+
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -930,19 +1012,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */

VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;

page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
- haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -954,7 +1038,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
get_page(page);
spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -964,24 +1048,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,

if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
- if (ret & VM_FAULT_OOM)
- split_huge_page(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, haddr);
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
+ put_page(page);
+ }
goto out;
}
count_vm_event(THP_FAULT_ALLOC);

if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
- split_huge_page(page);
- put_page(page);
+ if (page) {
+ split_huge_page(page);
+ put_page(page);
+ }
ret |= VM_FAULT_OOM;
goto out;
}

- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ if (is_huge_zero_pmd(orig_pmd))
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);

mmun_start = haddr;
@@ -989,7 +1083,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

spin_lock(&mm->page_table_lock);
- put_page(page);
+ if (page)
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
@@ -997,7 +1092,6 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mn;
} else {
pmd_t entry;
- VM_BUG_ON(!PageHead(page));
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
@@ -1005,8 +1099,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- page_remove_rmap(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd))
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ if (page) {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
ret |= VM_FAULT_WRITE;
}
spin_unlock(&mm->page_table_lock);
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba..6edc030 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -724,13 +724,6 @@ static inline int is_zero_pfn(unsigned long pfn)
}
#endif

-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:02 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled
with zeros.

For now let's allocate the page on hugepage_init(). We'll switch to lazy
allocation later.

We are not going to map the huge zero page until we can handle it
properly on all code paths.

is_huge_zero_{pfn,pmd}() functions will be used by following patches to
check whether the pfn/pmd is huge zero page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c3..e5ce979 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -47,6 +47,7 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
+static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -159,6 +160,29 @@ static int start_khugepaged(void)
return err;
}

+static int init_huge_zero_page(void)
+{
+ struct page *hpage;
+
+ hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!hpage)
+ return -ENOMEM;
+
+ huge_zero_pfn = page_to_pfn(hpage);
+ return 0;
+}
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ return pfn == huge_zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
@@ -540,6 +564,10 @@ static int __init hugepage_init(void)
if (err)
return err;

+ err = init_huge_zero_page();
+ if (err)
+ goto out;
+
err = khugepaged_slab_init();
if (err)
goto out;
@@ -562,6 +590,8 @@ static int __init hugepage_init(void)

return 0;
out:
+ if (huge_zero_pfn)
+ __free_page(pfn_to_page(huge_zero_pfn));
hugepage_exit_sysfs(hugepage_kobj);
return err;

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:03 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Pass vma instead of mm and add address parameter.

In most cases we already have vma on the stack. We provides
split_huge_page_pmd_mm() for few cases when we have mm, but not vma.

This change is preparation to huge zero pmd splitting implementation.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
Documentation/vm/transhuge.txt | 4 ++--
arch/x86/kernel/vm86_32.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/huge_mm.h | 14 ++++++++++----
mm/huge_memory.c | 24 +++++++++++++++++++-----
mm/memory.c | 4 ++--
mm/mempolicy.c | 2 +-
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/pagewalk.c | 2 +-
10 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2..677a599 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -276,7 +276,7 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==

Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, pmd, addr) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
by just grepping for "pmd_offset" and adding split_huge_page_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +299,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;

pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(mm, pmd);
++ split_huge_page_pmd(vma, pmd, addr);
if (pmd_none_or_clear_bad(pmd))
return NULL;

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b..1dfe69c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, 0xA0000, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9..291a0d1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;

- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_trans_unstable(pmd))
return 0;

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b31cb7d..856f080 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -91,12 +91,14 @@ extern int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags);
extern int split_huge_page(struct page *page);
-extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
-#define split_huge_page_pmd(__mm, __pmd) \
+extern void __split_huge_page_pmd(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd);
+#define split_huge_page_pmd(__vma, __address, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
if (unlikely(pmd_trans_huge(*____pmd))) \
- __split_huge_page_pmd(__mm, ____pmd); \
+ __split_huge_page_pmd(__vma, __address, \
+ ____pmd); \
} while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
@@ -106,6 +108,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \
} while (0)
+extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd);
#if HPAGE_PMD_ORDER > MAX_ORDER
#error "hugepages can't be allocated by the buddy allocator"
#endif
@@ -173,10 +177,12 @@ static inline int split_huge_page(struct page *page)
{
return 0;
}
-#define split_huge_page_pmd(__mm, __pmd) \
+#define split_huge_page_pmd(__vma, __address, __pmd) \
do { } while (0)
#define wait_split_huge_page(__anon_vma, __pmd) \
do { } while (0)
+#define split_huge_page_pmd_mm(__mm, __address, __pmd) \
+ do { } while (0)
#define compound_trans_head(page) compound_head(page)
static inline int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 05490b3..90e651c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2509,19 +2509,23 @@ static int khugepaged(void *none)
return 0;
}

-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
{
struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;

- spin_lock(&mm->page_table_lock);
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+
+ spin_lock(&vma->vm_mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_mm->page_table_lock);

split_huge_page(page);

@@ -2529,6 +2533,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
BUG_ON(pmd_trans_huge(*pmd));
}

+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
@@ -2553,7 +2567,7 @@ static void split_huge_page_address(struct mm_struct *mm,
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, address, pmd);
}

void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 6edc030..6017e23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1243,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1512,7 +1512,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
}
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
spin_lock(&mm->page_table_lock);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a5..b68061e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a409926..e8c3938 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
else if (change_huge_pmd(vma, pmd, addr, newprot))
continue;
/* fall through */
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d..eabb24d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
need_flush = true;
continue;
} else if (!err) {
- split_huge_page_pmd(vma->vm_mm, old_pmd);
+ split_huge_page_pmd(vma, old_addr, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d0..35aa294 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;

- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);

Kirill A. Shutemov

unread,
Nov 7, 2012, 10:10:03 AM11/7/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Hi,

Andrew, here's updated huge zero page patchset.

Please consider applying.

=================

During testing I noticed big (up to 2.5 times) memory consumption overhead
on some workloads (e.g. ft.A from NPB) if THP is enabled.

The main reason for that big difference is lacking zero page in THP case.
We have to allocate a real page on read page fault.

A program to demonstrate the issue:
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>

#define MB 1024*1024

int main(int argc, char **argv)
{
char *p;
int i;

posix_memalign((void **)&p, 2 * MB, 200 * MB);
for (i = 0; i < 200 * MB; i+= 4096)
assert(p[i] == 0);
pause();
return 0;
}

With thp-never RSS is about 400k, but with thp-always it's 200M.
After the patcheset thp-always RSS is 400k too.

Design overview.

Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled with
zeros. The way how we allocate it changes in the patchset:

- [01/10] simplest way: hzp allocated on boot time in hugepage_init();
- [09/10] lazy allocation on first use;
- [10/10] lockless refcounting + shrinker-reclaimable hzp;

We setup it in do_huge_pmd_anonymous_page() if area around fault address
is suitable for THP and we've got read page fault.
If we fail to setup hzp (ENOMEM) we fallback to handle_pte_fault() as we
normally do in THP.

On wp fault to hzp we allocate real memory for the huge page and clear it.
If ENOMEM, graceful fallback: we create a new pmd table and set pte around
fault address to newly allocated normal (4k) page. All other ptes in the
pmd set to normal zero page.

We cannot split hzp (and it's bug if we try), but we can split the pmd
which points to it. On splitting the pmd we create a table with all ptes
set to normal zero page.

Patchset organized in bisect-friendly way:
Patches 01-07: prepare all code paths for hzp
Patch 08: all code paths are covered: safe to setup hzp
Patch 09: lazy allocation
Patch 10: lockless refcounting for hzp

v5:
- implement HZP_ALLOC and HZP_ALLOC_FAILED events;
v4:
- Rebase to v3.7-rc1;
- Update commit message;
v3:
- fix potential deadlock in refcounting code on preemptive kernel.
- do not mark huge zero page as movable.
- fix typo in comment.
- Reviewed-by tag from Andrea Arcangeli.
v2:
- Avoid find_vma() if we've already had vma on stack.
Suggested by Andrea Arcangeli.
- Implement refcounting for huge zero page.

--------------------------------------------------------------------------

By hpa request I've tried alternative approach for hzp implementation (see
Virtual huge zero page patchset): pmd table with all entries set to zero
page. This way should be more cache friendly, but it increases TLB
pressure.

The problem with virtual huge zero page: it requires per-arch enabling.
We need a way to mark that pmd table has all ptes set to zero page.

Some numbers to compare two implementations (on 4s Westmere-EX):

Mirobenchmark1
==============

test:
posix_memalign((void **)&p, 2 * MB, 8 * GB);
for (i = 0; i < 100; i++) {
assert(memcmp(p, p + 4*GB, 4*GB) == 0);
asm volatile ("": : :"memory");
}

hzp:
Performance counter stats for './test_memcmp' (5 runs):

32356.272845 task-clock # 0.998 CPUs utilized ( +- 0.13% )
40 context-switches # 0.001 K/sec ( +- 0.94% )
0 CPU-migrations # 0.000 K/sec
4,218 page-faults # 0.130 K/sec ( +- 0.00% )
76,712,481,765 cycles # 2.371 GHz ( +- 0.13% ) [83.31%]
36,279,577,636 stalled-cycles-frontend # 47.29% frontend cycles idle ( +- 0.28% ) [83.35%]
1,684,049,110 stalled-cycles-backend # 2.20% backend cycles idle ( +- 2.96% ) [66.67%]
134,355,715,816 instructions # 1.75 insns per cycle
# 0.27 stalled cycles per insn ( +- 0.10% ) [83.35%]
13,526,169,702 branches # 418.039 M/sec ( +- 0.10% ) [83.31%]
1,058,230 branch-misses # 0.01% of all branches ( +- 0.91% ) [83.36%]

32.413866442 seconds time elapsed ( +- 0.13% )

vhzp:
Performance counter stats for './test_memcmp' (5 runs):

30327.183829 task-clock # 0.998 CPUs utilized ( +- 0.13% )
38 context-switches # 0.001 K/sec ( +- 1.53% )
0 CPU-migrations # 0.000 K/sec
4,218 page-faults # 0.139 K/sec ( +- 0.01% )
71,964,773,660 cycles # 2.373 GHz ( +- 0.13% ) [83.35%]
31,191,284,231 stalled-cycles-frontend # 43.34% frontend cycles idle ( +- 0.40% ) [83.32%]
773,484,474 stalled-cycles-backend # 1.07% backend cycles idle ( +- 6.61% ) [66.67%]
134,982,215,437 instructions # 1.88 insns per cycle
# 0.23 stalled cycles per insn ( +- 0.11% ) [83.32%]
13,509,150,683 branches # 445.447 M/sec ( +- 0.11% ) [83.34%]
1,017,667 branch-misses # 0.01% of all branches ( +- 1.07% ) [83.32%]

30.381324695 seconds time elapsed ( +- 0.13% )

Mirobenchmark2
==============

test:
posix_memalign((void **)&p, 2 * MB, 8 * GB);
for (i = 0; i < 1000; i++) {
char *_p = p;
while (_p < p+4*GB) {
assert(*_p == *(_p+4*GB));
_p += 4096;
asm volatile ("": : :"memory");
}
}

hzp:
Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs):

3505.727639 task-clock # 0.998 CPUs utilized ( +- 0.26% )
9 context-switches # 0.003 K/sec ( +- 4.97% )
4,384 page-faults # 0.001 M/sec ( +- 0.00% )
8,318,482,466 cycles # 2.373 GHz ( +- 0.26% ) [33.31%]
5,134,318,786 stalled-cycles-frontend # 61.72% frontend cycles idle ( +- 0.42% ) [33.32%]
2,193,266,208 stalled-cycles-backend # 26.37% backend cycles idle ( +- 5.51% ) [33.33%]
9,494,670,537 instructions # 1.14 insns per cycle
# 0.54 stalled cycles per insn ( +- 0.13% ) [41.68%]
2,108,522,738 branches # 601.451 M/sec ( +- 0.09% ) [41.68%]
158,746 branch-misses # 0.01% of all branches ( +- 1.60% ) [41.71%]
3,168,102,115 L1-dcache-loads
# 903.693 M/sec ( +- 0.11% ) [41.70%]
1,048,710,998 L1-dcache-misses
# 33.10% of all L1-dcache hits ( +- 0.11% ) [41.72%]
1,047,699,685 LLC-load
# 298.854 M/sec ( +- 0.03% ) [33.38%]
2,287 LLC-misses
# 0.00% of all LL-cache hits ( +- 8.27% ) [33.37%]
3,166,187,367 dTLB-loads
# 903.147 M/sec ( +- 0.02% ) [33.35%]
4,266,538 dTLB-misses
# 0.13% of all dTLB cache hits ( +- 0.03% ) [33.33%]

3.513339813 seconds time elapsed ( +- 0.26% )

vhzp:
Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs):

27313.891128 task-clock # 0.998 CPUs utilized ( +- 0.24% )
62 context-switches # 0.002 K/sec ( +- 0.61% )
4,384 page-faults # 0.160 K/sec ( +- 0.01% )
64,747,374,606 cycles # 2.370 GHz ( +- 0.24% ) [33.33%]
61,341,580,278 stalled-cycles-frontend # 94.74% frontend cycles idle ( +- 0.26% ) [33.33%]
56,702,237,511 stalled-cycles-backend # 87.57% backend cycles idle ( +- 0.07% ) [33.33%]
10,033,724,846 instructions # 0.15 insns per cycle
# 6.11 stalled cycles per insn ( +- 0.09% ) [41.65%]
2,190,424,932 branches # 80.195 M/sec ( +- 0.12% ) [41.66%]
1,028,630 branch-misses # 0.05% of all branches ( +- 1.50% ) [41.66%]
3,302,006,540 L1-dcache-loads
# 120.891 M/sec ( +- 0.11% ) [41.68%]
271,374,358 L1-dcache-misses
# 8.22% of all L1-dcache hits ( +- 0.04% ) [41.66%]
20,385,476 LLC-load
# 0.746 M/sec ( +- 1.64% ) [33.34%]
76,754 LLC-misses
# 0.38% of all LL-cache hits ( +- 2.35% ) [33.34%]
3,309,927,290 dTLB-loads
# 121.181 M/sec ( +- 0.03% ) [33.34%]
2,098,967,427 dTLB-misses
# 63.41% of all dTLB cache hits ( +- 0.03% ) [33.34%]

27.364448741 seconds time elapsed ( +- 0.24% )

--------------------------------------------------------------------------

I personally prefer implementation present in this patchset. It doesn't
touch arch-specific code.

Kirill A. Shutemov (11):
thp: huge zero page: basic preparation
thp: zap_huge_pmd(): zap huge zero pmd
thp: copy_huge_pmd(): copy huge zero page
thp: do_huge_pmd_wp_page(): handle huge zero page
thp: change_huge_pmd(): keep huge zero page write-protected
thp: change split_huge_page_pmd() interface
thp: implement splitting pmd for huge zero page
thp: setup huge zero page on non-write page fault
thp: lazy huge zero page allocation
thp: implement refcounting for huge zero page
thp, vmstat: implement HZP_ALLOC and HZP_ALLOC_FAILED events

Documentation/vm/transhuge.txt | 12 ++-
arch/x86/kernel/vm86_32.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/huge_mm.h | 14 ++-
include/linux/mm.h | 8 +
include/linux/vm_event_item.h | 2 +
mm/huge_memory.c | 334 +++++++++++++++++++++++++++++++++++++---
mm/memory.c | 11 +-
mm/mempolicy.c | 2 +-
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/pagewalk.c | 2 +-
mm/vmstat.c | 2 +
13 files changed, 349 insertions(+), 46 deletions(-)

Andrew Morton

unread,
Nov 14, 2012, 4:40:02 PM11/14/12
to
On Wed, 7 Nov 2012 17:00:52 +0200
"Kirill A. Shutemov" <kirill....@linux.intel.com> wrote:

> Andrew, here's updated huge zero page patchset.

There is still a distinct lack of reviewed-by's and acked-by's on this
patchset.

On 13 Sep, Andrea did indicate that he "reviewed the whole patchset and
it looks fine to me". But that information failed to make it into the
changelogs, which is bad.

I grabbed the patchset. I might hold it over until 3.9 depending on
additional review/test feedback and upon whether Andrea can be
persuaded to take another look at it all.

I'm still a bit concerned over the possibility that some workloads will
cause a high-frequency free/alloc/memset cycle on that huge zero page.
We'll see how it goes...

For this reason and for general ease-of-testing: can and should we add
a knob which will enable users to disable the feature at runtime? That
way if it causes problems or if we suspect it's causing problems, we
can easily verify the theory and offer users a temporary fix.

Such a knob could be a boot-time option, but a post-boot /proc thing
would be much nicer.

David Rientjes

unread,
Nov 14, 2012, 5:10:03 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled
> with zeros.
>
> For now let's allocate the page on hugepage_init(). We'll switch to lazy
> allocation later.
>
> We are not going to map the huge zero page until we can handle it
> properly on all code paths.
>
> is_huge_zero_{pfn,pmd}() functions will be used by following patches to
> check whether the pfn/pmd is huge zero page.
>
> Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>

Acked-by: David Rientjes <rien...@google.com>

> ---
> mm/huge_memory.c | 30 ++++++++++++++++++++++++++++++
> 1 files changed, 30 insertions(+), 0 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 40f17c3..e5ce979 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -47,6 +47,7 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
> /* during fragmentation poll the hugepage allocator once every minute */
> static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
> static struct task_struct *khugepaged_thread __read_mostly;
> +static unsigned long huge_zero_pfn __read_mostly;
> static DEFINE_MUTEX(khugepaged_mutex);
> static DEFINE_SPINLOCK(khugepaged_mm_lock);
> static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
> @@ -159,6 +160,29 @@ static int start_khugepaged(void)
> return err;
> }
>
> +static int init_huge_zero_page(void)

Could be __init, but this gets switched over to lazy allocation later in
the series so probably not worth it.

David Rientjes

unread,
Nov 14, 2012, 5:20:02 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> We don't have a real page to zap in huge zero page case. Let's just
> clear pmd and remove it from tlb.
>

s/real/mapped/

> Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>

Acked-by: David Rientjes <rien...@google.com>

David Rientjes

unread,
Nov 14, 2012, 5:40:02 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

You said in the introduction message in this series that you still allow
splitting of the pmd, so why no check for pmd_trans_splitting() before
this?

David Rientjes

unread,
Nov 14, 2012, 6:10:03 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa06804..fe329da 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> }
> #endif
>
> +#ifndef my_zero_pfn
> +static inline unsigned long my_zero_pfn(unsigned long addr)
> +{
> + extern unsigned long zero_pfn;

I don't think you should be declaring this inside an inlined function, you
probably should be protecting the declarations of the variable and the
function instead. Perhaps by CONFIG_MMU?

> + return zero_pfn;
> +}
> +#endif
> +
> /*
> * Multiple processes may "see" the same page. E.g. for untouched
> * mappings of /dev/null, all processes see the same page full of
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0d903bf..d767a7c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -824,6 +824,88 @@ out:
> return ret;
> }
>
> +/* no "address" argument so destroys page coloring of some arch */
> +pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
> +{

Umm, this is a copy and paste of pgtable_trans_huge_withdraw() from the
generic page table handling. Why can't you reuse that and support (and/or
modify) the s390 and sparc code?

> + pgtable_t pgtable;
> +
> + assert_spin_locked(&mm->page_table_lock);
> +
> + /* FIFO */
> + pgtable = mm->pmd_huge_pte;
> + if (list_empty(&pgtable->lru))
> + mm->pmd_huge_pte = NULL;
> + else {
> + mm->pmd_huge_pte = list_entry(pgtable->lru.next,
> + struct page, lru);
> + list_del(&pgtable->lru);
> + }
> + return pgtable;
> +}
> +
> +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
> + struct vm_area_struct *vma, unsigned long address,
> + pmd_t *pmd, unsigned long haddr)

This whole function is extremely similar to the implementation of
do_huge_pmd_wp_page_fallback(), there really is no way to fold the two?
Typically in cases like this it's helpful to split out different logical
segments of a function into smaller functions that would handle both
page and !page accordingly.
This could all use a minor restructuring to make it much more cleaner,
perhaps by extracting the page_mapcount(page) == 1 case to be a separate
function that deals with non-copying writes?
Couldn't this be an "else" instead?

> + VM_BUG_ON(!PageHead(page));
> + page_remove_rmap(page);
> + put_page(page);
> + }
> ret |= VM_FAULT_WRITE;
> }
> spin_unlock(&mm->page_table_lock);

David Rientjes

unread,
Nov 14, 2012, 6:20:01 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index d767a7c..05490b3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> pmd_t entry;
> entry = pmdp_get_and_clear(mm, addr, pmd);
> entry = pmd_modify(entry, newprot);
> + if (is_huge_zero_pmd(entry))
> + entry = pmd_wrprotect(entry);
> set_pmd_at(mm, addr, pmd, entry);
> spin_unlock(&vma->vm_mm->page_table_lock);
> ret = 1;

Nack, this should be handled in pmd_modify().

Alan Cox

unread,
Nov 14, 2012, 6:20:01 PM11/14/12
to
> I'm still a bit concerned over the possibility that some workloads will
> cause a high-frequency free/alloc/memset cycle on that huge zero page.
> We'll see how it goes...

That is easy enough to fix - we can delay the freeing by a random time or
until memory pressure is applied.

Alan

David Rientjes

unread,
Nov 14, 2012, 6:30:02 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 90e651c..f36bc7d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1611,6 +1611,7 @@ int split_huge_page(struct page *page)
> struct anon_vma *anon_vma;
> int ret = 1;
>
> + BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
> BUG_ON(!PageAnon(page));
> anon_vma = page_lock_anon_vma(page);
> if (!anon_vma)
> @@ -2509,23 +2510,63 @@ static int khugepaged(void *none)
> return 0;
> }
>
> +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
> + unsigned long haddr, pmd_t *pmd)
> +{

This entire function duplicates other code in mm/huge_memory.c which gives
even more incentive into breaking do_huge_pmd_wp_zero_page_fallback() into
logical helper functions and reusing them for both page and !page.
Duplicating all this code throughout the thp code just becomes a
maintenance nightmare down the road.
address or haddr?

> + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
> + spin_lock(&mm->page_table_lock);
> if (unlikely(!pmd_trans_huge(*pmd))) {
> - spin_unlock(&vma->vm_mm->page_table_lock);
> + spin_unlock(&mm->page_table_lock);
> + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> + return;
> + }
> + if (is_huge_zero_pmd(*pmd)) {
> + __split_huge_zero_page_pmd(vma, haddr, pmd);
> + spin_unlock(&mm->page_table_lock);
> + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
> return;
> }
> page = pmd_page(*pmd);
> VM_BUG_ON(!page_count(page));
> get_page(page);
> - spin_unlock(&vma->vm_mm->page_table_lock);
> + spin_unlock(&mm->page_table_lock);
> + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>
> split_huge_page(page);
>
--

David Rientjes

unread,
Nov 14, 2012, 6:30:01 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

Why not be consistent and make this split_huge_page_pmd_mm(mm, pmd, addr)?

> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 90c63f9..291a0d1 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
> spinlock_t *ptl;
> struct page *page;
>
> - split_huge_page_pmd(walk->mm, pmd);
> + split_huge_page_pmd(vma, addr, pmd);

Ah, it's because the change to the documentation is wrong: the format is
actually split_huge_page_pmd(vma, addr, pmd).
Just do

struct mm_struct *mm = vma->vm_mm;

here and it makes everything else simpler.

> - spin_lock(&mm->page_table_lock);
> + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
> +
> + spin_lock(&vma->vm_mm->page_table_lock);
> if (unlikely(!pmd_trans_huge(*pmd))) {
> - spin_unlock(&mm->page_table_lock);
> + spin_unlock(&vma->vm_mm->page_table_lock);
> return;
> }
> page = pmd_page(*pmd);
> VM_BUG_ON(!page_count(page));
> get_page(page);
> - spin_unlock(&mm->page_table_lock);
> + spin_unlock(&vma->vm_mm->page_table_lock);
>
> split_huge_page(page);
>

David Rientjes

unread,
Nov 14, 2012, 6:40:01 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index f36bc7d..41f05f1 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -726,6 +726,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> return VM_FAULT_OOM;
> if (unlikely(khugepaged_enter(vma)))
> return VM_FAULT_OOM;
> + if (!(flags & FAULT_FLAG_WRITE)) {
> + pgtable_t pgtable;
> + pgtable = pte_alloc_one(mm, haddr);
> + if (unlikely(!pgtable))
> + goto out;

No use in retrying, just return VM_FAULT_OOM.

> + spin_lock(&mm->page_table_lock);
> + set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
> + spin_unlock(&mm->page_table_lock);
> + return 0;
> + }
> page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
> vma, haddr, numa_node_id(), 0);
> if (unlikely(!page)) {
--

David Rientjes

unread,
Nov 14, 2012, 6:40:02 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> Instead of allocating huge zero page on hugepage_init() we can postpone it
> until first huge zero page map. It saves memory if THP is not in use.
>

Is it worth the branch on every non-write pagefault after that? The
unlikely() is not going to help on x86. If thp is enabled in your
.config (which isn't the default), then I think it's better to just
allocate the zero huge page once and avoid any branches after that to
lazily allocate it. (Or do it only when thp is set to "madvise" or
"always" if booting with transparent_hugepage=never.)

Andrew Morton

unread,
Nov 14, 2012, 6:40:02 PM11/14/12
to
On Wed, 14 Nov 2012 23:20:13 +0000
Alan Cox <al...@lxorguk.ukuu.org.uk> wrote:

> > I'm still a bit concerned over the possibility that some workloads will
> > cause a high-frequency free/alloc/memset cycle on that huge zero page.
> > We'll see how it goes...
>
> That is easy enough to fix - we can delay the freeing by a random time or
> until memory pressure is applied.
>

The current code does the latter, by freeing the page via a
"slab"-shrinker callback.

But I do suspect that with the right combination of use/unuse and
memory pressure, we could still get into the high-frequency scenario.

David Rientjes

unread,
Nov 14, 2012, 6:50:01 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> H. Peter Anvin doesn't like huge zero page which sticks in memory forever
> after the first allocation. Here's implementation of lockless refcounting
> for huge zero page.
>
> We have two basic primitives: {get,put}_huge_zero_page(). They
> manipulate reference counter.
>
> If counter is 0, get_huge_zero_page() allocates a new huge page and
> takes two references: one for caller and one for shrinker. We free the
> page only in shrinker callback if counter is 1 (only shrinker has the
> reference).
>
> put_huge_zero_page() only decrements counter. Counter is never zero
> in put_huge_zero_page() since shrinker holds on reference.
>
> Freeing huge zero page in shrinker callback helps to avoid frequent
> allocate-free.
>
> Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
> parallel (40 processes) read page faulting comparing to lazy huge page
> allocation. I think it's pretty reasonable for synthetic benchmark.
>

Eek, this is disappointing that we need to check a refcount before
referencing the zero huge page and it obviously shows in your benchmark
(which I consider 1% to be significant given the alternative is 2MB of
memory for a system where thp was enabled to be on). I think it would be
much better to simply allocate and reference the zero huge page locklessly
when thp is enabled to be either "madvise" or "always", i.e. allocate it
when enabled.

David Rientjes

unread,
Nov 14, 2012, 6:50:01 PM11/14/12
to
On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:

> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> hzp_alloc is incremented every time a huge zero page is successfully
> allocated. It includes allocations which where dropped due
> race with other allocation. Note, it doesn't count every map
> of the huge zero page, only its allocation.
>
> hzp_alloc_failed is incremented if kernel fails to allocate huge zero
> page and falls back to using small pages.
>

Nobody is going to know what hzp_ is, sorry. It's better to be more
verbose and name them what they actually are: THP_ZERO_PAGE_ALLOC and
THP_ZERO_PAGE_ALLOC_FAILED. But this would assume we want to lazily
allocate them, which I disagree with hpa about.

H. Peter Anvin

unread,
Nov 14, 2012, 7:00:01 PM11/14/12
to
On 11/14/2012 03:32 PM, Andrew Morton wrote:
>
> The current code does the latter, by freeing the page via a
> "slab"-shrinker callback.
>
> But I do suspect that with the right combination of use/unuse and
> memory pressure, we could still get into the high-frequency scenario.
>

There probably isn't any mechanism that doesn't end up with poor results
in some corner case... just like the vhzp variant.

-hpa

David Rientjes

unread,
Nov 14, 2012, 7:30:03 PM11/14/12
to
On Wed, 14 Nov 2012, Andrew Morton wrote:

> For this reason and for general ease-of-testing: can and should we add
> a knob which will enable users to disable the feature at runtime? That
> way if it causes problems or if we suspect it's causing problems, we
> can easily verify the theory and offer users a temporary fix.
>

I think it would be best to add a tunable under
/sys/kernel/mm/transparent_hugepage and enable it by default whenever
/sys/kernel/mm/transparent_hugepage/enabled is "always" or "madvise" and
allocate the huge zero page under such circumstances. Then we can free it
if disabled (or if enabled is set to "never") and avoid all the
refcounting and lazy allocation that causes a regression on Kirill's
benchmark.

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:02 AM11/15/12
to
On Wed, Nov 14, 2012 at 01:33:42PM -0800, Andrew Morton wrote:
> On Wed, 7 Nov 2012 17:00:52 +0200
> "Kirill A. Shutemov" <kirill....@linux.intel.com> wrote:
>
> > Andrew, here's updated huge zero page patchset.
>
> There is still a distinct lack of reviewed-by's and acked-by's on this
> patchset.
>
> On 13 Sep, Andrea did indicate that he "reviewed the whole patchset and
> it looks fine to me". But that information failed to make it into the
> changelogs, which is bad.

As I said before, I had to drop Andrea's reviewed-by on rebase to
v3.7-rc1. I had to solve few not-that-trivial conflicts and I was not sure
if the reviewed-by is still applicable.

> I grabbed the patchset. I might hold it over until 3.9 depending on
> additional review/test feedback and upon whether Andrea can be
> persuaded to take another look at it all.
>
> I'm still a bit concerned over the possibility that some workloads will
> cause a high-frequency free/alloc/memset cycle on that huge zero page.
> We'll see how it goes...
>
> For this reason and for general ease-of-testing: can and should we add
> a knob which will enable users to disable the feature at runtime? That
> way if it causes problems or if we suspect it's causing problems, we
> can easily verify the theory and offer users a temporary fix.
>
> Such a knob could be a boot-time option, but a post-boot /proc thing
> would be much nicer.

Okay, I'll add sysfs knob.

BTW, we already have build time knob: just revert last two patches in the
series. It will bring lazy allocation instead of refcounting.

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 3:10:02 AM11/15/12
to
pmd_trans_splitting() returns true only for pmd which points to a page
under spliiting. It never happens with huge zero page.
We only split a pmd to a page table without touching the page.
mm->page_table_lock is enough to protect against that.

--
Kirill A. Shutemov
signature.asc

David Rientjes

unread,
Nov 15, 2012, 3:20:01 AM11/15/12
to
On Thu, 15 Nov 2012, Kirill A. Shutemov wrote:

> > > @@ -778,6 +790,11 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > > pte_free(dst_mm, pgtable);
> > > goto out_unlock;
> > > }
> > > + if (is_huge_zero_pmd(pmd)) {
> > > + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
> > > + ret = 0;
> > > + goto out_unlock;
> > > + }
> >
> > You said in the introduction message in this series that you still allow
> > splitting of the pmd, so why no check for pmd_trans_splitting() before
> > this?
>
> pmd_trans_splitting() returns true only for pmd which points to a page
> under spliiting. It never happens with huge zero page.
> We only split a pmd to a page table without touching the page.
> mm->page_table_lock is enough to protect against that.
>

Comment in the code?

Kirill A. Shutemov

unread,
Nov 15, 2012, 3:30:03 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:08:45PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa06804..fe329da 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> > }
> > #endif
> >
> > +#ifndef my_zero_pfn
> > +static inline unsigned long my_zero_pfn(unsigned long addr)
> > +{
> > + extern unsigned long zero_pfn;
>
> I don't think you should be declaring this inside an inlined function, you
> probably should be protecting the declarations of the variable and the
> function instead. Perhaps by CONFIG_MMU?

mips and s390 use declaration inside inline function to implement
is_zero_pfn(). I wanted to be consistent with that.

I have patch to cleanup zero page helpers a bit. It's on top of
this patchset.

http://article.gmane.org/gmane.linux.kernel.mm/87387

> > + return zero_pfn;
> > +}
> > +#endif
> > +
> > /*
> > * Multiple processes may "see" the same page. E.g. for untouched
> > * mappings of /dev/null, all processes see the same page full of
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 0d903bf..d767a7c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -824,6 +824,88 @@ out:
> > return ret;
> > }
> >
> > +/* no "address" argument so destroys page coloring of some arch */
> > +pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
> > +{
>
> Umm, this is a copy and paste of pgtable_trans_huge_withdraw() from the
> generic page table handling. Why can't you reuse that and support (and/or
> modify) the s390 and sparc code?

My bad. It's mistake on conflict solving. I'll fix that.

> > + pgtable_t pgtable;
> > +
> > + assert_spin_locked(&mm->page_table_lock);
> > +
> > + /* FIFO */
> > + pgtable = mm->pmd_huge_pte;
> > + if (list_empty(&pgtable->lru))
> > + mm->pmd_huge_pte = NULL;
> > + else {
> > + mm->pmd_huge_pte = list_entry(pgtable->lru.next,
> > + struct page, lru);
> > + list_del(&pgtable->lru);
> > + }
> > + return pgtable;
> > +}
> > +
> > +static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
> > + struct vm_area_struct *vma, unsigned long address,
> > + pmd_t *pmd, unsigned long haddr)
>
> This whole function is extremely similar to the implementation of
> do_huge_pmd_wp_page_fallback(), there really is no way to fold the two?

It's similar by structure (I used do_huge_pmd_wp_page_fallback() as a
template) but details are different in many places and I fail to see how
to combine them without making result ugly.
Makes sense. I'll do it as a separate patch on top of the series.
Yes. I'll update.

> > + VM_BUG_ON(!PageHead(page));
> > + page_remove_rmap(page);
> > + put_page(page);
> > + }
> > ret |= VM_FAULT_WRITE;
> > }
> > spin_unlock(&mm->page_table_lock);

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 3:50:02 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:12:54PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index d767a7c..05490b3 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > pmd_t entry;
> > entry = pmdp_get_and_clear(mm, addr, pmd);
> > entry = pmd_modify(entry, newprot);
> > + if (is_huge_zero_pmd(entry))
> > + entry = pmd_wrprotect(entry);
> > set_pmd_at(mm, addr, pmd, entry);
> > spin_unlock(&vma->vm_mm->page_table_lock);
> > ret = 1;
>
> Nack, this should be handled in pmd_modify().

I disagree. It means we will have to enable hzp per arch. Bad idea.

What's wrong with the check?

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 4:00:02 AM11/15/12
to
Thanks, will fix.
Okay.

> > - spin_lock(&mm->page_table_lock);
> > + BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
> > +
> > + spin_lock(&vma->vm_mm->page_table_lock);
> > if (unlikely(!pmd_trans_huge(*pmd))) {
> > - spin_unlock(&mm->page_table_lock);
> > + spin_unlock(&vma->vm_mm->page_table_lock);
> > return;
> > }
> > page = pmd_page(*pmd);
> > VM_BUG_ON(!page_count(page));
> > get_page(page);
> > - spin_unlock(&mm->page_table_lock);
> > + spin_unlock(&vma->vm_mm->page_table_lock);
> >
> > split_huge_page(page);
> >

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 4:30:02 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:28:15PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 90e651c..f36bc7d 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1611,6 +1611,7 @@ int split_huge_page(struct page *page)
> > struct anon_vma *anon_vma;
> > int ret = 1;
> >
> > + BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
> > BUG_ON(!PageAnon(page));
> > anon_vma = page_lock_anon_vma(page);
> > if (!anon_vma)
> > @@ -2509,23 +2510,63 @@ static int khugepaged(void *none)
> > return 0;
> > }
> >
> > +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
> > + unsigned long haddr, pmd_t *pmd)
> > +{
>
> This entire function duplicates other code in mm/huge_memory.c which gives
> even more incentive into breaking do_huge_pmd_wp_zero_page_fallback() into
> logical helper functions and reusing them for both page and !page.
> Duplicating all this code throughout the thp code just becomes a
> maintenance nightmare down the road.

Okay, I'll try.
haddr. I'll fix.

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 4:40:03 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:33:16PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index f36bc7d..41f05f1 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -726,6 +726,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > return VM_FAULT_OOM;
> > if (unlikely(khugepaged_enter(vma)))
> > return VM_FAULT_OOM;
> > + if (!(flags & FAULT_FLAG_WRITE)) {
> > + pgtable_t pgtable;
> > + pgtable = pte_alloc_one(mm, haddr);
> > + if (unlikely(!pgtable))
> > + goto out;
>
> No use in retrying, just return VM_FAULT_OOM.

Hm. It's consistent with non-hzp path: if pte_alloc_one() in
__do_huge_pmd_anonymous_page() fails __do_huge_pmd_anonymous_page()
returns VM_FAULT_OOM which leads to "goto out".

Should it be fixed too?

>
> > + spin_lock(&mm->page_table_lock);
> > + set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
> > + spin_unlock(&mm->page_table_lock);
> > + return 0;
> > + }
> > page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
> > vma, haddr, numa_node_id(), 0);
> > if (unlikely(!page)) {

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 4:50:02 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:37:09PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
> >
> > Instead of allocating huge zero page on hugepage_init() we can postpone it
> > until first huge zero page map. It saves memory if THP is not in use.
> >
>
> Is it worth the branch on every non-write pagefault after that? The
> unlikely() is not going to help on x86. If thp is enabled in your
> .config (which isn't the default), then I think it's better to just
> allocate the zero huge page once and avoid any branches after that to
> lazily allocate it. (Or do it only when thp is set to "madvise" or
> "always" if booting with transparent_hugepage=never.)

I can rewrite the check to static_key if you want. Would it be better?

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 4:50:03 AM11/15/12
to
On Wed, Nov 14, 2012 at 03:40:37PM -0800, David Rientjes wrote:
> On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
>
> > From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
> >
> > H. Peter Anvin doesn't like huge zero page which sticks in memory forever
> > after the first allocation. Here's implementation of lockless refcounting
> > for huge zero page.
> >
> > We have two basic primitives: {get,put}_huge_zero_page(). They
> > manipulate reference counter.
> >
> > If counter is 0, get_huge_zero_page() allocates a new huge page and
> > takes two references: one for caller and one for shrinker. We free the
> > page only in shrinker callback if counter is 1 (only shrinker has the
> > reference).
> >
> > put_huge_zero_page() only decrements counter. Counter is never zero
> > in put_huge_zero_page() since shrinker holds on reference.
> >
> > Freeing huge zero page in shrinker callback helps to avoid frequent
> > allocate-free.
> >
> > Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
> > parallel (40 processes) read page faulting comparing to lazy huge page
> > allocation. I think it's pretty reasonable for synthetic benchmark.
> >
>
> Eek, this is disappointing that we need to check a refcount before
> referencing the zero huge page

No we don't. It's parallel *read* page fault benchmark meaning we
map/unmap huge zero page all the time. So it's pure synthetic test to show
refcounting overhead.

If we see only 1% overhead on the synthetic test we will not see it in
real world workloads.

> and it obviously shows in your benchmark
> (which I consider 1% to be significant given the alternative is 2MB of
> memory for a system where thp was enabled to be on). I think it would be
> much better to simply allocate and reference the zero huge page locklessly
> when thp is enabled to be either "madvise" or "always", i.e. allocate it
> when enabled.

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:02 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Pass vma instead of mm and add address parameter.

In most cases we already have vma on the stack. We provides
split_huge_page_pmd_mm() for few cases when we have mm, but not vma.

This change is preparation to huge zero pmd splitting implementation.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
Documentation/vm/transhuge.txt | 4 ++--
arch/x86/kernel/vm86_32.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/huge_mm.h | 14 ++++++++++----
mm/huge_memory.c | 19 +++++++++++++++++--
mm/memory.c | 4 ++--
mm/mempolicy.c | 2 +-
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/pagewalk.c | 2 +-
10 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2..8f5b41d 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -276,7 +276,7 @@ unaffected. libhugetlbfs will also work fine as usual.
== Graceful fallback ==

Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
pmd_offset. It's trivial to make the code transparent hugepage aware
by just grepping for "pmd_offset" and adding split_huge_page_pmd where
missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +299,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
return NULL;

pmd = pmd_offset(pud, addr);
-+ split_huge_page_pmd(mm, pmd);
++ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b..1dfe69c 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, 0xA0000, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9..291a0d1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;

- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 42607a1..2e1dbba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2495,9 +2495,14 @@ static int khugepaged(void *none)
return 0;
}

-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
{
struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct mm_struct *mm = vma->vm_mm;
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);

spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2515,6 +2520,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
BUG_ON(pmd_trans_huge(*pmd));
}

+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
@@ -2539,7 +2554,7 @@ static void split_huge_page_address(struct mm_struct *mm,
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, address, pmd);
}

void __vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 6edc030..6017e23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1243,7 +1243,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1512,7 +1512,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
}
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
spin_lock(&mm->page_table_lock);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a5..b68061e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a409926..e8c3938 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
else if (change_huge_pmd(vma, pmd, addr, newprot))
continue;
/* fall through */
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b61c2d..eabb24d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
need_flush = true;
continue;
} else if (!err) {
- split_huge_page_pmd(vma->vm_mm, old_pmd);
+ split_huge_page_pmd(vma, old_addr, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d0..35aa294 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;

- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
--
1.7.11.7

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:02 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

By default kernel tries to use huge zero page on read page fault.
It's possible to disable huge zero page by writing 0 or enable it
back by writing 1:

echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
Documentation/vm/transhuge.txt | 7 +++++++
include/linux/huge_mm.h | 4 ++++
mm/huge_memory.c | 21 +++++++++++++++++++--
3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 60aeedd..8785fb8 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -116,6 +116,13 @@ echo always >/sys/kernel/mm/transparent_hugepage/defrag
echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
echo never >/sys/kernel/mm/transparent_hugepage/defrag

+By default kernel tries to use huge zero page on read page fault.
+It's possible to disable huge zero page by writing 0 or enable it
+back by writing 1:
+
+echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+
khugepaged will be automatically started when
transparent_hugepage/enabled is set to "always" or "madvise, and it'll
be automatically shutdown if it's set to "never".
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 856f080..a9f5bd4 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -35,6 +35,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
#ifdef CONFIG_DEBUG_VM
TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
#endif
@@ -74,6 +75,9 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
(transparent_hugepage_flags & \
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) && \
(__vma)->vm_flags & VM_HUGEPAGE))
+#define transparent_hugepage_use_zero_page() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
#ifdef CONFIG_DEBUG_VM
#define transparent_hugepage_debug_cow() \
(transparent_hugepage_flags & \
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b104718..1f6c6de 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -38,7 +38,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -356,6 +357,20 @@ static ssize_t defrag_store(struct kobject *kobj,
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);

+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -377,6 +392,7 @@ static struct kobj_attribute debug_cow_attr =
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
+ &use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
&debug_cow_attr.attr,
#endif
@@ -771,7 +787,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
- if (!(flags & FAULT_FLAG_WRITE)) {
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:02 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We don't have a mapped page to zap in huge zero page case. Let's just
clear pmd and remove it from tlb.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
Acked-by: David Rientjes <rien...@google.com>
---
mm/huge_memory.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8d7a54b..fc2cada 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1058,15 +1058,20 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
pgtable = pgtable_trans_huge_withdraw(tlb->mm);
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
- page = pmd_page(orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
pte_free(tlb->mm, pgtable);
ret = 1;

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:02 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We want to get page fault on write attempt to huge zero page, so let's
keep it write-protected.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5589c0..42607a1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1245,6 +1245,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ if (is_huge_zero_pmd(entry))
+ entry = pmd_wrprotect(entry);
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:03 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled
with zeros.

For now let's allocate the page on hugepage_init(). We'll switch to lazy
allocation later.

We are not going to map the huge zero page until we can handle it
properly on all code paths.

is_huge_zero_{pfn,pmd}() functions will be used by following patches to
check whether the pfn/pmd is huge zero page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
Acked-by: David Rientjes <rien...@google.com>
---
mm/huge_memory.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c3..8d7a54b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -47,6 +47,7 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
+static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -159,6 +160,29 @@ static int start_khugepaged(void)
return err;
}

+static int __init init_huge_zero_page(void)
+{
+ struct page *hpage;
+
+ hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!hpage)
+ return -ENOMEM;
+
+ huge_zero_pfn = page_to_pfn(hpage);
+ return 0;
+}
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ return pfn == huge_zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
@@ -540,6 +564,10 @@ static int __init hugepage_init(void)
if (err)
return err;

+ err = init_huge_zero_page();
+ if (err)
+ goto out;
+
err = khugepaged_slab_init();
if (err)
goto out;
@@ -562,6 +590,8 @@ static int __init hugepage_init(void)

return 0;
out:
+ if (huge_zero_pfn)
+ __free_page(pfn_to_page(huge_zero_pfn));
hugepage_exit_sysfs(hugepage_kobj);
return err;

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:03 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

We can't split huge zero page itself (and it's bug if we try), but we
can split the pmd which points to it.

On splitting the pmd we create a table with all ptes set to normal zero
page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 41 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2e1dbba..015a13a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1597,6 +1597,7 @@ int split_huge_page(struct page *page)
struct anon_vma *anon_vma;
int ret = 1;

+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
anon_vma = page_lock_anon_vma(page);
if (!anon_vma)
@@ -2495,24 +2496,64 @@ static int khugepaged(void *none)
return 0;
}

+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+}
+
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
struct mm_struct *mm = vma->vm_mm;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */

BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);

+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

split_huge_page(page);

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:03 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

Instead of allocating huge zero page on hugepage_init() we can postpone it
until first huge zero page map. It saves memory if THP is not in use.

cmpxchg() is used to avoid race on huge_zero_pfn initialization.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ca3f6f2..bad9c8f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -160,22 +160,24 @@ static int start_khugepaged(void)
return err;
}

-static int __init init_huge_zero_page(void)
+static int init_huge_zero_pfn(void)
{
struct page *hpage;
+ unsigned long pfn;

hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!hpage)
return -ENOMEM;
-
- huge_zero_pfn = page_to_pfn(hpage);
+ pfn = page_to_pfn(hpage);
+ if (cmpxchg(&huge_zero_pfn, 0, pfn))
+ __free_page(hpage);
return 0;
}

static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- return pfn == huge_zero_pfn;
+ return huge_zero_pfn && pfn == huge_zero_pfn;
}

static inline bool is_huge_zero_pmd(pmd_t pmd)
@@ -564,10 +566,6 @@ static int __init hugepage_init(void)
if (err)
return err;

- err = init_huge_zero_page();
- if (err)
- goto out;
-
err = khugepaged_slab_init();
if (err)
goto out;
@@ -590,8 +588,6 @@ static int __init hugepage_init(void)

return 0;
out:
- if (huge_zero_pfn)
- __free_page(pfn_to_page(huge_zero_pfn));
hugepage_exit_sysfs(hugepage_kobj);
return err;
}
@@ -728,6 +724,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
+ if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:03 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

hzp_alloc is incremented every time a huge zero page is successfully
allocated. It includes allocations which where dropped due
race with other allocation. Note, it doesn't count every map
of the huge zero page, only its allocation.

hzp_alloc_failed is incremented if kernel fails to allocate huge zero
page and falls back to using small pages.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
Documentation/vm/transhuge.txt | 8 ++++++++
include/linux/vm_event_item.h | 2 ++
mm/huge_memory.c | 5 ++++-
mm/vmstat.c | 2 ++
4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8f5b41d..60aeedd 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -197,6 +197,14 @@ thp_split is incremented every time a huge page is split into base
pages. This can happen for a variety of reasons but a common
reason is that a huge page is old and is being reclaimed.

+thp_zero_page_alloc is incremented every time a huge zero page is
+ successfully allocated. It includes allocations which where
+ dropped due race with other allocation. Note, it doesn't count
+ every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed is incremented if kernel fails to allocate
+ huge zero page and falls back to using small pages.
+
As the system ages, allocating huge pages may be expensive as the
system uses memory compaction to copy data around memory to free a
huge page for use. There are some counters in /proc/vmstat to help
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d31145..fe786f0 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
THP_COLLAPSE_ALLOC,
THP_COLLAPSE_ALLOC_FAILED,
THP_SPLIT,
+ THP_ZERO_PAGE_ALLOC,
+ THP_ZERO_PAGE_ALLOC_FAILED,
#endif
NR_VM_EVENT_ITEMS
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 923ea75..b104718 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -183,8 +183,11 @@ retry:

zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!zero_page)
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
return 0;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
preempt_enable();
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c737057..5da4b19 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -801,6 +801,8 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
#endif

#endif /* CONFIG_VM_EVENTS_COUNTERS */

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:04 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

On write access to huge zero page we alloc a new huge page and clear it.

If ENOMEM, graceful fallback: we create a new pmd table and set pte
around fault address to newly allocated normal (4k) page. All other ptes
in the pmd set to normal zero page.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/mm.h | 8 ++++
mm/huge_memory.c | 110 +++++++++++++++++++++++++++++++++++++++++++++--------
mm/memory.c | 7 ----
3 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa06804..fe329da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -516,6 +516,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
}
#endif

+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ extern unsigned long zero_pfn;
+ return zero_pfn;
+}
+#endif
+
/*
* Multiple processes may "see" the same page. E.g. for untouched
* mappings of /dev/null, all processes see the same page full of
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 183127c..f5589c0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -829,6 +829,69 @@ out:
return ret;
}

+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+}
+
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -935,19 +998,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */

VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;

page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
- haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -959,7 +1024,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
get_page(page);
spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -969,24 +1034,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -994,7 +1069,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

spin_lock(&mm->page_table_lock);
- put_page(page);
+ if (page)
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
@@ -1002,7 +1078,6 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mn;
} else {
pmd_t entry;
- VM_BUG_ON(!PageHead(page));
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
@@ -1010,8 +1085,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- page_remove_rmap(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd))
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ else {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
ret |= VM_FAULT_WRITE;
}
spin_unlock(&mm->page_table_lock);
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba..6edc030 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -724,13 +724,6 @@ static inline int is_zero_pfn(unsigned long pfn)
}
#endif

-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:04 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

It's easy to copy huge zero page. Just set destination pmd to huge zero
page.

It's safe to copy huge zero page since we have none yet :-p

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc2cada..183127c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -701,6 +701,18 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif

+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+{
+ pmd_t entry;
+ entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+}
+
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
@@ -778,6 +790,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
goto out_unlock;
}
+ /*
+ * mm->pagetable lock is enough to be sure that huge zero pmd is not
+ * under splitting since we don't split the page itself, only pmd to
+ * a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ ret = 0;
+ goto out_unlock;
+ }
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(&src_mm->page_table_lock);

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:04 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

During testing I noticed big (up to 2.5 times) memory consumption overhead
on some workloads (e.g. ft.A from NPB) if THP is enabled.

The main reason for that big difference is lacking zero page in THP case.
We have to allocate a real page on read page fault.

A program to demonstrate the issue:
#include <assert.h>
#include <stdlib.h>
#include <unistd.h>

#define MB 1024*1024

int main(int argc, char **argv)
{
char *p;
int i;

posix_memalign((void **)&p, 2 * MB, 200 * MB);
for (i = 0; i < 200 * MB; i+= 4096)
assert(p[i] == 0);
pause();
return 0;
}

With thp-never RSS is about 400k, but with thp-always it's 200M.
After the patcheset thp-always RSS is 400k too.

Design overview.

Huge zero page (hzp) is a non-movable huge page (2M on x86-64) filled with
zeros. The way how we allocate it changes in the patchset:

- [01/10] simplest way: hzp allocated on boot time in hugepage_init();
- [09/10] lazy allocation on first use;
- [10/10] lockless refcounting + shrinker-reclaimable hzp;

We setup it in do_huge_pmd_anonymous_page() if area around fault address
is suitable for THP and we've got read page fault.
If we fail to setup hzp (ENOMEM) we fallback to handle_pte_fault() as we
normally do in THP.

On wp fault to hzp we allocate real memory for the huge page and clear it.
If ENOMEM, graceful fallback: we create a new pmd table and set pte around
fault address to newly allocated normal (4k) page. All other ptes in the
pmd set to normal zero page.

We cannot split hzp (and it's bug if we try), but we can split the pmd
which points to it. On splitting the pmd we create a table with all ptes
set to normal zero page.

Patchset organized in bisect-friendly way:
Patches 01-07: prepare all code paths for hzp
Patch 08: all code paths are covered: safe to setup hzp
Patch 09: lazy allocation
Patch 10: lockless refcounting for hzp
Patch 11: vmstat events for hzp allocation
Patch 12: sysfs knob to disable hzp

v6:
- updates based on feedback from David Rientjes;
- sysfs knob to disable hzp;
v5:
- implement HZP_ALLOC and HZP_ALLOC_FAILED events;
v4:
- Rebase to v3.7-rc1;
- Update commit message;
v3:
- fix potential deadlock in refcounting code on preemptive kernel.
- do not mark huge zero page as movable.
- fix typo in comment.
- Reviewed-by tag from Andrea Arcangeli.
v2:
- Avoid find_vma() if we've already had vma on stack.
Suggested by Andrea Arcangeli.
- Implement refcounting for huge zero page.

--------------------------------------------------------------------------

By hpa request I've tried alternative approach for hzp implementation (see
Virtual huge zero page patchset): pmd table with all entries set to zero
page. This way should be more cache friendly, but it increases TLB
pressure.

The problem with virtual huge zero page: it requires per-arch enabling.
We need a way to mark that pmd table has all ptes set to zero page.

Some numbers to compare two implementations (on 4s Westmere-EX):

Mirobenchmark1
==============

test:
posix_memalign((void **)&p, 2 * MB, 8 * GB);
for (i = 0; i < 100; i++) {
assert(memcmp(p, p + 4*GB, 4*GB) == 0);
asm volatile ("": : :"memory");
}

hzp:
Performance counter stats for './test_memcmp' (5 runs):

32356.272845 task-clock # 0.998 CPUs utilized ( +- 0.13% )
40 context-switches # 0.001 K/sec ( +- 0.94% )
0 CPU-migrations # 0.000 K/sec
4,218 page-faults # 0.130 K/sec ( +- 0.00% )
76,712,481,765 cycles # 2.371 GHz ( +- 0.13% ) [83.31%]
36,279,577,636 stalled-cycles-frontend # 47.29% frontend cycles idle ( +- 0.28% ) [83.35%]
1,684,049,110 stalled-cycles-backend # 2.20% backend cycles idle ( +- 2.96% ) [66.67%]
134,355,715,816 instructions # 1.75 insns per cycle
# 0.27 stalled cycles per insn ( +- 0.10% ) [83.35%]
13,526,169,702 branches # 418.039 M/sec ( +- 0.10% ) [83.31%]
1,058,230 branch-misses # 0.01% of all branches ( +- 0.91% ) [83.36%]

32.413866442 seconds time elapsed ( +- 0.13% )

vhzp:
Performance counter stats for './test_memcmp' (5 runs):

30327.183829 task-clock # 0.998 CPUs utilized ( +- 0.13% )
38 context-switches # 0.001 K/sec ( +- 1.53% )
0 CPU-migrations # 0.000 K/sec
4,218 page-faults # 0.139 K/sec ( +- 0.01% )
71,964,773,660 cycles # 2.373 GHz ( +- 0.13% ) [83.35%]
31,191,284,231 stalled-cycles-frontend # 43.34% frontend cycles idle ( +- 0.40% ) [83.32%]
773,484,474 stalled-cycles-backend # 1.07% backend cycles idle ( +- 6.61% ) [66.67%]
134,982,215,437 instructions # 1.88 insns per cycle
# 0.23 stalled cycles per insn ( +- 0.11% ) [83.32%]
13,509,150,683 branches # 445.447 M/sec ( +- 0.11% ) [83.34%]
1,017,667 branch-misses # 0.01% of all branches ( +- 1.07% ) [83.32%]

30.381324695 seconds time elapsed ( +- 0.13% )

Mirobenchmark2
==============

test:
posix_memalign((void **)&p, 2 * MB, 8 * GB);
for (i = 0; i < 1000; i++) {
char *_p = p;
while (_p < p+4*GB) {
assert(*_p == *(_p+4*GB));
_p += 4096;
asm volatile ("": : :"memory");
}
}

hzp:
Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs):

3505.727639 task-clock # 0.998 CPUs utilized ( +- 0.26% )
9 context-switches # 0.003 K/sec ( +- 4.97% )
4,384 page-faults # 0.001 M/sec ( +- 0.00% )
8,318,482,466 cycles # 2.373 GHz ( +- 0.26% ) [33.31%]
5,134,318,786 stalled-cycles-frontend # 61.72% frontend cycles idle ( +- 0.42% ) [33.32%]
2,193,266,208 stalled-cycles-backend # 26.37% backend cycles idle ( +- 5.51% ) [33.33%]
9,494,670,537 instructions # 1.14 insns per cycle
# 0.54 stalled cycles per insn ( +- 0.13% ) [41.68%]
2,108,522,738 branches # 601.451 M/sec ( +- 0.09% ) [41.68%]
158,746 branch-misses # 0.01% of all branches ( +- 1.60% ) [41.71%]
3,168,102,115 L1-dcache-loads
# 903.693 M/sec ( +- 0.11% ) [41.70%]
1,048,710,998 L1-dcache-misses
# 33.10% of all L1-dcache hits ( +- 0.11% ) [41.72%]
1,047,699,685 LLC-load
# 298.854 M/sec ( +- 0.03% ) [33.38%]
2,287 LLC-misses
# 0.00% of all LL-cache hits ( +- 8.27% ) [33.37%]
3,166,187,367 dTLB-loads
# 903.147 M/sec ( +- 0.02% ) [33.35%]
4,266,538 dTLB-misses
# 0.13% of all dTLB cache hits ( +- 0.03% ) [33.33%]

3.513339813 seconds time elapsed ( +- 0.26% )

vhzp:
Performance counter stats for 'taskset -c 0 ./test_memcmp2' (5 runs):

27313.891128 task-clock # 0.998 CPUs utilized ( +- 0.24% )
62 context-switches # 0.002 K/sec ( +- 0.61% )
4,384 page-faults # 0.160 K/sec ( +- 0.01% )
64,747,374,606 cycles # 2.370 GHz ( +- 0.24% ) [33.33%]
61,341,580,278 stalled-cycles-frontend # 94.74% frontend cycles idle ( +- 0.26% ) [33.33%]
56,702,237,511 stalled-cycles-backend # 87.57% backend cycles idle ( +- 0.07% ) [33.33%]
10,033,724,846 instructions # 0.15 insns per cycle
# 6.11 stalled cycles per insn ( +- 0.09% ) [41.65%]
2,190,424,932 branches # 80.195 M/sec ( +- 0.12% ) [41.66%]
1,028,630 branch-misses # 0.05% of all branches ( +- 1.50% ) [41.66%]
3,302,006,540 L1-dcache-loads
# 120.891 M/sec ( +- 0.11% ) [41.68%]
271,374,358 L1-dcache-misses
# 8.22% of all L1-dcache hits ( +- 0.04% ) [41.66%]
20,385,476 LLC-load
# 0.746 M/sec ( +- 1.64% ) [33.34%]
76,754 LLC-misses
# 0.38% of all LL-cache hits ( +- 2.35% ) [33.34%]
3,309,927,290 dTLB-loads
# 121.181 M/sec ( +- 0.03% ) [33.34%]
2,098,967,427 dTLB-misses
# 63.41% of all dTLB cache hits ( +- 0.03% ) [33.34%]

27.364448741 seconds time elapsed ( +- 0.24% )

--------------------------------------------------------------------------

I personally prefer implementation present in this patchset. It doesn't
touch arch-specific code.

Kirill A. Shutemov (12):
thp: huge zero page: basic preparation
thp: zap_huge_pmd(): zap huge zero pmd
thp: copy_huge_pmd(): copy huge zero page
thp: do_huge_pmd_wp_page(): handle huge zero page
thp: change_huge_pmd(): keep huge zero page write-protected
thp: change split_huge_page_pmd() interface
thp: implement splitting pmd for huge zero page
thp: setup huge zero page on non-write page fault
thp: lazy huge zero page allocation
thp: implement refcounting for huge zero page
thp, vmstat: implement HZP_ALLOC and HZP_ALLOC_FAILED events
thp: introduce sysfs knob to disable huge zero page

Documentation/vm/transhuge.txt | 19 ++-
arch/x86/kernel/vm86_32.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/linux/huge_mm.h | 18 ++-
include/linux/mm.h | 8 +
include/linux/vm_event_item.h | 2 +
mm/huge_memory.c | 339 +++++++++++++++++++++++++++++++++++++----
mm/memory.c | 11 +-
mm/mempolicy.c | 2 +-
mm/mprotect.c | 2 +-
mm/mremap.c | 2 +-
mm/pagewalk.c | 2 +-
mm/vmstat.c | 2 +
13 files changed, 364 insertions(+), 47 deletions(-)

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:04 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

H. Peter Anvin doesn't like huge zero page which sticks in memory forever
after the first allocation. Here's implementation of lockless refcounting
for huge zero page.

We have two basic primitives: {get,put}_huge_zero_page(). They
manipulate reference counter.

If counter is 0, get_huge_zero_page() allocates a new huge page and
takes two references: one for caller and one for shrinker. We free the
page only in shrinker callback if counter is 1 (only shrinker has the
reference).

put_huge_zero_page() only decrements counter. Counter is never zero
in put_huge_zero_page() since shrinker holds on reference.

Freeing huge zero page in shrinker callback helps to avoid frequent
allocate-free.

Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
parallel (40 processes) read page faulting comparing to lazy huge page
allocation. I think it's pretty reasonable for synthetic benchmark.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 112 ++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 87 insertions(+), 25 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bad9c8f..923ea75 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,6 +18,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/shrinker.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -47,7 +48,6 @@ static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static struct task_struct *khugepaged_thread __read_mostly;
-static unsigned long huge_zero_pfn __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
@@ -160,31 +160,74 @@ static int start_khugepaged(void)
return err;
}

-static int init_huge_zero_pfn(void)
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
{
- struct page *hpage;
- unsigned long pfn;
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);

- hpage = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
- if (!hpage)
- return -ENOMEM;
- pfn = page_to_pfn(hpage);
- if (cmpxchg(&huge_zero_pfn, 0, pfn))
- __free_page(hpage);
- return 0;
+ if (!zero_page)
+ return 0;
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take additional reference here. It will be put back by shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
}

-static inline bool is_huge_zero_pfn(unsigned long pfn)
+static void put_huge_zero_page(void)
{
- return huge_zero_pfn && pfn == huge_zero_pfn;
+ /*
+ * Counter should never go to zero here. Only shrinker can put
+ * last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

-static inline bool is_huge_zero_pmd(pmd_t pmd)
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- return is_huge_zero_pfn(pmd_pfn(pmd));
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
}

+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS

static ssize_t double_flag_show(struct kobject *kobj,
@@ -576,6 +619,8 @@ static int __init hugepage_init(void)
goto out;
}

+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -698,10 +743,11 @@ static inline struct page *alloc_hugepage(int defrag)
#endif

static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
{
pmd_t entry;
- entry = pfn_pmd(huge_zero_pfn, vma->vm_page_prot);
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
set_pmd_at(mm, haddr, pmd, entry);
@@ -724,15 +770,19 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE)) {
pgtable_t pgtable;
- if (unlikely(!huge_zero_pfn && init_huge_zero_pfn())) {
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
+ unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
goto out;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
spin_lock(&mm->page_table_lock);
- set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -806,7 +856,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd);
+ unsigned long zero_pfn;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
ret = 0;
goto out_unlock;
}
@@ -894,6 +952,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();

mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

@@ -1095,9 +1154,10 @@ alloc:
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- else {
+ put_huge_zero_page();
+ } else {
VM_BUG_ON(!PageHead(page));
page_remove_rmap(page);
put_page(page);
@@ -1174,6 +1234,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
@@ -2531,6 +2592,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
}

void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,

Kirill A. Shutemov

unread,
Nov 15, 2012, 2:30:04 PM11/15/12
to
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>

All code paths seems covered. Now we can map huge zero page on read page
fault.

We setup it in do_huge_pmd_anonymous_page() if area around fault address
is suitable for THP and we've got read page fault.

If we fail to setup huge zero page (ENOMEM) we fallback to
handle_pte_fault() as we normally do in THP.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 10 ++++++++++
1 file changed, 10 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 015a13a..ca3f6f2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -726,6 +726,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE)) {
+ pgtable_t pgtable;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ goto out;
+ spin_lock(&mm->page_table_lock);
+ set_huge_zero_page(pgtable, mm, vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ return 0;
+ }
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {

David Rientjes

unread,
Nov 15, 2012, 4:50:02 PM11/15/12
to
On Thu, 15 Nov 2012, Kirill A. Shutemov wrote:

> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index d767a7c..05490b3 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > > pmd_t entry;
> > > entry = pmdp_get_and_clear(mm, addr, pmd);
> > > entry = pmd_modify(entry, newprot);
> > > + if (is_huge_zero_pmd(entry))
> > > + entry = pmd_wrprotect(entry);
> > > set_pmd_at(mm, addr, pmd, entry);
> > > spin_unlock(&vma->vm_mm->page_table_lock);
> > > ret = 1;
> >
> > Nack, this should be handled in pmd_modify().
>
> I disagree. It means we will have to enable hzp per arch. Bad idea.
>

pmd_modify() only exists for those architectures with thp support already,
so you've already implicitly enabled for all the necessary architectures
with your patchset.

> What's wrong with the check?
>

Anybody using pmd_modify() to set new protections in the future perhaps
without knowledge of huge zero page can incorrectly make the huge zero
page writable, which can never be allowed to happen. It's better to make
sure it can never happen with the usual interface to modify protections.

David Rientjes

unread,
Nov 15, 2012, 5:00:02 PM11/15/12
to
On Thu, 15 Nov 2012, Kirill A. Shutemov wrote:

> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index f36bc7d..41f05f1 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -726,6 +726,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > > return VM_FAULT_OOM;
> > > if (unlikely(khugepaged_enter(vma)))
> > > return VM_FAULT_OOM;
> > > + if (!(flags & FAULT_FLAG_WRITE)) {
> > > + pgtable_t pgtable;
> > > + pgtable = pte_alloc_one(mm, haddr);
> > > + if (unlikely(!pgtable))
> > > + goto out;
> >
> > No use in retrying, just return VM_FAULT_OOM.
>
> Hm. It's consistent with non-hzp path: if pte_alloc_one() in
> __do_huge_pmd_anonymous_page() fails __do_huge_pmd_anonymous_page()
> returns VM_FAULT_OOM which leads to "goto out".
>

If the pte_alloc_one(), which wraps __pte_alloc(), you're adding fails,
it's pointless to "goto out" to try __pte_alloc() which we know won't
succeed.

> Should it be fixed too?
>

It's done for maintainablility because although
__do_huge_pmd_anonymous_page() will only return VM_FAULT_OOM today when
pte_alloc_one() fails, if it were to ever fail in a different way then the
caller is already has a graceful failure.

David Rientjes

unread,
Nov 15, 2012, 5:40:01 PM11/15/12
to
Fix comment

Signed-off-by: David Rientjes <rien...@google.com>
---
mm/huge_memory.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -791,7 +791,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * mm->pagetable lock is enough to be sure that huge zero pmd is not
+ * mm->page_table_lock is enough to be sure that huge zero pmd is not
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/

Kirill A. Shutemov

unread,
Nov 16, 2012, 1:20:02 PM11/16/12
to
Okay, here's fixlet. Andrew, please squash it to the patch 08/11.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1f6c6de..d7ab890 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -793,7 +793,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long zero_pfn;
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable))
- goto out;
+ return VM_FAULT_OOM;
zero_pfn = get_huge_zero_page();
if (unlikely(!zero_pfn)) {
pte_free(mm, pgtable);
--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Nov 16, 2012, 1:20:02 PM11/16/12
to
On Thu, Nov 15, 2012 at 01:47:33PM -0800, David Rientjes wrote:
> On Thu, 15 Nov 2012, Kirill A. Shutemov wrote:
>
> > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > > index d767a7c..05490b3 100644
> > > > --- a/mm/huge_memory.c
> > > > +++ b/mm/huge_memory.c
> > > > @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > > > pmd_t entry;
> > > > entry = pmdp_get_and_clear(mm, addr, pmd);
> > > > entry = pmd_modify(entry, newprot);
> > > > + if (is_huge_zero_pmd(entry))
> > > > + entry = pmd_wrprotect(entry);
> > > > set_pmd_at(mm, addr, pmd, entry);
> > > > spin_unlock(&vma->vm_mm->page_table_lock);
> > > > ret = 1;
> > >
> > > Nack, this should be handled in pmd_modify().
> >
> > I disagree. It means we will have to enable hzp per arch. Bad idea.
> >
>
> pmd_modify() only exists for those architectures with thp support already,
> so you've already implicitly enabled for all the necessary architectures
> with your patchset.

Now we have huge zero page fully implemented inside mm/huge_memory.c. Push
this logic to pmd_modify() means we expose hzp implementation details to
arch code. Looks ugly for me.

> > What's wrong with the check?
> >
>
> Anybody using pmd_modify() to set new protections in the future perhaps
> without knowledge of huge zero page can incorrectly make the huge zero
> page writable, which can never be allowed to happen. It's better to make
> sure it can never happen with the usual interface to modify protections.

I haven't found where we check if the page is a 4k zero page, but it's
definitely not pte_modify().

--
Kirill A. Shutemov
signature.asc

David Rientjes

unread,
Nov 16, 2012, 3:20:02 PM11/16/12
to
On Fri, 16 Nov 2012, Kirill A. Shutemov wrote:

> > > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > > > index d767a7c..05490b3 100644
> > > > > --- a/mm/huge_memory.c
> > > > > +++ b/mm/huge_memory.c
> > > > > @@ -1259,6 +1259,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
> > > > > pmd_t entry;
> > > > > entry = pmdp_get_and_clear(mm, addr, pmd);
> > > > > entry = pmd_modify(entry, newprot);
> > > > > + if (is_huge_zero_pmd(entry))
> > > > > + entry = pmd_wrprotect(entry);
> > > > > set_pmd_at(mm, addr, pmd, entry);
> > > > > spin_unlock(&vma->vm_mm->page_table_lock);
> > > > > ret = 1;
> > > >
> > > > Nack, this should be handled in pmd_modify().
> > >
> > > I disagree. It means we will have to enable hzp per arch. Bad idea.
> > >
> >
> > pmd_modify() only exists for those architectures with thp support already,
> > so you've already implicitly enabled for all the necessary architectures
> > with your patchset.
>
> Now we have huge zero page fully implemented inside mm/huge_memory.c. Push
> this logic to pmd_modify() means we expose hzp implementation details to
> arch code. Looks ugly for me.
>

So you are suggesting that anybody who ever does pmd_modify() in the
future is responsible for knowing about the zero page and to protect
against giving it write permission in the calling code??

H. Peter Anvin

unread,
Nov 16, 2012, 4:30:02 PM11/16/12
to
On 11/14/2012 03:41 PM, David Rientjes wrote:
>
> Nobody is going to know what hzp_ is, sorry. It's better to be more
> verbose and name them what they actually are: THP_ZERO_PAGE_ALLOC and
> THP_ZERO_PAGE_ALLOC_FAILED. But this would assume we want to lazily
> allocate them, which I disagree with hpa about.
>

You want to permanently sit on 2 MiB of memory on all systems? That
being an obvious nonstarter, then you end up having to make some kind of
static control, with all the problems that entails (if Linus had not set
his foot down on tunables a long time ago, we today would have had a
Linux mm which only performed well if you manually set hundreds or
thousands of parameters) you either have lazy allocation or you go with
the virtual huge zero page solution and just accept that either is going
to perform poorly under some set of pathological circumstances.

-hpa

Jaegeuk Hanse

unread,
Nov 18, 2012, 1:30:02 AM11/18/12
to
On 11/16/2012 03:27 AM, Kirill A. Shutemov wrote:
> From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
>
> H. Peter Anvin doesn't like huge zero page which sticks in memory forever
> after the first allocation. Here's implementation of lockless refcounting
> for huge zero page.
>
> We have two basic primitives: {get,put}_huge_zero_page(). They
> manipulate reference counter.
>
> If counter is 0, get_huge_zero_page() allocates a new huge page and
> takes two references: one for caller and one for shrinker. We free the
> page only in shrinker callback if counter is 1 (only shrinker has the
> reference).
>
> put_huge_zero_page() only decrements counter. Counter is never zero
> in put_huge_zero_page() since shrinker holds on reference.
>
> Freeing huge zero page in shrinker callback helps to avoid frequent
> allocate-free.
>
> Refcounting has cost. On 4 socket machine I observe ~1% slowdown on
> parallel (40 processes) read page faulting comparing to lazy huge page
> allocation. I think it's pretty reasonable for synthetic benchmark.

Hi Kirill,

I see your and Andew's hot discussion in v4 resend thread.

"I also tried another scenario: usemem -n16 100M -r 1000. It creates
real memory pressure - no easy reclaimable memory. This time callback
called with nr_to_scan > 0 and we freed hzp. "

What's "usemem"? Is it a tool and how to get it? It's hard for me to
find nr_to_scan > 0 in every callset, how can nr_to_scan > 0 in your
scenario?

Regards,
Jaegeuk

Kirill A. Shutemov

unread,
Nov 19, 2012, 5:00:02 AM11/19/12
to
http://www.spinics.net/lists/linux-mm/attachments/gtarazbJaHPaAT.gtar

> It's hard for me to
> find nr_to_scan > 0 in every callset, how can nr_to_scan > 0 in your
> scenario?

shrink_slab() calls the callback with nr_to_scan > 0 if system is under
pressure -- look for do_shrinker_shrink().

--
Kirill A. Shutemov
signature.asc

Jaegeuk Hanse

unread,
Nov 19, 2012, 5:30:02 AM11/19/12
to
Thanks for your response. But how to use it, I even can't compile the
files.

# ./case-lru-file-mmap-read
./case-lru-file-mmap-read: line 3: hw_vars: No such file or directory
./case-lru-file-mmap-read: line 7: 10 * mem / nr_cpu: division by 0
(error token is "nr_cpu")

# gcc usemem.c -o usemem
/tmp/ccFkIDWk.o: In function `do_task':
usemem.c:(.text+0x9f2): undefined reference to `pthread_create'
usemem.c:(.text+0xa44): undefined reference to `pthread_join'
collect2: ld returned 1 exit status

>
>> It's hard for me to
>> find nr_to_scan > 0 in every callset, how can nr_to_scan > 0 in your
>> scenario?
> shrink_slab() calls the callback with nr_to_scan > 0 if system is under
> pressure -- look for do_shrinker_shrink().

Why Andrew's example(dd if=/fast-disk/large-file) doesn't call this
path? I think it also can add memory pressure, where I miss?

Kirill A. Shutemov

unread,
Nov 19, 2012, 5:30:02 AM11/19/12
to
-lpthread

> /tmp/ccFkIDWk.o: In function `do_task':
> usemem.c:(.text+0x9f2): undefined reference to `pthread_create'
> usemem.c:(.text+0xa44): undefined reference to `pthread_join'
> collect2: ld returned 1 exit status
>
> >
> >>It's hard for me to
> >>find nr_to_scan > 0 in every callset, how can nr_to_scan > 0 in your
> >>scenario?
> >shrink_slab() calls the callback with nr_to_scan > 0 if system is under
> >pressure -- look for do_shrinker_shrink().
>
> Why Andrew's example(dd if=/fast-disk/large-file) doesn't call this
> path? I think it also can add memory pressure, where I miss?

dd if=large-file only fills pagecache -- easy reclaimable memory.
Pagecache will be dropped first, before shrinking slabs.

--
Kirill A. Shutemov

Kirill A. Shutemov

unread,
Nov 19, 2012, 6:10:01 AM11/19/12
to
On Mon, Nov 19, 2012 at 07:02:22PM +0800, Jaegeuk Hanse wrote:
> How could I confirm page reclaim working hard and slabs are
> reclaimed at this time?

The only what I see is slabs_scanned in vmstat.

Jaegeuk Hanse

unread,
Nov 19, 2012, 6:10:02 AM11/19/12
to
How could I confirm page reclaim working hard and slabs are reclaimed at
this time?


Jaegeuk Hanse

unread,
Nov 19, 2012, 6:30:02 AM11/19/12
to
Oh, I see. Thanks! :-)

Kirill A. Shutemov

unread,
Nov 20, 2012, 11:00:03 AM11/20/12
to
Looks like we don't need the patch at all.

IIUC, if you ask for PROT_WRITE vm_get_page_prot() will translate it to
_PAGE_COPY or similar and you'll only get the page writable on pagefault.

Could anybody confirm that it's correct?

--
Kirill A. Shutemov
signature.asc

Kirill A. Shutemov

unread,
Dec 3, 2012, 5:00:02 AM12/3/12
to
Andrew, please drop the patch or replace it with the patch below, if you
wish.

From 048e3d4c97202cfecab55ead2a816421dce4b382 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
Date: Tue, 7 Aug 2012 06:09:56 -0700
Subject: [PATCH] thp: change_huge_pmd(): make sure we don't try to make a
page writable

mprotect core never tries to make page writable using change_huge_pmd().
Let's add an assert that the assumption is true. It's important to be
sure we will not make huge zero page writable.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f5589c0..5fba83b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1245,6 +1245,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
--
Kirill A. Shutemov
signature.asc

Andrew Morton

unread,
Dec 12, 2012, 4:40:02 PM12/12/12
to
On Thu, 15 Nov 2012 11:41:55 +0200
"Kirill A. Shutemov" <kirill....@linux.intel.com> wrote:

> On Wed, Nov 14, 2012 at 03:37:09PM -0800, David Rientjes wrote:
> > On Wed, 7 Nov 2012, Kirill A. Shutemov wrote:
> >
> > > From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
> > >
> > > Instead of allocating huge zero page on hugepage_init() we can postpone it
> > > until first huge zero page map. It saves memory if THP is not in use.
> > >
> >
> > Is it worth the branch on every non-write pagefault after that? The
> > unlikely() is not going to help on x86. If thp is enabled in your
> > .config (which isn't the default), then I think it's better to just
> > allocate the zero huge page once and avoid any branches after that to
> > lazily allocate it. (Or do it only when thp is set to "madvise" or
> > "always" if booting with transparent_hugepage=never.)
>
> I can rewrite the check to static_key if you want. Would it be better?

The new test-n-branch only happens on the first read fault against a
thp huge page, yes? In which case it's a quite infrequent event and I
suspect this isn't worth bothering about.

H. Peter Anvin

unread,
Dec 12, 2012, 4:50:02 PM12/12/12
to
On 12/12/2012 01:30 PM, Andrew Morton wrote:
>>
>> I can rewrite the check to static_key if you want. Would it be better?
>
> The new test-n-branch only happens on the first read fault against a
> thp huge page, yes? In which case it's a quite infrequent event and I
> suspect this isn't worth bothering about.
>

Not to mention that flipping the static key is *incredibly* expensive.

-hpa

Kirill A. Shutemov

unread,
Dec 12, 2012, 5:10:02 PM12/12/12
to
On Wed, Dec 12, 2012 at 01:48:04PM -0800, H. Peter Anvin wrote:
> On 12/12/2012 01:30 PM, Andrew Morton wrote:
> >>
> >> I can rewrite the check to static_key if you want. Would it be better?
> >
> > The new test-n-branch only happens on the first read fault against a
> > thp huge page, yes? In which case it's a quite infrequent event and I
> > suspect this isn't worth bothering about.
> >
>
> Not to mention that flipping the static key is *incredibly* expensive.

I'm fine with current code.

--
Kirill A. Shutemov
signature.asc
0 new messages