
[PATCHv4 36/43] ext4: handle huge pages in ext4_da_write_end()


Kirill A. Shutemov

Oct 24, 2016, 8:20:06 PM
Call ext4_da_should_update_i_disksize() on the head page, with the offset
calculated relative to the head page.

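For a concrete feel of the arithmetic (illustrative numbers only, assuming a
2MB THP whose head starts at file offset 0x200000): with pos = 0x201400 and
copied = 0x200, the removed start = pos & (PAGE_SIZE - 1) gives 0x400, an
offset within a single 4k subpage, while the new
end = (pos & ~hpage_mask(head)) + copied - 1 gives 0x15ff, the offset within
the whole huge page -- which is what ext4_da_should_update_i_disksize() needs
once it is handed the head page.
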
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/ext4/inode.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1eae6801846c..59cd2b113eb2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3015,7 +3015,6 @@ static int ext4_da_write_end(struct file *file,
int ret = 0, ret2;
handle_t *handle = ext4_journal_current_handle();
loff_t new_i_size;
- unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;

if (write_mode == FALL_BACK_TO_NONDELALLOC)
@@ -3023,8 +3022,6 @@ static int ext4_da_write_end(struct file *file,
len, copied, page, fsdata);

trace_ext4_da_write_end(inode, pos, len, copied);
- start = pos & (PAGE_SIZE - 1);
- end = start + copied - 1;

/*
* generic_write_end() will run mark_inode_dirty() if i_size
@@ -3033,8 +3030,10 @@ static int ext4_da_write_end(struct file *file,
*/
new_i_size = pos + copied;
if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
+ struct page *head = compound_head(page);
+ unsigned long end = (pos & ~hpage_mask(head)) + copied - 1;
if (ext4_has_inline_data(inode) ||
- ext4_da_should_update_i_disksize(page, end)) {
+ ext4_da_should_update_i_disksize(head, end)) {
ext4_update_i_disksize(inode, new_i_size);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:06 PM
We write back a whole huge page at a time.

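A worked example of the skip arithmetic in the hunk below (assuming
HPAGE_PMD_NR == 512): if pvec.pages[i] is the subpage at index 515 of a huge
page whose head sits at index 512, we wait on the head once, set index to
512 + 512 = 1024, and bump i by 1024 - 515 - 1 = 508, so the sibling subpages
of the same huge page still sitting in this pagevec are not waited on again.
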
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/filemap.c | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index 954720092cf8..ecf5c2dba3fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -509,9 +509,14 @@ static int __filemap_fdatawait_range(struct address_space *mapping,
if (page->index > end)
continue;

+ page = compound_head(page);
wait_on_page_writeback(page);
if (TestClearPageError(page))
ret = -EIO;
+ if (PageTransHuge(page)) {
+ index = page->index + HPAGE_PMD_NR;
+ i += index - pvec.pages[i]->index - 1;
+ }
}
pagevec_release(&pvec);
cond_resched();
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:06 PM
This simply mirrors the changes made to __block_write_begin_int().

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/ext4/inode.c | 35 +++++++++++++++++++++--------------
1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c1728d2bf47b..1eae6801846c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1087,9 +1087,8 @@ int do_journal_get_write_access(handle_t *handle,
static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
- unsigned to = from + len;
- struct inode *inode = page->mapping->host;
+ unsigned from, to;
+ struct inode *inode = page_mapping(page)->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
@@ -1097,10 +1096,14 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
unsigned bbits;
struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
bool decrypt = false;
+ bool uptodate = PageUptodate(page);

+ page = compound_head(page);
+ from = pos & ~hpage_mask(page);
+ to = from + len;
BUG_ON(!PageLocked(page));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(from > hpage_size(page));
+ BUG_ON(to > hpage_size(page));
BUG_ON(from > to);

if (!page_has_buffers(page))
@@ -1113,10 +1116,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
block++, block_start = block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
- if (PageUptodate(page)) {
- if (!buffer_uptodate(bh))
- set_buffer_uptodate(bh);
- }
+ if (uptodate && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
continue;
}
if (buffer_new(bh))
@@ -1129,19 +1130,25 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (buffer_new(bh)) {
unmap_underlying_metadata(bh->b_bdev,
bh->b_blocknr);
- if (PageUptodate(page)) {
+ if (uptodate) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
- if (block_end > to || block_start < from)
- zero_user_segments(page, to, block_end,
- block_start, from);
+ if (block_end > to || block_start < from) {
+ BUG_ON(to - from > PAGE_SIZE);
+ zero_user_segments(page +
+ block_start / PAGE_SIZE,
+ to % PAGE_SIZE,
+ (block_start % PAGE_SIZE) + blocksize,
+ block_start % PAGE_SIZE,
+ from % PAGE_SIZE);
+ }
continue;
}
}
- if (PageUptodate(page)) {
+ if (uptodate) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:07 PM
Adjust the check on whether part of the page is beyond the file size, and
apply compound_head() and page_mapping() where appropriate.

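A worked example of the EOF check below (assuming a 2MB THP): for a huge page
covering file offsets 2MB..4MB and i_size_read() returning 3MB + 100,
(page->index + hpage_nr_pages(page)) << PAGE_SHIFT is 4MB, which is beyond
i_size, so end becomes size & ~hpage_mask(page) = 1MB + 100 and
__block_write_begin() only prepares the part of the huge page that lies inside
EOF; a huge page wholly inside EOF gets end = hpage_size(page) = 2MB.
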
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/buffer.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index c078f5d74a2a..8dff5817e313 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2502,7 +2502,7 @@ EXPORT_SYMBOL(block_commit_write);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
- struct page *page = vmf->page;
+ struct page *page = compound_head(vmf->page);
struct inode *inode = file_inode(vma->vm_file);
unsigned long end;
loff_t size;
@@ -2510,7 +2510,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,

lock_page(page);
size = i_size_read(inode);
- if ((page->mapping != inode->i_mapping) ||
+ if ((page_mapping(page) != inode->i_mapping) ||
(page_offset(page) > size)) {
/* We overload EFAULT to mean page got truncated */
ret = -EFAULT;
@@ -2518,10 +2518,10 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
}

/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_SHIFT) > size)
- end = size & ~PAGE_MASK;
+ if (((page->index + hpage_nr_pages(page)) << PAGE_SHIFT) > size)
+ end = size & ~hpage_mask(page);
else
- end = PAGE_SIZE;
+ end = hpage_size(page);

ret = __block_write_begin(page, 0, end, get_block);
if (!ret)
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
__ext4_block_zero_page_range() is adjusted to calculate the starting iblock
correctly for huge pages.

ext4_{collapse,insert}_range() requires page cache invalidation. The
invalidation has to be aligned to the huge page boundary if huge pages are
possible in the page cache.

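A minimal example of the difference (assuming HPAGE_PMD_SIZE of 2MB): for
offset = 5MB + 100KB, round_down(offset, PAGE_SIZE) leaves the already
4k-aligned value untouched, while round_down(offset, HPAGE_PMD_SIZE) yields
4MB, so the write-out and invalidation below cover the whole huge page
straddling the start of the range rather than only some of its 4k subpages.
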
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/ext4/extents.c | 10 ++++++++--
fs/ext4/inode.c | 3 +--
2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c930a0110fb4..16a6bdeac36f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5511,7 +5511,10 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ ioffset = round_down(offset, HPAGE_PMD_SIZE);
+ else
+ ioffset = round_down(offset, PAGE_SIZE);
/*
* Write tail of the last page before removed range since it will get
* removed from the page cache below.
@@ -5660,7 +5663,10 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
* Need to round down to align start offset to page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ ioffset = round_down(offset, HPAGE_PMD_SIZE);
+ else
+ ioffset = round_down(offset, PAGE_SIZE);
/* Write out all dirty pages */
ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
LLONG_MAX);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d179467f82e7..95b64a7cee67 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3716,7 +3716,6 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
unsigned offset;
unsigned blocksize, pos;
ext4_lblk_t iblock;
@@ -3735,7 +3734,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,

blocksize = inode->i_sb->s_blocksize;

- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = page->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
We need to use multi-order radix-tree entries for ext4 and other filesystems
to have a coherent view of tags (dirty/towrite) in the tree.

This patch converts the huge tmpfs implementation to multi-order entries, so
we will be able to use the same code path for all filesystems.

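As a rough sketch of the resulting model (the calls are the ones visible in
the hunks below; this is not a standalone snippet):

	/* shmem_add_to_page_cache(): one entry of compound_order(page)
	 * covers the whole huge page instead of HPAGE_PMD_NR slots */
	error = __radix_tree_insert(&mapping->page_tree, index,
				    compound_order(page), page);

	/* lookup side: a slot anywhere in the covered range returns the
	 * head page; the relevant subpage is plain pointer arithmetic */
	if (PageTransHuge(page))
		page += offset - page->index;
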
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/radix-tree.h | 1 +
lib/radix-tree.c | 6 +
mm/filemap.c | 351 +++++++++++++++++++++++++++------------------
mm/huge_memory.c | 47 ++++--
mm/khugepaged.c | 26 ++--
mm/shmem.c | 36 ++---
6 files changed, 271 insertions(+), 196 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 8ffb0518ae9a..76fec3f52203 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -283,6 +283,7 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
void radix_tree_clear_tags(struct radix_tree_root *root,
struct radix_tree_node *node,
void **slot);
+void radix_tree_clear_siblings(struct radix_tree_node *node, void **slot);
unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
void **results, unsigned long first_index,
unsigned int max_items);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index d36cff71d225..6c4ac58ab687 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1741,6 +1741,12 @@ static inline void delete_sibling_entries(struct radix_tree_node *node,
#endif
}

+void radix_tree_clear_siblings(struct radix_tree_node *node, void **slot)
+{
+ delete_sibling_entries(node, node_to_entry(slot),
+ get_slot_offset(node, slot));
+}
+
/**
* radix_tree_delete_item - delete an item from a radix tree
* @root: radix tree root
diff --git a/mm/filemap.c b/mm/filemap.c
index 849f459ad078..23fbbb61725c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -169,59 +169,58 @@ static int page_cache_tree_insert(struct address_space *mapping,
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ struct radix_tree_node *node;
+ void **slot;
+ int nr = PageHuge(page) ? 1 : hpage_nr_pages(page);

VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(nr != 1 && shadow, page);

- for (i = 0; i < nr; i++) {
- struct radix_tree_node *node;
- void **slot;
-
- __radix_tree_lookup(&mapping->page_tree, page->index + i,
- &node, &slot);

- radix_tree_clear_tags(&mapping->page_tree, node, slot);
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+ radix_tree_clear_tags(&mapping->page_tree, node, slot);

- if (!node) {
- VM_BUG_ON_PAGE(nr != 1, page);
- /*
- * We need a node to properly account shadow
- * entries. Don't plant any without. XXX
- */
- shadow = NULL;
- }
+ if (!node) {
+ VM_BUG_ON_PAGE(nr != 1, page);
+ /*
+ * We need a node to properly account shadow
+ * entries. Don't plant any without. XXX
+ */
+ shadow = NULL;
+ }

- radix_tree_replace_slot(slot, shadow);
+ radix_tree_replace_slot(slot, shadow);
+ if (!shadow && node)
+ radix_tree_clear_siblings(node, slot);

- if (!node)
- break;
+ if (!node)
+ goto out;

- workingset_node_pages_dec(node);
- if (shadow)
- workingset_node_shadows_inc(node);
- else
- if (__radix_tree_delete_node(&mapping->page_tree, node))
- continue;
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ goto out;

- /*
- * Track node that only contains shadow entries. DAX mappings
- * contain no shadow entries and may contain other exceptional
- * entries so skip those.
- *
- * Avoid acquiring the list_lru lock if already tracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
- list_empty(&node->private_list)) {
- node->private_data = mapping;
- list_lru_add(&workingset_shadow_nodes,
- &node->private_list);
- }
+ /*
+ * Track node that only contains shadow entries. DAX mappings
+ * contain no shadow entries and may contain other exceptional
+ * entries so skip those.
+ *
+ * Avoid acquiring the list_lru lock if already tracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
+ PageTransHuge(page) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes,
+ &node->private_list);
}
-
+out:
if (shadow) {
mapping->nrexceptional += nr;
/*
@@ -334,12 +333,7 @@ void delete_from_page_cache(struct page *page)
if (freepage)
freepage(page);

- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
- }
+ put_page(page);
}
EXPORT_SYMBOL(delete_from_page_cache);

@@ -1086,7 +1080,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
void **pagep;
- struct page *head, *page;
+ struct page *page;

rcu_read_lock();
repeat:
@@ -1107,25 +1101,25 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
goto out;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;

- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
* Has the page moved?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+
+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(offset - page->index < 0);
+ VM_BUG_ON(offset - page->index >= HPAGE_PMD_NR);
+ page += offset - page->index;
+ }
}
out:
rcu_read_unlock();
@@ -1288,7 +1282,7 @@ unsigned find_get_entries(struct address_space *mapping,
struct page **entries, pgoff_t *indices)
{
void **slot;
- unsigned int ret = 0;
+ unsigned int refs, ret = 0;
struct radix_tree_iter iter;

if (!nr_entries)
@@ -1296,7 +1290,10 @@ unsigned find_get_entries(struct address_space *mapping,

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *head, *page;
+ struct page *page;
+ unsigned long index = iter.index;
+ if (index < start)
+ index = start;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1314,26 +1311,38 @@ unsigned find_get_entries(struct address_space *mapping,
goto export;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
+ if (!page_cache_get_speculative(page))
goto repeat;
- }

/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+
+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
export:
- indices[ret] = iter.index;
+ indices[ret] = index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ if (radix_tree_exception(page) || !PageTransCompound(page))
+ continue;
+ for (refs = 0; ret < nr_entries &&
+ (index + 1) % HPAGE_PMD_NR;
+ ret++, refs++) {
+ indices[ret] = ++index;
+ entries[ret] = ++page;
+ }
+ if (refs)
+ page_ref_add(compound_head(page), refs);
+ if (ret == nr_entries)
+ break;
}
rcu_read_unlock();
return ret;
@@ -1360,14 +1369,17 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
{
struct radix_tree_iter iter;
void **slot;
- unsigned ret = 0;
+ unsigned refs, ret = 0;

if (unlikely(!nr_pages))
return 0;

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- struct page *head, *page;
+ struct page *page;
+ unsigned long index = iter.index;
+ if (index < start)
+ index = start;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1386,25 +1398,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
continue;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;

- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}

+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
+
pages[ret] = page;
if (++ret == nr_pages)
break;
+ if (!PageTransCompound(page))
+ continue;
+ for (refs = 0; ret < nr_pages &&
+ (index + 1) % HPAGE_PMD_NR;
+ ret++, refs++, index++)
+ pages[ret] = ++page;
+ if (refs)
+ page_ref_add(compound_head(page), refs);
+ if (ret == nr_pages)
+ break;
}

rcu_read_unlock();
@@ -1414,7 +1436,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
/**
* find_get_pages_contig - gang contiguous pagecache lookup
* @mapping: The address_space to search
- * @index: The starting page index
+ * @start: The starting page index
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
@@ -1423,19 +1445,22 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
*
* find_get_pages_contig() returns the number of pages which were found.
*/
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
- unsigned int ret = 0;
+ unsigned int refs, ret = 0;

if (unlikely(!nr_pages))
return 0;

rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
- struct page *head, *page;
+ radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, start) {
+ struct page *page;
+ unsigned long index = iter.index;
+ if (index < start)
+ index = start;
repeat:
page = radix_tree_deref_slot(slot);
/* The hole, there no reason to continue */
@@ -1455,19 +1480,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
break;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
+ if (!page_cache_get_speculative(page))
goto repeat;
- }

/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}

@@ -1476,14 +1494,31 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
+ if (page->mapping == NULL || page_to_pgoff(page) != index) {
put_page(page);
break;
}

+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
+
pages[ret] = page;
if (++ret == nr_pages)
break;
+ if (!PageTransCompound(page))
+ continue;
+ for (refs = 0; ret < nr_pages &&
+ (index + 1) % HPAGE_PMD_NR;
+ ret++, refs++, index++)
+ pages[ret] = ++page;
+ if (refs)
+ page_ref_add(compound_head(page), refs);
+ if (ret == nr_pages)
+ break;
}
rcu_read_unlock();
return ret;
@@ -1493,7 +1528,7 @@ EXPORT_SYMBOL(find_get_pages_contig);
/**
* find_get_pages_tag - find and return pages that match @tag
* @mapping: the address_space to search
- * @index: the starting page index
+ * @indexp: the starting page index
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1501,20 +1536,23 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *indexp,
int tag, unsigned int nr_pages, struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
- unsigned ret = 0;
+ unsigned refs, ret = 0;

if (unlikely(!nr_pages))
return 0;

rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
- &iter, *index, tag) {
- struct page *head, *page;
+ &iter, *indexp, tag) {
+ struct page *page;
+ unsigned long index = iter.index;
+ if (index < *indexp)
+ index = *indexp;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1539,31 +1577,41 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
continue;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
+ if (!page_cache_get_speculative(page))
goto repeat;
- }

/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}

+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
+
pages[ret] = page;
if (++ret == nr_pages)
break;
+ if (!PageTransCompound(page))
+ continue;
+ for (refs = 0; ret < nr_pages &&
+ (index + 1) % HPAGE_PMD_NR;
+ ret++, refs++, index++)
+ pages[ret] = ++page;
+ if (refs)
+ page_ref_add(compound_head(page), refs);
+ if (ret == nr_pages)
+ break;
}

rcu_read_unlock();

if (ret)
- *index = pages[ret - 1]->index + 1;
+ *indexp = page_to_pgoff(pages[ret - 1]) + 1;

return ret;
}
@@ -1586,7 +1634,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
struct page **entries, pgoff_t *indices)
{
void **slot;
- unsigned int ret = 0;
+ unsigned int refs, ret = 0;
struct radix_tree_iter iter;

if (!nr_entries)
@@ -1595,7 +1643,10 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
rcu_read_lock();
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
- struct page *head, *page;
+ struct page *page;
+ unsigned long index = iter.index;
+ if (index < start)
+ index = start;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1614,26 +1665,38 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
goto export;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
+ if (!page_cache_get_speculative(page))
goto repeat;
- }

/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+
+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
export:
- indices[ret] = iter.index;
+ indices[ret] = index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ if (radix_tree_exception(page) || !PageTransCompound(page))
+ continue;
+ for (refs = 0; ret < nr_entries &&
+ (index + 1) % HPAGE_PMD_NR;
+ ret++, refs++) {
+ indices[ret] = ++index;
+ entries[ret] = ++page;
+ }
+ if (refs)
+ page_ref_add(compound_head(page), refs);
+ if (ret == nr_entries)
+ break;
}
rcu_read_unlock();
return ret;
@@ -2221,12 +2284,15 @@ void filemap_map_pages(struct fault_env *fe,
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
- struct page *head, *page;
+ struct page *page;

rcu_read_lock();
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
start_pgoff) {
- if (iter.index > end_pgoff)
+ unsigned long index = iter.index;
+ if (index < start_pgoff)
+ index = start_pgoff;
+ if (index > end_pgoff)
break;
repeat:
page = radix_tree_deref_slot(slot);
@@ -2237,25 +2303,26 @@ void filemap_map_pages(struct fault_env *fe,
slot = radix_tree_iter_retry(&iter);
continue;
}
+ page = NULL;
goto next;
}

- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;

- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/* Has the page moved? */
if (unlikely(page != *slot)) {
- put_page(head);
+ put_page(page);
goto repeat;
}

+ /* For multi-order entries, find relevant subpage */
+ if (PageTransHuge(page)) {
+ VM_BUG_ON(index - page->index < 0);
+ VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ page += index - page->index;
+ }
+
if (!PageUptodate(page) ||
PageReadahead(page) ||
PageHWPoison(page))
@@ -2263,20 +2330,20 @@ void filemap_map_pages(struct fault_env *fe,
if (!trylock_page(page))
goto skip;

- if (page->mapping != mapping || !PageUptodate(page))
+ if (page_mapping(page) != mapping || !PageUptodate(page))
goto unlock;

size = round_up(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= size >> PAGE_SHIFT)
+ if (compound_head(page)->index >= size >> PAGE_SHIFT)
goto unlock;

if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;

- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ fe->address += (index - last_pgoff) << PAGE_SHIFT;
if (fe->pte)
- fe->pte += iter.index - last_pgoff;
- last_pgoff = iter.index;
+ fe->pte += index - last_pgoff;
+ last_pgoff = index;
if (alloc_set_pte(fe, NULL, page))
goto unlock;
unlock_page(page);
@@ -2289,8 +2356,14 @@ void filemap_map_pages(struct fault_env *fe,
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*fe->pmd))
break;
- if (iter.index == end_pgoff)
+ if (index == end_pgoff)
break;
+ if (page && PageTransCompound(page) &&
+ (index & (HPAGE_PMD_NR - 1)) !=
+ HPAGE_PMD_NR - 1) {
+ index++;
+ goto repeat;
+ }
}
rcu_read_unlock();
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdcd25cb30fe..9888dbec1d01 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1878,6 +1878,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
+ struct page *subpage;
pgoff_t end = -1;
int i;

@@ -1886,8 +1887,26 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);

- if (!PageAnon(page))
- end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
+ if (!PageAnon(head)) {
+ struct address_space *mapping = head->mapping;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ __dec_node_page_state(head, NR_SHMEM_THPS);
+
+ radix_tree_split(&mapping->page_tree, head->index, 0);
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ head->index) {
+ if (iter.index >= head->index + HPAGE_PMD_NR)
+ break;
+ subpage = head + iter.index - head->index;
+ radix_tree_replace_slot(slot, subpage);
+ VM_BUG_ON_PAGE(compound_head(subpage) != head, subpage);
+ }
+ radix_tree_preload_end();
+
+ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ }

for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
@@ -1916,7 +1935,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
unfreeze_page(head);

for (i = 0; i < HPAGE_PMD_NR; i++) {
- struct page *subpage = head + i;
+ subpage = head + i;
if (subpage == page)
continue;
unlock_page(subpage);
@@ -2073,8 +2092,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}

- /* Addidional pins from radix tree */
- extra_pins = HPAGE_PMD_NR;
+ /* Addidional pin from radix tree */
+ extra_pins = 1;
anon_vma = NULL;
i_mmap_lock_read(mapping);
}
@@ -2096,6 +2115,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mlocked)
lru_add_drain();

+ if (mapping && radix_tree_split_preload(HPAGE_PMD_ORDER, 0,
+ GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto unfreeze;
+ }
+
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);

@@ -2105,10 +2130,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_lock(&mapping->tree_lock);
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(head));
- /*
- * Check if the head page is present in radix tree.
- * We assume all tail are present too, if head is there.
- */
+ /* Check if the page is present in radix tree */
if (radix_tree_deref_slot_protected(pslot,
&mapping->tree_lock) != head)
goto fail;
@@ -2123,8 +2145,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
- if (mapping)
- __dec_node_page_state(page, NR_SHMEM_THPS);
spin_unlock(&pgdata->split_queue_lock);
__split_huge_page(page, list, flags);
ret = 0;
@@ -2138,9 +2158,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
BUG();
}
spin_unlock(&pgdata->split_queue_lock);
-fail: if (mapping)
+fail: if (mapping) {
spin_unlock(&mapping->tree_lock);
+ radix_tree_preload_end();
+ }
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+unfreeze:
unfreeze_page(head);
ret = -EBUSY;
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 728d7790dc2d..c232541eec8c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1349,10 +1349,8 @@ static void collapse_shmem(struct mm_struct *mm,
break;
}
nr_none += n;
- for (; index < min(iter.index, end); index++) {
- radix_tree_insert(&mapping->page_tree, index,
- new_page + (index % HPAGE_PMD_NR));
- }
+ for (; index < min(iter.index, end); index++)
+ radix_tree_insert(&mapping->page_tree, index, new_page);

/* We are done. */
if (index >= end)
@@ -1421,8 +1419,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);

/* Finally, replace with the new page. */
- radix_tree_replace_slot(slot,
- new_page + (index % HPAGE_PMD_NR));
+ radix_tree_replace_slot(slot, new_page);

index++;
continue;
@@ -1439,24 +1436,17 @@ static void collapse_shmem(struct mm_struct *mm,
break;
}

- /*
- * Handle hole in radix tree at the end of the range.
- * This code only triggers if there's nothing in radix tree
- * beyond 'end'.
- */
- if (result == SCAN_SUCCEED && index < end) {
+ if (result == SCAN_SUCCEED) {
int n = end - index;

- if (!shmem_charge(mapping->host, n)) {
+ if (n && !shmem_charge(mapping->host, n)) {
result = SCAN_FAIL;
goto tree_locked;
}
-
- for (; index < end; index++) {
- radix_tree_insert(&mapping->page_tree, index,
- new_page + (index % HPAGE_PMD_NR));
- }
nr_none += n;
+
+ radix_tree_join(&mapping->page_tree, start,
+ HPAGE_PMD_ORDER, new_page);
}

tree_locked:
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..2bceff743346 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -541,33 +541,14 @@ static int shmem_add_to_page_cache(struct page *page,
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
VM_BUG_ON(expected && PageTransHuge(page));

- page_ref_add(page, nr);
+ get_page(page);
page->mapping = mapping;
page->index = index;

spin_lock_irq(&mapping->tree_lock);
- if (PageTransHuge(page)) {
- void __rcu **results;
- pgoff_t idx;
- int i;
-
- error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->page_tree,
- &results, &idx, index, 1) &&
- idx < index + HPAGE_PMD_NR) {
- error = -EEXIST;
- }
-
- if (!error) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->page_tree,
- index + i, page + i);
- VM_BUG_ON(error);
- }
- count_vm_event(THP_FILE_ALLOC);
- }
- } else if (!expected) {
- error = radix_tree_insert(&mapping->page_tree, index, page);
+ if (!expected) {
+ error = __radix_tree_insert(&mapping->page_tree, index,
+ compound_order(page), page);
} else {
error = shmem_radix_tree_replace(mapping, index, expected,
page);
@@ -575,15 +556,17 @@ static int shmem_add_to_page_cache(struct page *page,

if (!error) {
mapping->nrpages += nr;
- if (PageTransHuge(page))
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_ALLOC);
__inc_node_page_state(page, NR_SHMEM_THPS);
+ }
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
- page_ref_sub(page, nr);
+ put_page(page);
}
return error;
}
@@ -1734,8 +1717,7 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
PageTransHuge(page));
if (error)
goto unacct;
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
- compound_order(page));
+ error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, hindex,
NULL);
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
From: Matthew Wilcox <wi...@linux.intel.com>

In order to test the preload code, it is necessary to fail GFP_ATOMIC
allocations, which requires defining GFP_KERNEL and GFP_ATOMIC properly.
Remove the obsolete __GFP_WAIT and copy the definitions of the __GFP
flags which are used from the kernel include files. We also need the
real definition of gfpflags_allow_blocking() to persuade the radix tree
to actually use its preallocated nodes.
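
The behaviour being exercised is the blocking check in the preload path;
roughly (paraphrasing lib/radix-tree.c, for context only):

	int radix_tree_maybe_preload(gfp_t gfp_mask)
	{
		if (gfpflags_allow_blocking(gfp_mask))
			return radix_tree_preload(gfp_mask);
		/* preloading can't help a non-blocking allocation */
		preempt_disable();
		return 0;
	}

With the old stub that unconditionally returned 1, the non-blocking branch
could never be reached from the test suite.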

Signed-off-by: Matthew Wilcox <wi...@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
tools/testing/radix-tree/linux.c | 7 ++++++-
tools/testing/radix-tree/linux/gfp.h | 22 +++++++++++++++++++---
tools/testing/radix-tree/linux/slab.h | 5 -----
3 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
index 154823737b20..3cfb04e98e2f 100644
--- a/tools/testing/radix-tree/linux.c
+++ b/tools/testing/radix-tree/linux.c
@@ -33,7 +33,12 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,

void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
{
- void *ret = malloc(cachep->size);
+ void *ret;
+
+ if (flags & __GFP_NOWARN)
+ return NULL;
+
+ ret = malloc(cachep->size);
if (cachep->ctor)
cachep->ctor(ret);
uatomic_inc(&nr_allocated);
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index 5201b915f631..5b09b2ce6c33 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -3,8 +3,24 @@

#define __GFP_BITS_SHIFT 26
#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-#define __GFP_WAIT 1
-#define __GFP_ACCOUNT 0
-#define __GFP_NOWARN 0
+
+#define __GFP_HIGH 0x20u
+#define __GFP_IO 0x40u
+#define __GFP_FS 0x80u
+#define __GFP_NOWARN 0x200u
+#define __GFP_ATOMIC 0x80000u
+#define __GFP_ACCOUNT 0x100000u
+#define __GFP_DIRECT_RECLAIM 0x400000u
+#define __GFP_KSWAPD_RECLAIM 0x2000000u
+
+#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
+
+#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
+#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
+
+static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
+{
+ return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
+}

#endif
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
index 6d5a34770fd4..452e2bf502e3 100644
--- a/tools/testing/radix-tree/linux/slab.h
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -7,11 +7,6 @@
#define SLAB_PANIC 2
#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */

-static inline int gfpflags_allow_blocking(gfp_t mask)
-{
- return 1;
-}
-
struct kmem_cache {
int size;
void (*ctor)(void *);
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
Most of the work happens on the head page. Only when we need to copy data to
userspace do we find the relevant subpage.

We are still limited to PAGE_SIZE per iteration. Lifting this limitation would
require some more work.

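A sketch of the subpage selection (it is a single line in the hunk below;
'page' is the head page after the compound_head() call, 'index' is the 4k
index currently being read, and 'nr' stays capped at PAGE_SIZE):

	ret = copy_page_to_iter(page + index - page->index, offset, nr, iter);
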
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/filemap.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index f8387488636f..ca4536f2035e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1906,6 +1906,7 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
if (unlikely(page == NULL))
goto no_cached_page;
}
+ page = compound_head(page);
if (PageReadahead(page)) {
page_cache_async_readahead(mapping,
ra, filp, page,
@@ -1984,7 +1985,8 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
* now we can copy it to user space...
*/

- ret = copy_page_to_iter(page, offset, nr, iter);
+ ret = copy_page_to_iter(page + index - page->index, offset,
+ nr, iter);
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
@@ -2402,6 +2404,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* because there really aren't any performance issues here
* and we need to check for errors.
*/
+ page = compound_head(page);
ClearPageError(page);
error = mapping->a_ops->readpage(file, page);
if (!error) {
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
Slab pages can be compound, but we shouldn't treat them as THP for the purpose
of the hpage_* helpers; otherwise it would lead to confusing results.

For instance, ext4 uses slab pages for journal pages, and we shouldn't confuse
them with THPs. The easiest way is to exclude them in the hpage_* helpers.

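A minimal illustration of the failure mode (hypothetical caller): a page from
a high-order slab allocation is a compound page, so PageTransHuge() -- which
only tests PageHead() -- is true for it, and without the PageSlab() check a
helper like hpage_size() would report HPAGE_PMD_SIZE for what is really
slab-backed memory:

	len = hpage_size(page);	/* PAGE_SIZE for a slab page with this patch;
				 * HPAGE_PMD_SIZE without it, sending zeroing
				 * or IO helpers past the buffer they were given */
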
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/huge_mm.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 42934769f256..1300f8bb7523 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -137,21 +137,21 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
}
static inline int hpage_nr_pages(struct page *page)
{
- if (unlikely(PageTransHuge(page)))
+ if (unlikely(!PageSlab(page) && PageTransHuge(page)))
return HPAGE_PMD_NR;
return 1;
}

static inline int hpage_size(struct page *page)
{
- if (unlikely(PageTransHuge(page)))
+ if (unlikely(!PageSlab(page) && PageTransHuge(page)))
return HPAGE_PMD_SIZE;
return PAGE_SIZE;
}

static inline unsigned long hpage_mask(struct page *page)
{
- if (unlikely(PageTransHuge(page)))
+ if (unlikely(!PageSlab(page) && PageTransHuge(page)))
return HPAGE_PMD_MASK;
return PAGE_MASK;
}
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
This reverts commit 356e1c23292a4f63cfdf1daf0e0ddada51f32de8.

After conversion of huge tmpfs to multi-order entries, we don't need
this anymore.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/radix-tree.h | 1 -
lib/radix-tree.c | 74 ----------------------------------------------
2 files changed, 75 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 76fec3f52203..f84780aefd06 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -292,7 +292,6 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
unsigned long first_index, unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
-int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, unsigned int tag);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6c4ac58ab687..cb1de07ae5a1 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,9 +38,6 @@
#include <linux/preempt.h> /* in_interrupt() */


-/* Number of nodes in fully populated tree of given height */
-static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
-
/*
* Radix tree node cache.
*/
@@ -433,51 +430,6 @@ int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
#endif

/*
- * The same as function above, but preload number of nodes required to insert
- * (1 << order) continuous naturally-aligned elements.
- */
-int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
-{
- unsigned long nr_subtrees;
- int nr_nodes, subtree_height;
-
- /* Preloading doesn't help anything with this gfp mask, skip it */
- if (!gfpflags_allow_blocking(gfp_mask)) {
- preempt_disable();
- return 0;
- }
-
- /*
- * Calculate number and height of fully populated subtrees it takes to
- * store (1 << order) elements.
- */
- nr_subtrees = 1 << order;
- for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
- subtree_height++)
- nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
-
- /*
- * The worst case is zero height tree with a single item at index 0 and
- * then inserting items starting at ULONG_MAX - (1 << order).
- *
- * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
- * 0-index item.
- */
- nr_nodes = RADIX_TREE_MAX_PATH;
-
- /* Plus branch to fully populated subtrees. */
- nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;
-
- /* Root node is shared. */
- nr_nodes--;
-
- /* Plus nodes required to build subtrees. */
- nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
-
- return __radix_tree_preload(gfp_mask, nr_nodes);
-}
-
-/*
* The maximum index which can be stored in a radix tree
*/
static inline unsigned long shift_maxindex(unsigned int shift)
@@ -1845,31 +1797,6 @@ radix_tree_node_ctor(void *arg)
INIT_LIST_HEAD(&node->private_list);
}

-static __init unsigned long __maxindex(unsigned int height)
-{
- unsigned int width = height * RADIX_TREE_MAP_SHIFT;
- int shift = RADIX_TREE_INDEX_BITS - width;
-
- if (shift < 0)
- return ~0UL;
- if (shift >= BITS_PER_LONG)
- return 0UL;
- return ~0UL >> shift;
-}
-
-static __init void radix_tree_init_maxnodes(void)
-{
- unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
- unsigned int i, j;
-
- for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
- height_to_maxindex[i] = __maxindex(i);
- for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
- for (j = i; j > 0; j--)
- height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
- }
-}
-
static int radix_tree_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
@@ -1896,6 +1823,5 @@ void __init radix_tree_init(void)
sizeof(struct radix_tree_node), 0,
SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
radix_tree_node_ctor);
- radix_tree_init_maxnodes();
hotcpu_notifier(radix_tree_callback, 0);
}
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:20:13 PM
Here's a respin of my huge ext4 patchset on top of v4.9-rc2, with a couple of
fixes (see below).

Please review and consider applying.

I don't see any xfstests regressions with huge pages enabled. A patch with the
new configurations for xfstests-bld is below.

The basics are the same as with tmpfs[1], which is in Linus' tree now; ext4 is
built on top of it. The main difference is that we need to handle read-out
from and write-back to backing storage.

As with other THPs, the implementation is built around compound pages:
a naturally aligned collection of pages that the memory management subsystem
[in most cases] treats as a single entity:

- the head page (the first subpage) on the LRU represents the whole huge page;
- the head page's flags represent the state of the whole huge page (with few
  exceptions);
- mm can't migrate subpages of a compound page individually;

For THP, we use PMD-sized huge pages.

The head page links buffer heads for the whole huge page. Dirty/writeback/etc.
tracking happens at the per-hugepage level, as all subpages share the same
page flags.

lock_page() on any subpage locks the whole huge page for the same reason.

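For example, the lock helpers already funnel through the head page
(paraphrased from include/linux/pagemap.h, shown only to illustrate why
per-subpage locking never enters the picture):

	static inline int trylock_page(struct page *page)
	{
		page = compound_head(page);
		return likely(!test_and_set_bit_lock(PG_locked, &page->flags));
	}
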
In the radix-tree, a huge page is represented as a multi-order entry of the
same order (HPAGE_PMD_ORDER). This allows us to track dirty/writeback via
radix-tree tags with the same granularity as on struct page.

We read out the whole huge page at once. It required bumping BIO_MAX_PAGES to
no less than HPAGE_PMD_NR. I defined BIO_MAX_PAGES to HPAGE_PMD_NR if the huge
pagecache is enabled.

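Roughly the shape of that change (the exact guard in the patch may differ):

	#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
	#define BIO_MAX_PAGES	HPAGE_PMD_NR	/* 512 with 2MB THP */
	#else
	#define BIO_MAX_PAGES	256
	#endif
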
For IO via syscalls, we are still limited to copying up to PAGE_SIZE per
iteration. The limitation here comes from how copy_page_to_iter() and
copy_page_from_iter() work wrt. highmem: they can only handle one small page
at a time.

On the write side, we also have a problem with assuming small pages: the write
length and offset within the page are calculated before we know whether a
small or huge page will be allocated. It's not easy to fix. It looks like it
would require a change in the ->write_begin() interface to accept
len > PAGE_SIZE.

In split_huge_page() we need to free the buffers before splitting the page.
Page buffers take an additional pin on the page and can be a vector for
messing with the page during the split; we want to avoid this.
If try_to_free_buffers() fails, split_huge_page() returns -EBUSY.

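Roughly, in the split path (the v3->v4 changelog below notes that
try_to_release_page() is used rather than try_to_free_buffers(); the exact
condition and gfp mask here are illustrative):

	/* drop buffer heads before splitting; give up if they are pinned */
	if (!PageAnon(head) && page_has_buffers(head) &&
	    !try_to_release_page(head, GFP_KERNEL))
		return -EBUSY;
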
Readahead doesn't play well with huge pages: a 128k max readahead window,
assumptions about page size, PageReadahead() to track hit/miss. I've got it to
allocate huge pages, but it doesn't provide any readahead as such. I don't
know how to do this right. It's not clear at this point whether we really need
readahead with huge pages. I guess it's good enough for now.

Shadow entries are ignored on allocation -- a recently evicted page is not
promoted to the active list. I'm not sure whether the current workingset logic
is adequate for huge pages. On eviction, we split the huge page and set up 4k
shadow entries as usual.

Unlike tmpfs, ext4 makes use of tags in the radix-tree. The approach I used
for tmpfs -- 512 radix-tree entries per huge page -- doesn't work well if we
want a coherent view of the tags. So the first 8 patches of the patchset
convert tmpfs to use multi-order entries in the radix-tree. The same
infrastructure is used for ext4.

Encryption doesn't handle huge pages yet. To avoid regressions we just
disable huge pages for the inode if it has EXT4_INODE_ENCRYPT.

Tested with 4k, 1k, encryption and bigalloc. All with and without
huge=always. I think it's reasonable coverage.

The patchset is also in git:

git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git hugeext4/v4

[1] http://lkml.kernel.org/r/1465222029-45942-1-git-s...@linux.intel.com

Changes since v3:
- account huge page to dirty/writeback/reclaimable/etc. according to its
size. This fixes background writeback.
- move code that adds huge page to radix-tree to
page_cache_tree_insert() (Jan);
- make ramdisk work with huge pages;
- fix unaccounting of shadow entries (Jan);
- use try_to_release_page() instead of try_to_free_buffers() in
split_huge_page() (Jan);
- make thp_get_unmapped_area() respect S_HUGE_MODE;
- use huge-page aligned address to zap page range in wp_huge_pmd();
- use ext4_kvmalloc in ext4_mpage_readpages() instead of
kmalloc() (Andreas);

Changes since v2:
- fix intermittent crash in generic/299;
- typo (condition inversion) in do_generic_file_read(),
reported by Jitendra;

TODO:
- on IO via syscalls, copy more than PAGE_SIZE per iteration to/from
userspace;
- readahead ?;
- wire up madvise()/fadvise();
- encryption with huge pages;
- reclaim of file huge pages can be optimized -- split_huge_page() is not
required for pages with backing storage;

From f523dd3aad026f5a3f8cbabc0ec69958a0618f6b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill....@linux.intel.com>
Date: Fri, 12 Aug 2016 19:44:30 +0300
Subject: [PATCH] Add few more configurations to test ext4 with huge pages

Four new configurations: huge_4k, huge_1k, huge_bigalloc, huge_encrypt.

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
.../test-appliance/files/root/fs/ext4/cfg/all.list | 4 ++++
.../test-appliance/files/root/fs/ext4/cfg/huge_1k | 6 ++++++
.../test-appliance/files/root/fs/ext4/cfg/huge_4k | 6 ++++++
.../test-appliance/files/root/fs/ext4/cfg/huge_bigalloc | 14 ++++++++++++++
.../files/root/fs/ext4/cfg/huge_bigalloc.exclude | 7 +++++++
.../test-appliance/files/root/fs/ext4/cfg/huge_encrypt | 5 +++++
.../files/root/fs/ext4/cfg/huge_encrypt.exclude | 16 ++++++++++++++++
kvm-xfstests/util/parse_cli | 1 +
8 files changed, 59 insertions(+)
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_1k
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_4k
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc.exclude
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt
create mode 100644 kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt.exclude

diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/all.list b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/all.list
index 7ec37f4bafaa..14a8e72d2e6e 100644
--- a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/all.list
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/all.list
@@ -9,3 +9,7 @@ dioread_nolock
data_journal
bigalloc
bigalloc_1k
+huge_4k
+huge_1k
+huge_bigalloc
+huge_encrypt
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_1k b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_1k
new file mode 100644
index 000000000000..209c76a8a6c1
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_1k
@@ -0,0 +1,6 @@
+export FS=ext4
+export TEST_DEV=$SM_TST_DEV
+export TEST_DIR=$SM_TST_MNT
+export MKFS_OPTIONS="-q -b 1024"
+export EXT_MOUNT_OPTIONS="huge=always"
+TESTNAME="Ext4 1k block with huge pages"
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_4k b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_4k
new file mode 100644
index 000000000000..bae901cb2bab
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_4k
@@ -0,0 +1,6 @@
+export FS=ext4
+export TEST_DEV=$PRI_TST_DEV
+export TEST_DIR=$PRI_TST_MNT
+export MKFS_OPTIONS="-q"
+export EXT_MOUNT_OPTIONS="huge=always"
+TESTNAME="Ext4 4k block with huge pages"
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc
new file mode 100644
index 000000000000..b3d87562bce6
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc
@@ -0,0 +1,14 @@
+SIZE=large
+export MKFS_OPTIONS="-O bigalloc"
+export EXT_MOUNT_OPTIONS="huge=always"
+
+# Until we can teach xfstests the difference between cluster size and
+# block size, avoid collapse_range, insert_range, and zero_range since
+# these will fail due the fact that these operations require
+# cluster-aligned ranges.
+export FSX_AVOID="-C -I -z"
+export FSSTRESS_AVOID="-f collapse=0 -f insert=0 -f zero=0"
+export XFS_IO_AVOID="fcollapse finsert zero"
+
+TESTNAME="Ext4 4k block w/bigalloc"
+
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc.exclude b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc.exclude
new file mode 100644
index 000000000000..bd779be99518
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_bigalloc.exclude
@@ -0,0 +1,7 @@
+# bigalloc does not support on-line defrag
+ext4/301
+ext4/302
+ext4/303
+ext4/304
+ext4/307
+ext4/308
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt
new file mode 100644
index 000000000000..29f058ba937d
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt
@@ -0,0 +1,5 @@
+SIZE=small
+export MKFS_OPTIONS=""
+export EXT_MOUNT_OPTIONS="test_dummy_encryption,huge=always"
+REQUIRE_FEATURE=encryption
+TESTNAME="Ext4 encryption"
diff --git a/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt.exclude b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt.exclude
new file mode 100644
index 000000000000..b91cc58b5aa3
--- /dev/null
+++ b/kvm-xfstests/test-appliance/files/root/fs/ext4/cfg/huge_encrypt.exclude
@@ -0,0 +1,16 @@
+ext4/004 # dump/restore doesn't handle quotas
+
+# encryption doesn't play well with quota
+generic/082
+generic/219
+generic/230
+generic/231
+generic/232
+generic/233
+generic/235
+generic/270
+
+# generic/204 tests ENOSPC handling; it doesn't correctly
+# anticipate the external extended attribute required when
+# using a 1k block size
+generic/204
diff --git a/kvm-xfstests/util/parse_cli b/kvm-xfstests/util/parse_cli
index 83400ea71985..ba64ce5df016 100644
--- a/kvm-xfstests/util/parse_cli
+++ b/kvm-xfstests/util/parse_cli
@@ -36,6 +36,7 @@ print_help ()
echo "Common file system configurations are:"
echo " 4k 1k ext3 nojournal ext3conv metacsum dioread_nolock "
echo " data_journal bigalloc bigalloc_1k inline"
+ echo " huge_4k huge_1k huge_bigalloc huge_encrypt"
echo ""
echo "xfstest names have the form: ext4/NNN generic/NNN shared/NNN"
echo ""
--
2.9.3

Kirill A. Shutemov (37):
mm, shmem: switch huge tmpfs to multi-order radix-tree entries
Revert "radix-tree: implement radix_tree_maybe_preload_order()"
page-flags: relax page flag policy for few flags
mm, rmap: account file thp pages
thp: try to free page's buffers before attempt split
thp: handle write-protection faults for file THP
truncate: make sure invalidate_mapping_pages() can discard huge pages
filemap: allocate huge page in page_cache_read(), if allowed
filemap: handle huge pages in do_generic_file_read()
filemap: allocate huge page in pagecache_get_page(), if allowed
filemap: handle huge pages in filemap_fdatawait_range()
HACK: readahead: alloc huge pages, if allowed
block: define BIO_MAX_PAGES to HPAGE_PMD_NR if huge page cache enabled
brd: make it handle huge pages
mm: make write_cache_pages() work on huge pages
thp: introduce hpage_size() and hpage_mask()
thp: do not treat slab pages as huge in hpage_{nr_pages,size,mask}
thp: make thp_get_unmapped_area() respect S_HUGE_MODE
fs: make block_read_full_page() be able to read huge page
fs: make block_write_{begin,end}() be able to handle huge pages
fs: make block_page_mkwrite() aware about huge pages
truncate: make truncate_inode_pages_range() aware about huge pages
truncate: make invalidate_inode_pages2_range() aware about huge pages
mm: account huge pages to dirty, writeback, reclaimable, etc.
ext4: make ext4_mpage_readpages() hugepage-aware
ext4: make ext4_writepage() work on huge pages
ext4: handle huge pages in ext4_page_mkwrite()
ext4: handle huge pages in __ext4_block_zero_page_range()
ext4: make ext4_block_write_begin() aware about huge pages
ext4: handle huge pages in ext4_da_write_end()
ext4: make ext4_da_page_release_reservation() aware about huge pages
ext4: handle writeback with huge pages
ext4: make EXT4_IOC_MOVE_EXT work with huge pages
ext4: fix SEEK_DATA/SEEK_HOLE for huge pages
ext4: make fallocate() operations work with huge pages
mm, fs, ext4: expand use of page_mapping() and page_to_pgoff()
ext4, vfs: add huge= mount option

Matthew Wilcox (5):
tools: Add WARN_ON_ONCE
radix tree test suite: Allow GFP_ATOMIC allocations to fail
radix-tree: Add radix_tree_join
radix-tree: Add radix_tree_split
radix-tree: Add radix_tree_split_preload()

Naoya Horiguchi (1):
mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries

block/bio.c | 3 +-
drivers/base/node.c | 6 +
drivers/block/brd.c | 17 +-
fs/buffer.c | 106 +++---
fs/ext4/ext4.h | 5 +
fs/ext4/extents.c | 10 +-
fs/ext4/file.c | 18 +-
fs/ext4/inode.c | 171 ++++++----
fs/ext4/move_extent.c | 12 +-
fs/ext4/page-io.c | 11 +-
fs/ext4/readpage.c | 39 ++-
fs/ext4/super.c | 26 ++
fs/fs-writeback.c | 10 +-
fs/hugetlbfs/inode.c | 22 +-
fs/proc/meminfo.c | 4 +
fs/proc/task_mmu.c | 5 +-
include/linux/backing-dev.h | 10 +
include/linux/bio.h | 4 +
include/linux/buffer_head.h | 10 +-
include/linux/fs.h | 5 +
include/linux/huge_mm.h | 18 +-
include/linux/memcontrol.h | 5 +-
include/linux/mm.h | 1 +
include/linux/mmzone.h | 2 +
include/linux/page-flags.h | 12 +-
include/linux/pagemap.h | 32 +-
include/linux/radix-tree.h | 18 +-
include/linux/swap.h | 2 +
lib/radix-tree.c | 352 ++++++++++++++------
mm/filemap.c | 585 ++++++++++++++++++++++++----------
mm/huge_memory.c | 73 ++++-
mm/hugetlb.c | 19 +-
mm/khugepaged.c | 26 +-
mm/memory.c | 15 +-
mm/migrate.c | 1 +
mm/page-writeback.c | 86 +++--
mm/page_alloc.c | 5 +
mm/readahead.c | 17 +-
mm/rmap.c | 12 +-
mm/shmem.c | 36 +--
mm/truncate.c | 157 +++++++--
mm/vmstat.c | 2 +
mm/workingset.c | 23 ++
tools/include/asm/bug.h | 11 +
tools/testing/radix-tree/Makefile | 2 +-
tools/testing/radix-tree/linux.c | 7 +-
tools/testing/radix-tree/linux/bug.h | 2 +-
tools/testing/radix-tree/linux/gfp.h | 22 +-
tools/testing/radix-tree/linux/slab.h | 5 -
tools/testing/radix-tree/multiorder.c | 82 +++++
tools/testing/radix-tree/test.h | 9 +
51 files changed, 1542 insertions(+), 591 deletions(-)

--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:05 PM
From: Matthew Wilcox <wi...@linux.intel.com>

The radix-tree test suite uses its own buggy WARN_ON_ONCE. Replace it with
the definition from asm-generic/bug.h.
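
For reference, the likely bug: the old stub maps WARN_ON_ONCE(x) onto
assert(x), which aborts when x is false and yields no value, while the
kernel macro warns when the condition is true and evaluates to it. A
minimal userspace sketch of the difference (the simplified macro below is
an illustration, not the patch's definition):

#include <assert.h>
#include <stdio.h>

/* Simplified stand-in for the kernel macro: warn once when the
 * condition is TRUE and evaluate to the condition. */
#define WARN_ON_ONCE_SKETCH(condition) ({ \
        static int __warned; \
        int __ret_warn_once = !!(condition); \
        if (__ret_warn_once && !__warned) { \
                __warned = 1; \
                fprintf(stderr, "WARNING at %s:%d\n", __FILE__, __LINE__); \
        } \
        __ret_warn_once; \
})

int main(void)
{
        int bad = 1;

        /* Kernel-style: complain once, keep going, let the caller branch
         * on the result. */
        if (WARN_ON_ONCE_SKETCH(bad))
                fprintf(stderr, "recovering\n");

        /* Old test-suite stub: assert(bad) is silent here and would abort
         * only if bad were 0 -- the opposite sense. */
        assert(bad);
        return 0;
}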

Signed-off-by: Matthew Wilcox <wi...@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
tools/include/asm/bug.h | 11 +++++++++++
tools/testing/radix-tree/Makefile | 2 +-
tools/testing/radix-tree/linux/bug.h | 2 +-
3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h
index 9e5f4846967f..beda1a884b50 100644
--- a/tools/include/asm/bug.h
+++ b/tools/include/asm/bug.h
@@ -12,6 +12,17 @@
unlikely(__ret_warn_on); \
})

+#define WARN_ON_ONCE(condition) ({ \
+ static int __warned; \
+ int __ret_warn_once = !!(condition); \
+ \
+ if (unlikely(__ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
#define WARN_ONCE(condition, format...) ({ \
static int __warned; \
int __ret_warn_once = !!(condition); \
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index f2e07f2fd4b4..3c338dc3b162 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -1,5 +1,5 @@

-CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE
+CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE
LDFLAGS += -lpthread -lurcu
TARGETS = main
OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
index ccbe444977df..23b8ed52f8c8 100644
--- a/tools/testing/radix-tree/linux/bug.h
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -1 +1 @@
-#define WARN_ON_ONCE(x) assert(x)
+#include "asm/bug.h"
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:05 PM
From: Matthew Wilcox <wi...@linux.intel.com>

This new function allows many smaller entries in the radix tree to be
replaced with a single larger multi-order entry. From the point of view
of an RCU walker, it may see a mixture of the smaller entries and the
large entry during the same walk, but it will never see NULL for an
index that was populated before the join.
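
A rough usage sketch in the style of the test suite changes below
(item_create(), item_insert_order() and item_kill_tree() are existing
test-harness helpers; the collapse scenario is made up for illustration
and assumes the usual test-suite includes):

static void join_sketch(void)
{
        RADIX_TREE(tree, GFP_KERNEL);
        void *huge = item_create(0);

        /* A couple of order-0 entries somewhere in the first 8 slots. */
        item_insert_order(&tree, 0, 0);
        item_insert_order(&tree, 3, 0);

        /* Replace whatever covers indices 0..7 with one order-3 entry.
         * (A real test frees the replaced items, as __multiorder_join()
         * does via radix_tree_locate_item().) */
        assert(radix_tree_join(&tree, 0, 3, huge) == 0);

        /* Any index in the range now resolves to the joined entry. */
        assert(radix_tree_lookup(&tree, 5) == huge);

        item_kill_tree(&tree);
}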

Signed-off-by: Matthew Wilcox <wi...@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/radix-tree.h | 2 +
lib/radix-tree.c | 159 +++++++++++++++++++++++++++-------
tools/testing/radix-tree/multiorder.c | 32 +++++++
3 files changed, 163 insertions(+), 30 deletions(-)

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index af3581b8a451..1efd81f21241 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -319,6 +319,8 @@ static inline void radix_tree_preload_end(void)
preempt_enable();
}

+int radix_tree_join(struct radix_tree_root *, unsigned long index,
+ unsigned new_order, void *);
/**
* struct radix_tree_iter - radix tree iterator state
*
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e6d552c40dd..6a76252c93a6 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -314,18 +314,14 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
{
struct radix_tree_node *node =
container_of(head, struct radix_tree_node, rcu_head);
- int i;

/*
- * must only free zeroed nodes into the slab. radix_tree_shrink
- * can leave us with a non-NULL entry in the first slot, so clear
- * that here to make sure.
+ * Must only free zeroed nodes into the slab. We can be left with
+ * non-NULL entries by radix_tree_free_nodes, so clear the entries
+ * and tags here.
*/
- for (i = 0; i < RADIX_TREE_MAX_TAGS; i++)
- tag_clear(node, i, 0);
-
- node->slots[0] = NULL;
- node->count = 0;
+ memset(node->slots, 0, sizeof(node->slots));
+ memset(node->tags, 0, sizeof(node->tags));

kmem_cache_free(radix_tree_node_cachep, node);
}
@@ -563,14 +559,14 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
shift = radix_tree_load_root(root, &child, &maxindex);

/* Make sure the tree is high enough. */
+ if (order > 0 && max == ((1UL << order) - 1))
+ max++;
if (max > maxindex) {
int error = radix_tree_extend(root, max, shift);
if (error < 0)
return error;
shift = error;
child = root->rnode;
- if (order == shift)
- shift += RADIX_TREE_MAP_SHIFT;
}

while (shift > order) {
@@ -582,6 +578,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
return -ENOMEM;
child->shift = shift;
child->offset = offset;
+ child->count = 0;
child->parent = node;
rcu_assign_pointer(*slot, node_to_entry(child));
if (node)
@@ -595,31 +592,113 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
slot = &node->slots[offset];
}

+ if (nodep)
+ *nodep = node;
+ if (slotp)
+ *slotp = slot;
+ return 0;
+}
+
#ifdef CONFIG_RADIX_TREE_MULTIORDER
- /* Insert pointers to the canonical entry */
- if (order > shift) {
- unsigned i, n = 1 << (order - shift);
+/*
+ * Free any nodes below this node. The tree is presumed to not need
+ * shrinking, and any user data in the tree is presumed to not need a
+ * destructor called on it. If we need to add a destructor, we can
+ * add that functionality later. Note that we may not clear tags or
+ * slots from the tree as an RCU walker may still have a pointer into
+ * this subtree. We could replace the entries with RADIX_TREE_RETRY,
+ * but we'll still have to clear those in rcu_free.
+ */
+static void radix_tree_free_nodes(struct radix_tree_node *node)
+{
+ unsigned offset = 0;
+ struct radix_tree_node *child = entry_to_node(node);
+
+ for (;;) {
+ void *entry = child->slots[offset];
+ if (radix_tree_is_internal_node(entry) &&
+ !is_sibling_entry(child, entry)) {
+ child = entry_to_node(entry);
+ offset = 0;
+ continue;
+ }
+ offset++;
+ while (offset == RADIX_TREE_MAP_SIZE) {
+ struct radix_tree_node *old = child;
+ offset = child->offset + 1;
+ child = child->parent;
+ radix_tree_node_free(old);
+ if (old == entry_to_node(node))
+ return;
+ }
+ }
+}
+
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+ void *ptr, unsigned order, bool replace)
+{
+ struct radix_tree_node *child;
+ unsigned i, n, tag, offset, tags = 0;
+
+ if (node) {
+ n = 1 << (order - node->shift);
+ offset = get_slot_offset(node, slot);
+ } else {
+ n = 1;
+ offset = 0;
+ }
+
+ if (n > 1) {
offset = offset & ~(n - 1);
slot = &node->slots[offset];
- child = node_to_entry(slot);
- for (i = 0; i < n; i++) {
- if (slot[i])
+ }
+ child = node_to_entry(slot);
+
+ for (i = 0; i < n; i++) {
+ if (slot[i]) {
+ if (replace) {
+ node->count--;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tag_get(node, tag, offset + i))
+ tags |= 1 << tag;
+ } else
return -EEXIST;
}
+ }

- for (i = 1; i < n; i++) {
+ for (i = 0; i < n; i++) {
+ struct radix_tree_node *old = slot[i];
+ if (i) {
rcu_assign_pointer(slot[i], child);
- node->count++;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_clear(node, tag, offset + i);
+ } else {
+ rcu_assign_pointer(slot[i], ptr);
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+ if (tags & (1 << tag))
+ tag_set(node, tag, offset);
}
+ if (radix_tree_is_internal_node(old) &&
+ !is_sibling_entry(node, old))
+ radix_tree_free_nodes(old);
}
-#endif
-
- if (nodep)
- *nodep = node;
- if (slotp)
- *slotp = slot;
- return 0;
+ if (node)
+ node->count += n;
+ return n;
+}
+#else
+static inline int insert_entries(struct radix_tree_node *node, void **slot,
+ void *ptr, unsigned order, bool replace)
+{
+ if (*slot)
+ return -EEXIST;
+ rcu_assign_pointer(*slot, ptr);
+ if (node)
+ node->count++;
+ return 1;
}
+#endif

/**
* __radix_tree_insert - insert into a radix tree
@@ -642,13 +721,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
error = __radix_tree_create(root, index, order, &node, &slot);
if (error)
return error;
- if (*slot != NULL)
- return -EEXIST;
- rcu_assign_pointer(*slot, item);
+
+ error = insert_entries(node, slot, item, order, false);
+ if (error < 0)
+ return error;

if (node) {
unsigned offset = get_slot_offset(node, slot);
- node->count++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +825,26 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
}
EXPORT_SYMBOL(radix_tree_lookup);

+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+int radix_tree_join(struct radix_tree_root *root, unsigned long index,
+ unsigned order, void *item)
+{
+ struct radix_tree_node *node;
+ void **slot;
+ int error;
+
+ BUG_ON(radix_tree_is_internal_node(item));
+
+ error = __radix_tree_create(root, index, order, &node, &slot);
+ if (!error)
+ error = insert_entries(node, slot, item, order, true);
+ if (error > 0)
+ error = 0;
+
+ return error;
+}
+#endif
+
/**
* radix_tree_tag_set - set a tag on a radix tree node
* @root: radix tree root
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 05d7bc488971..4c66acc6d0ea 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -325,6 +325,37 @@ void multiorder_tagged_iteration(void)
item_kill_tree(&tree);
}

+static void __multiorder_join(unsigned long index,
+ unsigned order1, unsigned order2)
+{
+ unsigned long loc;
+ void *item, *item2 = item_create(index + 1);
+ RADIX_TREE(tree, GFP_KERNEL);
+
+ item_insert_order(&tree, index, order2);
+ item = radix_tree_lookup(&tree, index);
+ radix_tree_join(&tree, index + 1, order1, item2);
+ loc = radix_tree_locate_item(&tree, item);
+ if (loc == -1)
+ free(item);
+ item = radix_tree_lookup(&tree, index + 1);
+ assert(item == item2);
+ item_kill_tree(&tree);
+}
+
+static void multiorder_join(void)
+{
+ int i, j, idx;
+
+ for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
+ for (i = 1; i < 15; i++) {
+ for (j = 0; j < i; j++) {
+ __multiorder_join(idx, i, j);
+ }
+ }
+ }
+}
+
void multiorder_checks(void)
{
int i;
@@ -342,4 +373,5 @@ void multiorder_checks(void)
multiorder_tag_tests();
multiorder_iteration();
multiorder_tagged_iteration();
+ multiorder_join();
}
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:05 PM
We want mmap(NULL) to return a PMD-aligned address if the inode can have
huge pages in the page cache.
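
A hedged userspace illustration of the intended effect; the mount point
and the assumption that the file lives on a filesystem mounted with huge
pages enabled (huge=always) are hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define PMD_SIZE (2UL << 20)    /* 2 MiB on x86-64 */

int main(void)
{
        /* Hypothetical file on a filesystem mounted with huge=always. */
        int fd = open("/mnt/test/file", O_RDONLY);
        void *addr;

        if (fd < 0)
                return 1;

        /* No address hint: thp_get_unmapped_area() picks the placement. */
        addr = mmap(NULL, 4 * PMD_SIZE, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED)
                return 1;

        /* With this patch the mapping starts PMD-aligned, so page cache
         * huge pages can be mapped with PMD entries. */
        printf("addr %p, PMD-aligned: %s\n", addr,
               ((unsigned long)addr & (PMD_SIZE - 1)) ? "no" : "yes");

        munmap(addr, 4 * PMD_SIZE);
        close(fd);
        return 0;
}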

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/huge_memory.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40fe91ac383c..2c1524a8d5d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -518,10 +518,12 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+ struct inode *inode = filp->f_mapping->host;

if (addr)
goto out;
- if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+ if ((inode->i_flags & S_HUGE_MODE) == S_HUGE_NEVER &&
+ (!IS_DAX(inode) || !IS_ENABLED(CONFIG_FS_DAX_PMD)))
goto out;

addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:05 PM
The write path allocates pages using pagecache_get_page(). We should be
able to allocate huge pages there, if allowed. As usual, fall back to
small pages if that fails.
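
A rough restatement of the control flow in the hunk below, not the kernel
code itself (names are taken from this series; the -EEXIST retry for the
small-page path is omitted):

static struct page *get_page_for_write(struct address_space *mapping,
                                       pgoff_t offset, gfp_t gfp)
{
        struct page *page = page_cache_alloc_huge(mapping, offset, gfp);

        for (;;) {
                pgoff_t hoffset = offset;
                bool huge;

                if (!page)
                        page = __page_cache_alloc(gfp);
                if (!page)
                        return NULL;

                huge = PageTransHuge(page);
                if (huge)   /* huge pages are indexed by their head page */
                        hoffset = round_down(offset, HPAGE_PMD_NR);

                if (!add_to_page_cache_lru(page, mapping, hoffset, gfp))
                        return page + (offset - hoffset);

                put_page(page);
                if (!huge)
                        return NULL;    /* real code also handles -EEXIST here */
                page = NULL;            /* fall back to a small page */
        }
}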

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
mm/filemap.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index ca4536f2035e..954720092cf8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1352,13 +1352,16 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,

no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
+ pgoff_t hoffset;
int err;
if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;

- page = __page_cache_alloc(gfp_mask);
+ page = page_cache_alloc_huge(mapping, offset, gfp_mask);
+no_huge: if (!page)
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;

@@ -1369,14 +1372,25 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);

- err = add_to_page_cache_lru(page, mapping, offset,
+ if (PageTransHuge(page))
+ hoffset = round_down(offset, HPAGE_PMD_NR);
+ else
+ hoffset = offset;
+
+ err = add_to_page_cache_lru(page, mapping, hoffset,
gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
+ if (PageTransHuge(page)) {
+ put_page(page);
+ page = NULL;
+ goto no_huge;
+ }
put_page(page);
page = NULL;
if (err == -EEXIST)
goto repeat;
}
+ page += offset - hoffset;
}

return page;
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:06 PM
From: Naoya Horiguchi <n-hor...@ah.jp.nec.com>

Currently, hugetlb pages are linked to the page cache by hugepage offset
(derived from vma_hugecache_offset()) for historical reasons. This doesn't
match the generic usage of the page cache and requires extra routines to
convert between page offset and hugepage offset on common paths. This patch
adjusts the code for the multi-order radix tree to avoid that situation.

The main change is in the behavior of page->index for hugetlbfs. Before this
patch it represented the hugepage offset; with this patch it represents the
page offset, so index-related code has to be updated.
Note that hugetlb_fault_mutex_hash() and the reservation region handling
still work with the hugepage offset.
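
To make the convention change concrete, the arithmetic for a 2 MiB
hugepage on x86-64 (PAGE_SHIFT 12, huge_page_order 9); a small standalone
sketch of the before/after indices:

#include <stdio.h>

int main(void)
{
        unsigned long file_offset = 6UL << 20;          /* 6 MiB into the file */
        unsigned int page_shift = 12, huge_order = 9;

        unsigned long old_index = file_offset >> (page_shift + huge_order);
        unsigned long new_index = file_offset >> page_shift;

        printf("old hugetlb page->index (hugepage units): %lu\n", old_index);
        printf("new hugetlb page->index (PAGE_SIZE units): %lu\n", new_index);
        /* Paths that still need the hugepage offset (fault mutex hash,
         * reservations) derive it as new_index >> huge_page_order. */
        printf("derived hugepage offset: %lu\n", new_index >> huge_order);
        return 0;
}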

Signed-off-by: Naoya Horiguchi <n-hor...@ah.jp.nec.com>
[kirill....@linux.intel.com: reject fixed]
Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/hugetlbfs/inode.c | 22 ++++++++++------------
include/linux/pagemap.h | 10 +---------
mm/filemap.c | 31 ++++++++++++++++++-------------
mm/hugetlb.c | 19 ++++++-------------
4 files changed, 35 insertions(+), 47 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4fb7b10f3a05..45992c839794 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -388,8 +388,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> huge_page_shift(h);
- const pgoff_t end = lend >> huge_page_shift(h);
+ const pgoff_t start = lstart >> PAGE_SHIFT;
+ const pgoff_t end = lend >> PAGE_SHIFT;
struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
@@ -446,8 +446,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,

i_mmap_lock_write(mapping);
hugetlb_vmdelete_list(&mapping->i_mmap,
- next * pages_per_huge_page(h),
- (next + 1) * pages_per_huge_page(h));
+ next, next + 1);
i_mmap_unlock_write(mapping);
}

@@ -466,7 +465,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
- next, next + 1, 1)))
+ (next) << huge_page_order(h),
+ (next + 1) << huge_page_order(h), 1)))
hugetlb_fix_reserve_counts(inode);
}

@@ -550,8 +550,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
struct hstate *h = hstate_inode(inode);
struct vm_area_struct pseudo_vma;
struct mm_struct *mm = current->mm;
- loff_t hpage_size = huge_page_size(h);
- unsigned long hpage_shift = huge_page_shift(h);
pgoff_t start, index, end;
int error;
u32 hash;
@@ -567,8 +565,8 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
* For this range, start is rounded down and end is rounded up
* as well as being converted to page offsets.
*/
- start = offset >> hpage_shift;
- end = (offset + len + hpage_size - 1) >> hpage_shift;
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len + huge_page_size(h) - 1) >> PAGE_SHIFT;

inode_lock(inode);

@@ -586,7 +584,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pseudo_vma.vm_file = file;

- for (index = start; index < end; index++) {
+ for (index = start; index < end; index += pages_per_huge_page(h)) {
/*
* This is supposed to be the vaddr where the page is being
* faulted in, but we have no vaddr here.
@@ -607,10 +605,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
}

/* Set numa allocation policy based on index */
- hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index >> huge_page_order(h));

/* addr is the offset within the file (zero based) */
- addr = index * hpage_size;
+ addr = index << PAGE_SHIFT & ~huge_page_mask(h);

/* mutex taken here, fault path and hole punch */
hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index f9aa8bede15e..ff2c0fc7fff8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -390,15 +390,11 @@ static inline struct page *read_mapping_page(struct address_space *mapping,

/*
* Get the offset in PAGE_SIZE.
- * (TODO: hugepage should have ->index in PAGE_SIZE)
*/
static inline pgoff_t page_to_pgoff(struct page *page)
{
pgoff_t pgoff;

- if (unlikely(PageHeadHuge(page)))
- return page->index << compound_order(page);
-
if (likely(!PageTransTail(page)))
return page->index;

@@ -424,15 +420,11 @@ static inline loff_t page_file_offset(struct page *page)
return ((loff_t)page_index(page)) << PAGE_SHIFT;
}

-extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address);
-
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
pgoff_t pgoff;
- if (unlikely(is_vm_hugetlb_page(vma)))
- return linear_hugepage_index(vma, address);
+
pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff;
diff --git a/mm/filemap.c b/mm/filemap.c
index ecf5c2dba3fb..85274c4ca2c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -121,9 +121,8 @@ static int page_cache_tree_insert_huge(struct address_space *mapping,
/* Wipe shadow entires */
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
page->index) {
- if (iter.index >= page->index + HPAGE_PMD_NR)
+ if (iter.index >= page->index + hpage_nr_pages(page))
break;
-
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!p)
continue;
@@ -149,10 +148,15 @@ static int page_cache_tree_insert_huge(struct address_space *mapping,
if (error)
return error;

- count_vm_event(THP_FILE_ALLOC);
- mapping->nrpages += HPAGE_PMD_NR;
- *shadowp = NULL;
- __inc_node_page_state(page, NR_FILE_THPS);
+ if (PageHuge(page)) {
+ mapping->nrpages += 1 << compound_order(page);
+ } else if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_ALLOC);
+ mapping->nrpages += HPAGE_PMD_NR;
+ *shadowp = NULL;
+ __inc_node_page_state(page, NR_FILE_THPS);
+ } else
+ BUG();

__radix_tree_lookup(&mapping->page_tree, page->index, &node, NULL);
if (node) {
@@ -178,7 +182,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;

- if (PageTransHuge(page))
+ if (PageCompound(page))
return page_cache_tree_insert_huge(mapping, page, shadowp);

error = __radix_tree_create(&mapping->page_tree, page->index, 0,
@@ -235,7 +239,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
{
struct radix_tree_node *node;
void **slot;
- int nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ int nr = hpage_nr_pages(page);

VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1186,9 +1190,9 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
}

/* For multi-order entries, find relevant subpage */
- if (PageTransHuge(page)) {
+ if (PageCompound(page)) {
VM_BUG_ON(offset - page->index < 0);
- VM_BUG_ON(offset - page->index >= HPAGE_PMD_NR);
+ VM_BUG_ON(offset - page->index >= 1 << compound_order(page));
page += offset - page->index;
}
}
@@ -1556,16 +1560,17 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
}

/* For multi-order entries, find relevant subpage */
- if (PageTransHuge(page)) {
+ if (PageCompound(page)) {
VM_BUG_ON(index - page->index < 0);
- VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
+ VM_BUG_ON(index - page->index >=
+ 1 << compound_order(page));
page += index - page->index;
}

pages[ret] = page;
if (++ret == nr_pages)
break;
- if (!PageTransCompound(page))
+ if (PageHuge(page) || !PageTransCompound(page))
continue;
for (refs = 0; ret < nr_pages &&
(index + 1) % HPAGE_PMD_NR;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ec49d9ef1eef..121e042c00eb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -622,13 +622,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}

-pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address)
-{
- return vma_hugecache_offset(hstate_vma(vma), vma, address);
-}
-EXPORT_SYMBOL_GPL(linear_hugepage_index);
-
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -3514,7 +3507,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
pgoff_t idx;

mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ idx = linear_page_index(vma, address);

return find_lock_page(mapping, idx);
}
@@ -3531,7 +3524,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
struct page *page;

mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ idx = linear_page_index(vma, address);

page = find_get_page(mapping, idx);
if (page)
@@ -3586,7 +3579,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
+ size = i_size_read(mapping->host) >> PAGE_SHIFT;
if (idx >= size)
goto out;
page = alloc_huge_page(vma, address, 0);
@@ -3648,7 +3641,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,

ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
+ size = i_size_read(mapping->host) >> PAGE_SHIFT;
if (idx >= size)
goto backout;

@@ -3695,7 +3688,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,

if (vma->vm_flags & VM_SHARED) {
key[0] = (unsigned long) mapping;
- key[1] = idx;
+ key[1] = idx >> huge_page_order(h);
} else {
key[0] = (unsigned long) mm;
key[1] = address >> huge_page_shift(h);
@@ -3751,7 +3744,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}

mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ idx = linear_page_index(vma, address);

/*
* Serialize hugepage allocation and instantiation, so that we don't
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:06 PM
We write back a whole huge page at a time, so adjust the iteration accordingly.
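
A toy model of the new iteration step (HPAGE_PMD_NR and round_up() are
simplified here, not the kernel macros): once a page belonging to a huge
page has been handled, the lookup index jumps past the whole huge page and
the pagevec cursor is advanced by the remaining subpages.

#include <stdio.h>

#define HPAGE_PMD_NR 512UL
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long index = 700;       /* next lookup index */
        unsigned long done_index = 700;  /* pgoff of the page just written */
        unsigned long i = 3;             /* cursor within the pagevec */

        /* Pretend the page at pgoff 700 belongs to a THP covering 512..1023. */
        index = round_up(index + 1, HPAGE_PMD_NR);
        i += HPAGE_PMD_NR - done_index % HPAGE_PMD_NR - 1;

        printf("next lookup index: %lu (first page after the THP)\n", index);
        printf("pagevec cursor advanced to %lu (skips the THP's tail pages)\n", i);
        return 0;
}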

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/mm.h | 1 +
include/linux/pagemap.h | 1 +
mm/page-writeback.c | 17 ++++++++++++-----
3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3a191853faaa..315df8051d06 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1056,6 +1056,7 @@ extern pgoff_t __page_file_index(struct page *page);
*/
static inline pgoff_t page_index(struct page *page)
{
+ page = compound_head(page);
if (unlikely(PageSwapCache(page)))
return __page_file_index(page);
return page->index;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 712343108d31..f9aa8bede15e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -528,6 +528,7 @@ static inline void wait_on_page_locked(struct page *page)
*/
static inline void wait_on_page_writeback(struct page *page)
{
+ page = compound_head(page);
if (PageWriteback(page))
wait_on_page_bit(page, PG_writeback);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 439cc63ad903..c76fc90b7039 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2200,7 +2200,7 @@ int write_cache_pages(struct address_space *mapping,
* mapping. However, page->index will not change
* because we have a reference on the page.
*/
- if (page->index > end) {
+ if (page_to_pgoff(page) > end) {
/*
* can't be range_cyclic (1st pass) because
* end == -1 in that case.
@@ -2209,7 +2209,12 @@ int write_cache_pages(struct address_space *mapping,
break;
}

- done_index = page->index;
+ done_index = page_to_pgoff(page);
+ if (PageTransCompound(page)) {
+ index = round_up(index + 1, HPAGE_PMD_NR);
+ i += HPAGE_PMD_NR -
+ done_index % HPAGE_PMD_NR - 1;
+ }

lock_page(page);

@@ -2221,7 +2226,7 @@ int write_cache_pages(struct address_space *mapping,
* even if there is now a new, dirty page at the same
* pagecache address.
*/
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(page_mapping(page) != mapping)) {
continue_unlock:
unlock_page(page);
continue;
@@ -2259,7 +2264,8 @@ int write_cache_pages(struct address_space *mapping,
* not be suitable for data integrity
* writeout).
*/
- done_index = page->index + 1;
+ done_index = compound_head(page)->index
+ + hpage_nr_pages(page);
done = 1;
break;
}
@@ -2271,7 +2277,8 @@ int write_cache_pages(struct address_space *mapping,
* keep going until we have written all the pages
* we tagged for writeback prior to entering this loop.
*/
- if (--wbc->nr_to_write <= 0 &&
+ wbc->nr_to_write -= hpage_nr_pages(page);
+ if (wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:07 PM
This patch adds the basic functionality to put a huge page into the page
cache.

At the moment we only put a huge page into the radix tree if the range
covered by the huge page is empty.

We ignore shadow entries for now and just remove them from the tree before
inserting the huge page.

Later we can add logic to accumulate information from the shadow entries to
return to the caller (average eviction time?).
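
Heavily abridged shape of the new insert path in the hunk below (locking,
accounting and the shadow-entry cleanup call are trimmed); this is a
restatement for readability, not additional code:

static int insert_huge_sketch(struct address_space *mapping, struct page *page)
{
        struct radix_tree_iter iter;
        void **slot;

        /* Scan the 512 slots the huge page would cover. */
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
                                 page->index) {
                void *p;

                if (iter.index >= page->index + HPAGE_PMD_NR)
                        break;
                p = radix_tree_deref_slot_protected(slot,
                                                    &mapping->tree_lock);
                if (p && !radix_tree_exception(p))
                        return -EEXIST;         /* a real page is in the way */
                /* shadow entry: the real code wipes it before inserting */
        }

        /* A single multi-order entry covers the whole PMD-sized range. */
        return __radix_tree_insert(&mapping->page_tree, page->index,
                                   compound_order(page), page);
}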

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
include/linux/fs.h | 5 ++
include/linux/pagemap.h | 21 +++++-
include/linux/radix-tree.h | 10 +++
include/linux/swap.h | 2 +
mm/filemap.c | 174 +++++++++++++++++++++++++++++++++++++++++----
mm/truncate.c | 18 +----
mm/workingset.c | 23 ++++++
7 files changed, 218 insertions(+), 35 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 16d2b6e874d6..d4c60f914610 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1838,6 +1838,11 @@ struct super_operations {
#else
#define S_DAX 0 /* Make all the DAX code disappear */
#endif
+#define S_HUGE_MODE 0xc000
+#define S_HUGE_NEVER 0x0000
+#define S_HUGE_ALWAYS 0x4000
+#define S_HUGE_WITHIN_SIZE 0x8000
+#define S_HUGE_ADVISE 0xc000

/*
* Note that nosuid etc flags are inode-specific: setting some file-system
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index dd15d39e1985..712343108d31 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -201,14 +201,20 @@ static inline int page_cache_add_speculative(struct page *page, int count)
}

#ifdef CONFIG_NUMA
-extern struct page *__page_cache_alloc(gfp_t gfp);
+extern struct page *__page_cache_alloc_order(gfp_t gfp, unsigned int order);
#else
-static inline struct page *__page_cache_alloc(gfp_t gfp)
+static inline struct page *__page_cache_alloc_order(gfp_t gfp,
+ unsigned int order)
{
- return alloc_pages(gfp, 0);
+ return alloc_pages(gfp, order);
}
#endif

+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+ return __page_cache_alloc_order(gfp, 0);
+}
+
static inline struct page *page_cache_alloc(struct address_space *x)
{
return __page_cache_alloc(mapping_gfp_mask(x));
@@ -225,6 +231,15 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
__GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
}

+extern bool __page_cache_allow_huge(struct address_space *x, pgoff_t offset);
+static inline bool page_cache_allow_huge(struct address_space *x,
+ pgoff_t offset)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return false;
+ return __page_cache_allow_huge(x, offset);
+}
+
typedef int filler_t(void *, struct page *);

pgoff_t page_cache_next_hole(struct address_space *mapping,
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index f84780aefd06..88ea6a6a0539 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -418,6 +418,16 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
return NULL;
}

+static inline __must_check
+struct radix_tree_node *radix_tree_iter_to_node(struct radix_tree_root *root,
+ struct radix_tree_iter *iter, void **slot)
+{
+ if ((void **)&root->rnode == slot)
+ return NULL;
+ slot -= (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
+ return container_of(slot, struct radix_tree_node, slots[0]);
+}
+
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a56523cefb9b..7e961b5a2f98 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -246,6 +246,8 @@ struct swap_info_struct {
void *workingset_eviction(struct address_space *mapping, struct page *page);
bool workingset_refault(void *shadow);
void workingset_activation(struct page *page);
+void workingset_clear_exceptional_entry(struct address_space *mapping,
+ struct radix_tree_node *node, void **slot);
extern struct list_lru workingset_shadow_nodes;

static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
diff --git a/mm/filemap.c b/mm/filemap.c
index e9376610ad3c..f8387488636f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,6 +110,67 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/

+static int page_cache_tree_insert_huge(struct address_space *mapping,
+ struct page *page, void **shadowp)
+{
+ struct radix_tree_node *node;
+ struct radix_tree_iter iter;
+ void **slot, *p;
+ int error = 0;
+
+ /* Wipe shadow entires */
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+ page->index) {
+ if (iter.index >= page->index + HPAGE_PMD_NR)
+ break;
+
+ p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+ if (!p)
+ continue;
+
+ if (!radix_tree_exception(p)) {
+ error = -EEXIST;
+ break;
+ }
+
+ node = radix_tree_iter_to_node(&mapping->page_tree,
+ &iter, slot);
+ if (node) {
+ workingset_clear_exceptional_entry(mapping,
+ node, slot);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+ }
+ }
+
+ if (!error)
+ error = __radix_tree_insert(&mapping->page_tree,
+ page->index, compound_order(page), page);
+
+ if (error)
+ return error;
+
+ count_vm_event(THP_FILE_ALLOC);
+ mapping->nrpages += HPAGE_PMD_NR;
+ *shadowp = NULL;
+ __inc_node_page_state(page, NR_FILE_THPS);
+
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, NULL);
+ if (node) {
+ /*
+ * Don't track node that contains actual pages.
+ *
+ * Avoid acquiring the list_lru lock if already
+ * untracked. The list_empty() test is safe as
+ * node->private_list is protected by
+ * mapping->tree_lock.
+ */
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ }
+ return 0;
+}
+
static int page_cache_tree_insert(struct address_space *mapping,
struct page *page, void **shadowp)
{
@@ -117,6 +178,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;

+ if (PageTransHuge(page))
+ return page_cache_tree_insert_huge(mapping, page, shadowp);
+
error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
@@ -653,14 +717,14 @@ static int __add_to_page_cache_locked(struct page *page,
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
- int huge = PageHuge(page);
+ int hugetlb = PageHuge(page);
struct mem_cgroup *memcg;
int error;

VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);

- if (!huge) {
+ if (!hugetlb) {
error = mem_cgroup_try_charge(page, current->mm,
gfp_mask, &memcg, false);
if (error)
@@ -669,7 +733,7 @@ static int __add_to_page_cache_locked(struct page *page,

error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
- if (!huge)
+ if (!hugetlb)
mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -685,10 +749,11 @@ static int __add_to_page_cache_locked(struct page *page,
goto err_insert;

/* hugetlb pages do not participate in page cache accounting. */
- if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
+ if (!hugetlb)
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES,
+ hpage_nr_pages(page));
spin_unlock_irq(&mapping->tree_lock);
- if (!huge)
+ if (!hugetlb)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
@@ -696,7 +761,7 @@ static int __add_to_page_cache_locked(struct page *page,
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
- if (!huge)
+ if (!hugetlb)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return error;
@@ -753,7 +818,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

#ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct page *__page_cache_alloc_order(gfp_t gfp, unsigned int order)
{
int n;
struct page *page;
@@ -763,14 +828,14 @@ struct page *__page_cache_alloc(gfp_t gfp)
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = __alloc_pages_node(n, gfp, 0);
+ page = __alloc_pages_node(n, gfp, order);
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

return page;
}
- return alloc_pages(gfp, 0);
+ return alloc_pages(gfp, order);
}
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(__page_cache_alloc_order);
#endif

/*
@@ -1165,6 +1230,69 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
}
EXPORT_SYMBOL(find_lock_entry);

+bool __page_cache_allow_huge(struct address_space *mapping, pgoff_t offset)
+{
+ struct inode *inode = mapping->host;
+ struct radix_tree_iter iter;
+ void **slot;
+ struct page *page;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+
+ offset = round_down(offset, HPAGE_PMD_NR);
+
+ switch (inode->i_flags & S_HUGE_MODE) {
+ case S_HUGE_NEVER:
+ return false;
+ case S_HUGE_ALWAYS:
+ break;
+ case S_HUGE_WITHIN_SIZE:
+ if (DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ offset + HPAGE_PMD_NR)
+ return false;
+ break;
+ case S_HUGE_ADVISE:
+ /* TODO */
+ return false;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, offset) {
+ if (iter.index >= offset + HPAGE_PMD_NR)
+ break;
+
+ /* Shadow entires are fine */
+ page = radix_tree_deref_slot(slot);
+ if (page && !radix_tree_exception(page)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
+ return true;
+
+}
+
+static struct page *page_cache_alloc_huge(struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ struct page *page;
+
+ if (!page_cache_allow_huge(mapping, offset))
+ return NULL;
+
+ gfp_mask |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
+ page = __page_cache_alloc_order(gfp_mask, HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
+}
+
/**
* pagecache_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -2044,18 +2172,34 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
+ pgoff_t hoffset;
int ret;

do {
- page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+ page = page_cache_alloc_huge(mapping, offset, gfp_mask);
+no_huge:
+ if (!page)
+ page = __page_cache_alloc(gfp_mask|__GFP_COLD);
if (!page)
return -ENOMEM;

- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
- if (ret == 0)
+ if (PageTransHuge(page))
+ hoffset = round_down(offset, HPAGE_PMD_NR);
+ else
+ hoffset = offset;
+
+ ret = add_to_page_cache_lru(page, mapping, hoffset,
+ gfp_mask & GFP_KERNEL);
+
+ if (ret == -EEXIST && PageTransHuge(page)) {
+ put_page(page);
+ page = NULL;
+ goto no_huge;
+ } else if (ret == 0) {
ret = mapping->a_ops->readpage(file, page);
- else if (ret == -EEXIST)
+ } else if (ret == -EEXIST) {
ret = 0; /* losing race to add is OK */
+ }

put_page(page);

diff --git a/mm/truncate.c b/mm/truncate.c
index 393bf9447231..f88e2f1eb6f0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -49,23 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping,
goto unlock;
if (*slot != entry)
goto unlock;
- radix_tree_replace_slot(slot, NULL);
- mapping->nrexceptional--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes,
- &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
+ workingset_clear_exceptional_entry(mapping, node, slot);
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 617475f529f4..915f1f76e1ac 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -322,6 +322,29 @@ void workingset_activation(struct page *page)
rcu_read_unlock();
}

+void workingset_clear_exceptional_entry(struct address_space *mapping,
+ struct radix_tree_node *node, void **slot)
+{
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrexceptional--;
+ if (!node)
+ return;
+
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+}
+
/*
* Shadow entries reflect the share of the working set that does not
* fit into memory, so their number depends on the access pattern of
--
2.9.3

Kirill A. Shutemov

Oct 24, 2016, 8:30:07 PM
The approach is straightforward: for compound pages we read out the whole
huge page.

For a huge page we cannot keep the array of buffer head pointers on the
stack -- that would be 4096 pointers on x86-64 -- so 'arr' is allocated with
kmalloc() for huge pages.
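
The sizing that pushes 'arr' off the stack, as a quick back-of-the-envelope
check (x86-64 numbers assumed: 4 KiB pages, 2 MiB PMD huge pages, 512-byte
minimum block size):

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096, hpage_pmd_nr = 512, min_block = 512;
        unsigned long max_buf_per_page = page_size / min_block;        /* 8 */
        unsigned long ptrs = hpage_pmd_nr * max_buf_per_page;          /* 4096 */

        /* 4096 pointers * 8 bytes = 32 KiB -- twice the whole 16 KiB
         * kernel stack, hence kmalloc(GFP_NOFS) in the huge-page case. */
        printf("%lu pointers = %lu bytes\n", ptrs, ptrs * sizeof(void *));
        return 0;
}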

Signed-off-by: Kirill A. Shutemov <kirill....@linux.intel.com>
---
fs/buffer.c | 22 +++++++++++++++++-----
include/linux/buffer_head.h | 9 +++++----
include/linux/page-flags.h | 2 +-
3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index b205a629001d..35b76b1c0308 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -870,7 +870,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,

try_again:
head = NULL;
- offset = PAGE_SIZE;
+ offset = hpage_size(page);
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(GFP_NOFS);
if (!bh)
@@ -1465,7 +1465,7 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
bh->b_page = page;
- BUG_ON(offset >= PAGE_SIZE);
+ BUG_ON(offset >= hpage_size(page));
if (PageHighMem(page))
/*
* This catches illegal uses and preserves the offset:
@@ -2238,11 +2238,13 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *arr_on_stack[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head, **arr = arr_on_stack;
unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;

+ VM_BUG_ON_PAGE(PageTail(page), page);
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
@@ -2253,6 +2255,11 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
nr = 0;
i = 0;

+ if (PageTransHuge(page)) {
+ arr = kmalloc(sizeof(struct buffer_head *) * HPAGE_PMD_NR *
+ MAX_BUF_PER_PAGE, GFP_NOFS);
+ }
+
do {
if (buffer_uptodate(bh))
continue;
@@ -2268,7 +2275,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
SetPageError(page);
}
if (!buffer_mapped(bh)) {
- zero_user(page, i * blocksize, blocksize);
+ zero_user(page + (i * blocksize / PAGE_SIZE),
+ i * blocksize % PAGE_SIZE,
+ blocksize);
if (!err)
set_buffer_uptodate(bh);
continue;
@@ -2294,7 +2303,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
- return 0;
+ goto out;
}

/* Stage two: lock the buffers */
@@ -2316,6 +2325,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
else
submit_bh(REQ_OP_READ, 0, bh);
}
+out:
+ if (arr != arr_on_stack)
+ kfree(arr);
return 0;
}
EXPORT_SYMBOL(block_read_full_page);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 006a8a42acfb..194a85822d5f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -131,13 +131,14 @@ BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
BUFFER_FNS(Defer_Completion, defer_completion)

-#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
+#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~hpage_mask(bh->b_page))

/* If we *know* page->private refers to buffer_heads */
-#define page_buffers(page) \
+#define page_buffers(__page) \
({ \
- BUG_ON(!PagePrivate(page)); \
- ((struct buffer_head *)page_private(page)); \
+ struct page *p = compound_head(__page); \
+ BUG_ON(!PagePrivate(p)); \
+ ((struct buffer_head *)page_private(p)); \
})
#define page_has_buffers(page) PagePrivate(page)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index a2bef9a41bcf..20b7684e9298 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -730,7 +730,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page)
*/
static inline int page_has_private(struct page *page)
{
- return !!(page->flags & PAGE_FLAGS_PRIVATE);
+ return !!(compound_head(page)->flags & PAGE_FLAGS_PRIVATE);
}

#undef PF_ANY
--
2.9.3