
[PATCH 4/4] zram: make deduplication feature optional


js1...@gmail.com

Mar 15, 2017, 10:50:05 PM
From: Joonsoo Kim <iamjoon...@lge.com>

The benefit of deduplication depends on the workload, so it is not
preferable to always enable it. Therefore, make it optional.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 80 ++++++++++++++++++++++++++++++++++++++-----
drivers/block/zram/zram_drv.h | 1 +
2 files changed, 73 insertions(+), 8 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 012425f..e45aa9f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -328,6 +328,39 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}

+static ssize_t use_dedup_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->use_dedup;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t use_dedup_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change dedup usage for initialized device\n");
+ return -EBUSY;
+ }
+ zram->use_dedup = val;
+ up_write(&zram->init_lock);
+ return len;
+}
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -422,11 +455,23 @@ static ssize_t debug_stat_show(struct device *dev,
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

-static u32 zram_calc_checksum(unsigned char *mem)
+static u32 zram_calc_checksum(struct zram *zram, unsigned char *mem)
{
+ if (!zram->use_dedup)
+ return 0;
+
return jhash(mem, PAGE_SIZE, 0);
}

+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
+{
+ if (!zram->use_dedup)
+ return (unsigned long)entry;
+
+ return entry->handle;
+}
+
static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
@@ -438,6 +483,9 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
if (!handle)
return NULL;

+ if (!zram->use_dedup)
+ return (struct zram_entry *)handle;
+
entry = kzalloc(sizeof(*entry), flags);
if (!entry) {
zs_free(meta->mem_pool, handle);
@@ -462,6 +510,9 @@ static void zram_entry_insert(struct zram *zram, struct zram_entry *new,
struct rb_node **rb_node, *parent = NULL;
struct zram_entry *entry;

+ if (!zram->use_dedup)
+ return;
+
new->checksum = checksum;
hash = &meta->hash[checksum % meta->hash_size];
rb_root = &hash->rb_root;
@@ -492,7 +543,8 @@ static bool zram_entry_match(struct zram *zram, struct zram_entry *entry,
struct zram_meta *meta = zram->meta;
struct zcomp_strm *zstrm;

- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (entry->len == PAGE_SIZE) {
match = !memcmp(mem, cmem, PAGE_SIZE);
} else {
@@ -501,7 +553,7 @@ static bool zram_entry_match(struct zram *zram, struct zram_entry *entry,
match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));

return match;
}
@@ -521,6 +573,11 @@ static bool zram_entry_put(struct zram *zram, struct zram_meta *meta,
struct zram_hash *hash;
u32 checksum;

+ if (!zram->use_dedup) {
+ zs_free(meta->mem_pool, zram_entry_handle(zram, entry));
+ return false;
+ }
+
if (!populated)
goto free;

@@ -551,6 +608,9 @@ static struct zram_entry *zram_entry_get(struct zram *zram,
struct zram_entry *entry;
struct rb_node *rb_node;

+ if (!zram->use_dedup)
+ return NULL;
+
hash = &meta->hash[checksum % meta->hash_size];

spin_lock(&hash->lock);
@@ -713,7 +773,8 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
return 0;
}

- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (size == PAGE_SIZE) {
copy_page(mem, cmem);
} else {
@@ -722,7 +783,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
ret = zcomp_decompress(zstrm, cmem, size, mem);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

/* Should NEVER happen. Return bio error if it does. */
@@ -840,7 +901,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

- checksum = zram_calc_checksum(uncmem);
+ checksum = zram_calc_checksum(zram, uncmem);
if (!entry) {
entry = zram_entry_get(zram, uncmem, checksum);
if (entry) {
@@ -915,7 +976,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_WO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_WO);

if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
src = kmap_atomic(page);
@@ -927,7 +989,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

zcomp_stream_put(zram->comp);
zstrm = NULL;
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
zram_entry_insert(zram, entry, checksum);

found_duplication:
@@ -1310,6 +1372,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode)
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+static DEVICE_ATTR_RW(use_dedup);

static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1320,6 +1383,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode)
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 07d1f8d..b823555 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -141,5 +141,6 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ int use_dedup;
};
#endif
--
1.9.1

js1...@gmail.com

Mar 15, 2017, 10:50:05 PM
From: Joonsoo Kim <iamjoon...@lge.com>

This patch implements a deduplication feature in zram. The purpose
of this work is, naturally, to reduce the amount of memory used by zram.

Android is one of the biggest users of zram as swap, and reducing memory
usage there is really important. There is a paper reporting that the
duplication ratio of Android's memory content is rather high [1]. There is
also similar work on zswap reporting that experiments have shown that around
10-15% of the pages stored in zswap are duplicates, and deduplicating them
provides some benefit [2].

Also, there is a different kind of workload that uses zram as a block device
and stores build outputs on it to reduce the wear-out problem of a real
block device. In this workload the deduplication hit rate is very high,
although I don't know the exact reason for it.

Anyway, if we can detect duplicated content and avoid storing it in
separate memory, we can save memory. This patch tries to do that.

The implementation is fairly simple and intuitive, but I should note
one thing about an implementation detail.

To check for duplication, this patch uses a checksum of the page, and
collisions of this checksum are possible. There would be
many ways to handle this situation, but this patch chooses
to allow entries with a duplicated checksum to be added to the hash
while not comparing all entries with the duplicated checksum
when checking for duplication. I guess that checksum collision is quite
a rare event and we don't need to pay any attention to such a case.
Therefore, I chose the simplest way to implement the feature.
If there is a different opinion, I can accept it and go that way.

Following is the result of this patch.

Test result #1 (Swap):
Android Marshmallow, emulator, x86_64, Backporting to kernel v3.18

orig_data_size: 145297408
compr_data_size: 32408125
mem_used_total: 32276480
dup_data_size: 3188134
meta_data_size: 1444272

The last two metrics added to mm_stat are related to this work.
The first, dup_data_size, is the amount of memory saved by avoiding
the storage of duplicated pages. The second, meta_data_size, is the amount
of memory used by the data structures that support deduplication. If
dup > meta, we can judge that the patch improves memory usage.

In Android, we can save 5% of memory usage with this work.
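As a back-of-the-envelope check of that figure from the numbers above:
dup_data_size - meta_data_size = 3188134 - 1444272 = 1743862 bytes (~1.7MB)
of net saving, which is about 5.4% of mem_used_total (32276480 bytes).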

Test result #2 (Blockdev):
build the kernel and store output to ext4 FS on zram

<no-dedup>
Elapsed time: 249 s
mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0

<dedup>
Elapsed time: 250 s
mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038 3945792

There is no performance degradation, and 23% of memory is saved.

Test result #3 (Blockdev):
copy android build output dir(out/host) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 88 s
mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0

<dedup>
Elapsed time: out/host: 100 s
mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336

It shows roughly 13% performance degradation and saves 24% of memory. Maybe
it is due to the overhead of checksum calculation and comparison.

Test result #4 (Blockdev):
copy android build output dir(out/target/common) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 203 s
mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0

<dedup>
Elapsed time: out/host: 201 s
mm_stat: 4041666560 2310488276 1338150912 0 1338150912 476 0 989088794 24564336

Memory is saved by 42% and performance is the same. Even though there is
overhead from checksum calculation and comparison, the large hit ratio
compensates for it, since a hit means fewer compression attempts.

Anyway, the benefit seems to be largely dependent on the workload, so the
following patch will make this feature optional. However, this feature
can help some use cases, so it deserves to be merged.

[1]: MemScope: Analyzing Memory Duplication on Android Systems,
dl.acm.org/citation.cfm?id=2797023
[2]: zswap: Optimize compressed pool memory utilization,
lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 212 ++++++++++++++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 18 ++++
2 files changed, 214 insertions(+), 16 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index a3d9cbca..012425f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -32,6 +32,7 @@
#include <linux/idr.h>
#include <linux/sysfs.h>
#include <linux/cpuhotplug.h>
+#include <linux/jhash.h>

#include "zram_drv.h"

@@ -385,14 +386,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);

ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ (u64)atomic64_read(&zram->stats.dup_data_size),
+ (u64)atomic64_read(&zram->stats.meta_data_size));
up_read(&zram->init_lock);

return ret;
@@ -419,29 +422,165 @@ static ssize_t debug_stat_show(struct device *dev,
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

-static struct zram_entry *zram_entry_alloc(struct zram_meta *meta,
+static u32 zram_calc_checksum(unsigned char *mem)
+{
+ return jhash(mem, PAGE_SIZE, 0);
+}
+
+static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
+ struct zram_meta *meta = zram->meta;
struct zram_entry *entry;
+ unsigned long handle;

- entry = kzalloc(sizeof(*entry), flags);
- if (!entry)
+ handle = zs_malloc(meta->mem_pool, len, flags);
+ if (!handle)
return NULL;

- entry->handle = zs_malloc(meta->mem_pool, len, flags);
- if (!entry->handle) {
- kfree(entry);
+ entry = kzalloc(sizeof(*entry), flags);
+ if (!entry) {
+ zs_free(meta->mem_pool, handle);
return NULL;
}

+ entry->handle = handle;
+ RB_CLEAR_NODE(&entry->rb_node);
+ entry->refcount = 1;
+ entry->len = len;
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
return entry;
}

-static inline void zram_entry_free(struct zram_meta *meta,
- struct zram_entry *entry)
+static void zram_entry_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+ rb_root = &hash->rb_root;
+
+ spin_lock(&hash->lock);
+ rb_node = &rb_root->rb_node;
+ while (*rb_node) {
+ parent = *rb_node;
+ entry = rb_entry(parent, struct zram_entry, rb_node);
+ if (checksum < entry->checksum)
+ rb_node = &parent->rb_left;
+ else if (checksum > entry->checksum)
+ rb_node = &parent->rb_right;
+ else
+ rb_node = &parent->rb_left;
+ }
+
+ rb_link_node(&new->rb_node, parent, rb_node);
+ rb_insert_color(&new->rb_node, rb_root);
+ spin_unlock(&hash->lock);
+}
+
+static bool zram_entry_match(struct zram *zram, struct zram_entry *entry,
+ unsigned char *mem)
+{
+ bool match = false;
+ unsigned char *cmem;
+ struct zram_meta *meta = zram->meta;
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(meta->mem_pool, entry->handle);
+
+ return match;
+}
+
+static inline void zram_entry_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
{
zs_free(meta->mem_pool, entry->handle);
kfree(entry);
+ if (zram)
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
+}
+
+static bool zram_entry_put(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry, bool populated)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+
+ if (!populated)
+ goto free;
+
+ checksum = entry->checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ entry->refcount--;
+ if (!entry->refcount) {
+ populated = false;
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ RB_CLEAR_NODE(&entry->rb_node);
+ }
+ spin_unlock(&hash->lock);
+
+free:
+ if (!populated)
+ zram_entry_free(zram, meta, entry);
+
+ return populated;
+}
+
+static struct zram_entry *zram_entry_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_entry_match(zram, entry, mem))
+ return entry;
+
+ if (zram_entry_put(zram, meta, entry, true)) {
+ atomic64_sub(entry->len,
+ &zram->stats.dup_data_size);
+ }
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
}

static void zram_meta_free(struct zram_meta *meta, u64 disksize)
@@ -459,18 +598,31 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
if (!entry || zram_test_flag(meta, index, ZRAM_SAME))
continue;

- zram_entry_free(meta, entry);
+ zram_entry_put(NULL, meta, entry, true);
}

zs_destroy_pool(meta->mem_pool);
+ vfree(meta->hash);
vfree(meta->table);
kfree(meta);
}

+static void zram_meta_init(struct zram_meta *meta)
+{
+ int i;
+ struct zram_hash *hash;
+
+ for (i = 0; i < meta->hash_size; i++) {
+ hash = &meta->hash[i];
+ spin_lock_init(&hash->lock);
+ hash->rb_root = RB_ROOT;
+ }
+}
+
static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
{
size_t num_pages;
- struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+ struct zram_meta *meta = kzalloc(sizeof(*meta), GFP_KERNEL);

if (!meta)
return NULL;
@@ -482,15 +634,27 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}

+ meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+ meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
+ meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
+ meta->hash = vzalloc(meta->hash_size * sizeof(struct zram_hash));
+ if (!meta->hash) {
+ pr_err("Error allocating zram entry hash\n");
+ goto out_error;
+ }
+
meta->mem_pool = zs_create_pool(pool_name);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
goto out_error;
}

+ zram_meta_init(meta);
+
return meta;

out_error:
+ vfree(meta->hash);
vfree(meta->table);
kfree(meta);
return NULL;
@@ -520,7 +684,8 @@ static void zram_free_page(struct zram *zram, size_t index)
if (!entry)
return;

- zram_entry_free(meta, entry);
+ if (zram_entry_put(zram, meta, entry, true))
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);

atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
@@ -631,6 +796,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
struct zcomp_strm *zstrm = NULL;
unsigned long alloced_pages;
unsigned long element;
+ u32 checksum;

page = bvec->bv_page;
if (is_partial_io(bvec)) {
@@ -674,6 +840,18 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

+ checksum = zram_calc_checksum(uncmem);
+ if (!entry) {
+ entry = zram_entry_get(zram, uncmem, checksum);
+ if (entry) {
+ if (!is_partial_io(bvec))
+ kunmap_atomic(user_mem);
+
+ clen = entry->len;
+ goto found_duplication;
+ }
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zcomp_compress(zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
@@ -708,7 +886,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
* from the slow path and entry has already been allocated.
*/
if (!entry) {
- entry = zram_entry_alloc(meta, clen,
+ entry = zram_entry_alloc(zram, clen,
__GFP_KSWAPD_RECLAIM | __GFP_NOWARN);
}

@@ -718,7 +896,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

atomic64_inc(&zram->stats.writestall);

- entry = zram_entry_alloc(meta, clen, GFP_NOIO);
+ entry = zram_entry_alloc(zram, clen, GFP_NOIO);
if (entry)
goto compress_again;

@@ -732,7 +910,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
update_used_max(zram, alloced_pages);

if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zram_entry_free(meta, entry);
+ zram_entry_put(zram, meta, entry, false);
ret = -ENOMEM;
goto out;
}
@@ -750,7 +928,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
zcomp_stream_put(zram->comp);
zstrm = NULL;
zs_unmap_object(meta->mem_pool, entry->handle);
+ zram_entry_insert(zram, entry, checksum);

+found_duplication:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a7ae46c..07d1f8d 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,6 +18,7 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>

#include "zcomp.h"

@@ -45,6 +46,10 @@
#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))

+/* One slot will contain 128 pages theoretically */
+#define ZRAM_HASH_SHIFT 7
+#define ZRAM_HASH_SIZE_MIN (1 << 10)
+#define ZRAM_HASH_SIZE_MAX (1 << 31)

/*
* The lower ZRAM_FLAG_SHIFT bits of table.value is for
@@ -70,6 +75,10 @@ enum zram_pageflags {
/*-- Data structures */

struct zram_entry {
+ struct rb_node rb_node;
+ u32 len;
+ u32 checksum;
+ unsigned long refcount;
unsigned long handle;
};

@@ -94,11 +103,20 @@ struct zram_stats {
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
+ atomic64_t dup_data_size; /* compressed size of pages duplicated */
+ atomic64_t meta_data_size; /* size of zram_entries */
+};
+
+struct zram_hash {
+ spinlock_t lock;
+ struct rb_root rb_root;
};

struct zram_meta {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
+ struct zram_hash *hash;
+ size_t hash_size;
};

struct zram {
--
1.9.1

js1...@gmail.com

Mar 15, 2017, 10:50:05 PM
From: Joonsoo Kim <iamjoon...@lge.com>

This patchset implements a deduplication feature in zram. The motivation
is to reduce the memory used by zram. There is a detailed description
of the motivation and experimental results in patch 3, so please
refer to it.

Thanks.

Joonsoo Kim (4):
mm/zsmalloc: always set movable/highmem flag to the zspage
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional

drivers/block/zram/zram_drv.c | 338 +++++++++++++++++++++++++++++++++++++-----
drivers/block/zram/zram_drv.h | 25 +++-
mm/zsmalloc.c | 10 +-
3 files changed, 330 insertions(+), 43 deletions(-)

--
1.9.1

js1...@gmail.com

Mar 15, 2017, 10:50:06 PM
From: Joonsoo Kim <iamjoon...@lge.com>

A zspage is always movable and is used through the zs_map_object() function,
which returns a directly accessible pointer to the content of the zspage.
This is independent of the user's allocation flags.
Therefore, it's better to always set the movable/highmem flags on the zspage.
After that, we don't need the __GFP_MOVABLE/__GFP_HIGHMEM clearing in
cache_alloc_handle()/cache_alloc_zspage(), since there is no zs_malloc()
caller who specifies __GFP_MOVABLE/__GFP_HIGHMEM.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 9 ++-------
mm/zsmalloc.c | 10 ++++------
2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0194441..f65dcd1 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -684,19 +684,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
*/
if (!handle)
handle = zs_malloc(meta->mem_pool, clen,
- __GFP_KSWAPD_RECLAIM |
- __GFP_NOWARN |
- __GFP_HIGHMEM |
- __GFP_MOVABLE);
+ __GFP_KSWAPD_RECLAIM | __GFP_NOWARN);
if (!handle) {
zcomp_stream_put(zram->comp);
zstrm = NULL;

atomic64_inc(&zram->stats.writestall);

- handle = zs_malloc(meta->mem_pool, clen,
- GFP_NOIO | __GFP_HIGHMEM |
- __GFP_MOVABLE);
+ handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO);
if (handle)
goto compress_again;

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7ee9c3..fada232 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -347,8 +347,7 @@ static void destroy_cache(struct zs_pool *pool)

static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
{
- return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep, gfp);
}

static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
@@ -358,9 +357,8 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle)

static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
- return kmem_cache_alloc(pool->zspage_cachep,
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
-}
+ return kmem_cache_alloc(pool->zspage_cachep, flags);
+};

static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
{
@@ -1120,7 +1118,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
for (i = 0; i < class->pages_per_zspage; i++) {
struct page *page;

- page = alloc_page(gfp);
+ page = alloc_page(gfp | __GFP_MOVABLE | __GFP_HIGHMEM);
if (!page) {
while (--i >= 0) {
dec_zone_page_state(pages[i], NR_ZSPAGES);
--
1.9.1

Minchan Kim

Mar 21, 2017, 7:20:05 AM
Hi Joonsoo,

On Thu, Mar 16, 2017 at 11:46:35AM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> Zspage is always movable and is used through zs_map_object() function
> which returns directly accessible pointer that contains content of
> zspage. It is independent on the user's allocation flag.
> Therefore, it's better to always set movable/highmem flag to the zspage.
> After that, we don't need __GFP_MOVABLE/__GFP_HIGHMEM clearing in
> cache_alloc_handle()/cache_alloc_zspage() since there is no zs_malloc
> caller who specifies __GFP_MOVABLE/__GFP_HIGHMEM.

Hmm, I wanted this when you first pointed it out to me, but when I think about
it again, I don't see it as an improvement. Sorry for that.
zs_malloc() is an exported symbol and it has a gfp_t argument, so the user can
do whatever he wants with any zone modifier flags. IOW, if some user wants
to allocate pages from the {normal|dma} zone for whatever reason, he can
omit __GFP_HIGHMEM from the gfp flags to fulfill that goal.

Minchan Kim

Mar 21, 2017, 7:50:06 PM
Hi Joonsoo,

On Thu, Mar 16, 2017 at 11:46:37AM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> This patch implements deduplication feature in zram. The purpose
> of this work is naturally to save amount of memory usage by zram.
>
> Android is one of the biggest users to use zram as swap and it's
> really important to save amount of memory usage. There is a paper
> that reports that duplication ratio of Android's memory content is
> rather high [1]. And, there is a similar work on zswap that also
> reports that experiments has shown that around 10-15% of pages
> stored in zswp are duplicates and deduplicate them provides some
> benefits [2].
>
> Also, there is a different kind of workload that uses zram as blockdev
> and store build outputs into it to reduce wear-out problem of real
> blockdev. In this workload, deduplication hit is very high
> although I don't know exact reason about it.

Hmm, isn't it because the binary is composed of object files, so that many
parts of the binary and the objects are shareable?

I'd like to clear this up because dedup was not useful for the swap workload
in my Android testing, unlike the papers you mentioned.
Of course, it depends on the workload, so someone might find it hugely
useful for his swap workload. However, I want to focus on a clear
winning-case scenario rather than a "might be better" one.

That's why I want to know the clear reason for the saving. If my assumption
is right (i.e., object files vs. binary files), it would be enough
justification to merge this feature, because users could turn on the feature
with a reasonable expectation.

>
> Anyway, if we can detect duplicated content and avoid to store duplicated
> content at different memory space, we can save memory. This patch
> tries to do that.
>
> Implementation is almost simple and intuitive but I should note
> one thing about implementation detail.
>
> To check duplication, this patch uses checksum of the page and
> collision of this checksum could be possible. There would be
> many choices to handle this situation but this patch chooses
> to allow entry with duplicated checksum to be added to the hash,
> but, not to compare all entries with duplicated checksum
> when checking duplication. I guess that checksum collision is quite

Hmm, if there are many entries with a duplicated checksum, which specific
one is compared?

> rare event and we don't need to pay any attention to such a case.

If it's a rare event, why can't we iterate over all of the entries?
Cool! It would help a lot for users who use zram to hold a build output directory.

>
> Anyway, benefit seems to be largely dependent on the workload so
> following patch will make this feature optional. However, this feature
> can help some usecases so is deserved to be merged.
>
> [1]: MemScope: Analyzing Memory Duplication on Android Systems,
> dl.acm.org/citation.cfm?id=2797023
> [2]: zswap: Optimize compressed pool memory utilization,
> lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2

Overall, it looks good. Thanks!
However, I want to change the function naming/structuring to my preferred
style for maintainability, because it's a non-trivial feature. So, please look below.
zram_dedup_stat_read?

The reason to use wrapper functions is that I really want to remove any
dedup-related code unless the user turns on the feature in Kconfig.
Wrapper functions would be cleaner than ifdefs.
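For example, something along these lines (a rough sketch only; the names are
illustrative, not a final interface): a small zram_dedup.h that declares the
real functions under the Kconfig option and provides empty static inline stubs
otherwise, so callers need no #ifdef and the dedup code compiles away when the
feature is off:

#ifdef CONFIG_ZRAM_DEDUP
u32 zram_dedup_checksum(struct zram *zram, unsigned char *mem);
struct zram_entry *zram_dedup_get(struct zram *zram, unsigned char *mem,
				u32 checksum);
void zram_dedup_insert(struct zram *zram, struct zram_entry *entry,
				u32 checksum);
#else
static inline u32 zram_dedup_checksum(struct zram *zram, unsigned char *mem)
{
	return 0;
}
static inline struct zram_entry *zram_dedup_get(struct zram *zram,
				unsigned char *mem, u32 checksum)
{
	return NULL;
}
static inline void zram_dedup_insert(struct zram *zram,
				struct zram_entry *entry, u32 checksum)
{
}
#endif

That way zram_drv.c keeps a single code path regardless of the Kconfig setting.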

> + (u64)atomic64_read(&zram->stats.meta_data_size));


> up_read(&zram->init_lock);
>
> return ret;
> @@ -419,29 +422,165 @@ static ssize_t debug_stat_show(struct device *dev,
> static DEVICE_ATTR_RO(mm_stat);
> static DEVICE_ATTR_RO(debug_stat);
>
> -static struct zram_entry *zram_entry_alloc(struct zram_meta *meta,
> +static u32 zram_calc_checksum(unsigned char *mem)

zram_dedup_checksum.

I want to use the 'dedup' term for all dedup-related functions.


> +{
> + return jhash(mem, PAGE_SIZE, 0);
> +}
> +
> +static struct zram_entry *zram_entry_alloc(struct zram *zram,
> unsigned int len, gfp_t flags)
> {
> + struct zram_meta *meta = zram->meta;
> struct zram_entry *entry;
> + unsigned long handle;
>
> - entry = kzalloc(sizeof(*entry), flags);
> - if (!entry)
> + handle = zs_malloc(meta->mem_pool, len, flags);
> + if (!handle)
> return NULL;

Separate the zram_entry allocation and the dedup part.
IOW,

struct zram_entry *entry = zram_entry_alloc(zram, xxx);
if (!zram_dedup_init(zram, entry, xxx))
zram_entry_free(entry);

>
> - entry->handle = zs_malloc(meta->mem_pool, len, flags);
> - if (!entry->handle) {
> - kfree(entry);
> + entry = kzalloc(sizeof(*entry), flags);
> + if (!entry) {
> + zs_free(meta->mem_pool, handle);
> return NULL;
> }
>
> + entry->handle = handle;
> + RB_CLEAR_NODE(&entry->rb_node);
> + entry->refcount = 1;
> + entry->len = len;
> + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
> +
> return entry;
> }
>
> -static inline void zram_entry_free(struct zram_meta *meta,
> - struct zram_entry *entry)
> +static void zram_entry_insert(struct zram *zram, struct zram_entry *new,

zram_dedup_insert or add?
Ditto.

zram_dedup_match?
Please separate the entry and dedup parts like zram_entry_alloc.
Then, can't we remove 'populated'?
We can rename it to zram_entry_match.

> + unsigned char *mem, u32 checksum)
> +{
> + struct zram_meta *meta = zram->meta;
> + struct zram_hash *hash;
> + struct zram_entry *entry;
> + struct rb_node *rb_node;
> +
> + hash = &meta->hash[checksum % meta->hash_size];
> +
> + spin_lock(&hash->lock);
> + rb_node = hash->rb_root.rb_node;
> + while (rb_node) {
> + entry = rb_entry(rb_node, struct zram_entry, rb_node);
> + if (checksum == entry->checksum) {
> + entry->refcount++;
> + atomic64_add(entry->len, &zram->stats.dup_data_size);
> + spin_unlock(&hash->lock);
> +
> + if (zram_entry_match(zram, entry, mem))

__zram_entry_match? Or just open-code.
zram_dedup_init?

Can't we put the above hash initialization routines into zram_meta_init?

> +
> return meta;
>
> out_error:
> + vfree(meta->hash);
> vfree(meta->table);
> kfree(meta);
> return NULL;
> @@ -520,7 +684,8 @@ static void zram_free_page(struct zram *zram, size_t index)
> if (!entry)
> return;
>
> - zram_entry_free(meta, entry);
> + if (zram_entry_put(zram, meta, entry, true))
> + atomic64_sub(entry->len, &zram->stats.dup_data_size);

Hmm,

I want to put the dedup stat logic in the dedup functions without exposing
it to higher-level functions like zram_free_page.

So,

zram_dedup_put:
...
...
atomic64_sub(entry->len, &zram->stats.dup_data_size);

zram_free_page:
...
if (zram_dedup_put())
zram_entry_free

>
> atomic64_sub(zram_get_obj_size(meta, index),
> &zram->stats.compr_data_size);
> @@ -631,6 +796,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
> struct zcomp_strm *zstrm = NULL;
> unsigned long alloced_pages;
> unsigned long element;
> + u32 checksum;
>
> page = bvec->bv_page;
> if (is_partial_io(bvec)) {
> @@ -674,6 +840,18 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
> goto out;
> }
>
> + checksum = zram_calc_checksum(uncmem);
> + if (!entry) {
> + entry = zram_entry_get(zram, uncmem, checksum);

entry = zram_dedup_get
Nit:

I prefer found_dup.
It should be compiled out if the user doesn't turn on the feature in Kconfig.
I hope you add a Kconfig option in [4/4].

Minchan Kim

Mar 21, 2017, 8:10:05 PM
On Thu, Mar 16, 2017 at 11:46:38AM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> Benefit of deduplication is dependent on the workload so it's not
> preferable to always enable. Therefore, make it optional.

Please add a Kconfig option, too. And write the description so it emphasizes
"helps a lot for users who use zram to hold a build output directory".
And the feature should be disabled by default.
Hmm, I don't like this style where every dedup function has the
"use_dedup" check.

Can't we abstract it like this?

I want to find a simpler way so that we don't need to add the check when new
dedup functions pop up.

bool zram_dedup_check
if (!zram->dedup)
return false;

zram_dedup_checksum
entry = zram_dedup_get
if (!entry) {
zram_dedup_add
return false;
}
..
return true;


zram_bvec_write:
...
...

if (zram_dedup_match())
goto found_dup;
For a binary result, I want to use 'bool'.
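Roughly something like this, I think (a sketch only; the name and the exact
split of work are illustrative, reusing the functions from this patch):

static bool zram_dedup_match(struct zram *zram, struct zram_entry **entry,
			unsigned char *mem, u32 *checksum)
{
	struct zram_entry *found;

	/* single place for the check, callers never see use_dedup */
	if (!zram->use_dedup)
		return false;

	*checksum = zram_calc_checksum(zram, mem);
	found = zram_entry_get(zram, mem, *checksum);
	if (!found)
		return false;	/* no duplicate, caller compresses and stores */

	*entry = found;
	return true;		/* caller can go straight to found_dup */
}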

> };
> #endif
> --
> 1.9.1
>

Joonsoo Kim

Mar 22, 2017, 10:10:05 PM
Hello,

I don't think that such flexibility makes things better. The user cannot
fully understand which flags are best, since it highly depends on
implementation details. For example, __GFP_MOVABLE is needed to
reduce memory fragmentation, and the user cannot know that; there is no
reason the user needs to know it. __GFP_HIGHMEM is a similar case.
He cannot know that he may pass __GFP_HIGHMEM without knowing the
implementation details, and he cannot know the impact of __GFP_HIGHMEM
here. So, I think that adding these flags inside zsmalloc can be justified.

Anyway, this patch isn't so important for this series, so if you don't
like it, I will drop it.

Thanks.

Joonsoo Kim

Mar 22, 2017, 11:10:06 PM
On Wed, Mar 22, 2017 at 09:00:59AM +0900, Minchan Kim wrote:
> On Thu, Mar 16, 2017 at 11:46:38AM +0900, js1...@gmail.com wrote:
> > From: Joonsoo Kim <iamjoon...@lge.com>
> >
> > Benefit of deduplication is dependent on the workload so it's not
> > preferable to always enable. Therefore, make it optional.
>
> Please make it to Kconfig, too. And write down the description to impress
> "help a lot for users who uses zram to build output directory"
> And the feature should be disabled as default.

Okay.
I will try but I'm not sure it can be.
Okay.

Thanks.

Joonsoo Kim

Mar 22, 2017, 11:10:06 PM
Okay. I checked the reason for the benefit on the kernel build now. There are
some cases where deduplication happens.

1) *.cmd
The build command is usually similar within one directory, so the contents of
these files are very similar. Please check fs/ext4/.namei.o.cmd and
fs/ext4/.inode.o.cmd. In my system, more than 789 lines are the same
out of the 944 and 939 lines of those files, respectively.

2) object files
built-in.o and the temporary object file have similar content. More
than 50% of fs/ext4/ext4.o is the same as fs/ext4/built-in.o. I
think that we can optimize this case. ext4.o is a temporary object
file and it isn't strictly needed. We could change the following
fs/ext4/Makefile to an optimized one.

orig>
obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := XXX YYY ZZZ

optimized>
obj-$(CONFIG_EXT4_FS) += XXX YYY ZZZ

3) vmlinux
.tmp_vmlinux1, .tmp_vmlinux2 and arch/x86/boot/compressed/vmlinux.bin
have similar content.

I did a similar check for Android and it also has similar cases.
Some object files (.class and .so) are similar to other object
files (e.g. ./host/linux-x86/lib/libartd.so and
./host/linux-x86/lib/libartd-compiler.so).

>
> >
> > Anyway, if we can detect duplicated content and avoid to store duplicated
> > content at different memory space, we can save memory. This patch
> > tries to do that.
> >
> > Implementation is almost simple and intuitive but I should note
> > one thing about implementation detail.
> >
> > To check duplication, this patch uses checksum of the page and
> > collision of this checksum could be possible. There would be
> > many choices to handle this situation but this patch chooses
> > to allow entry with duplicated checksum to be added to the hash,
> > but, not to compare all entries with duplicated checksum
> > when checking duplication. I guess that checksum collision is quite
>
> Hmm, if there are many duplicated checksum, what a specific checksum
> is compared among them?

I implemented it to check just the first one.

> > rare event and we don't need to pay any attention to such a case.
>
> If it's rare event, why can't we iterate all of entries?

It's possible. If you prefer it, I can do it.
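
A minimal sketch of what that could look like (the name is illustrative and it
ignores the locking/refcount handling that zram_entry_get() does): since
rb-tree rotations preserve the in-order sequence, entries with the same
checksum stay adjacent, so we can scan the neighbours of a matching node in
both directions:

static struct zram_entry *zram_scan_dup_checksum(struct zram *zram,
			struct rb_node *node, unsigned char *mem, u32 checksum)
{
	struct rb_node *iter;
	struct zram_entry *entry;

	/* walk towards smaller keys, starting at the matching node itself */
	for (iter = node; iter; iter = rb_prev(iter)) {
		entry = rb_entry(iter, struct zram_entry, rb_node);
		if (entry->checksum != checksum)
			break;
		if (zram_entry_match(zram, entry, mem))
			return entry;
	}

	/* and towards larger keys */
	for (iter = rb_next(node); iter; iter = rb_next(iter)) {
		entry = rb_entry(iter, struct zram_entry, rb_node);
		if (entry->checksum != checksum)
			break;
		if (zram_entry_match(zram, entry, mem))
			return entry;
	}

	return NULL;
}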
Okay. I understand all your comments. I will fix them and won't reply to each one.

Thanks.

Sergey Senozhatsky

Mar 23, 2017, 9:50:08 AM

Hello Joonsoo,

On (03/16/17 11:46), js1...@gmail.com wrote:
[..]
[ 2935.830100] BUG: unable to handle kernel NULL pointer dereference at 0000000000000154
[ 2935.830106] IP: zram_entry_put+0x15/0xbb [zram]
[ 2935.830107] PGD 4063aa067
[ 2935.830108] P4D 4063aa067
[ 2935.830108] PUD 19d426067
[ 2935.830109] PMD 0

[ 2935.830110] Oops: 0000 [#1] PREEMPT SMP
[ 2935.830111] Modules linked in: lzo zram(-) zsmalloc mousedev nls_iso8859_1 nls_cp437 vfat fat psmouse serio_raw atkbd libps2 coretemp hwmon iwlmvm crc32c_intel i2c_i801 r8169 iwlwifi lpc_ich mii mfd_core ie31200_edac edac_core thermal i8042 serio wmi evdev acpi_cpufreq sd_mod
[ 2935.830127] CPU: 3 PID: 1599 Comm: rmmod Tainted: G W 4.11.0-rc3-next-20170323-dbg-00012-gdda44065c67c-dirty #155
[ 2935.830128] task: ffff8804061c14c0 task.stack: ffff8801f7908000
[ 2935.830130] RIP: 0010:zram_entry_put+0x15/0xbb [zram]
[ 2935.830131] RSP: 0018:ffff8801f790bdc0 EFLAGS: 00010246
[ 2935.830132] RAX: 0000000000000000 RBX: ffff88041cdbfce0 RCX: 0000000000000001
[ 2935.830132] RDX: ffff880405f4cdb8 RSI: ffff88041cdbfce0 RDI: 0000000000000000
[ 2935.830133] RBP: ffff8801f790bde8 R08: ffffffffffffff80 R09: 00000000000000ff
[ 2935.830134] R10: ffff8801f790bd90 R11: 0000000000000000 R12: 0000000000000000
[ 2935.830134] R13: ffff88041cdbfce0 R14: ffff88041cdbfd00 R15: ffff88041cdbfce0
[ 2935.830135] FS: 00007fb350b62b40(0000) GS:ffff88042fac0000(0000) knlGS:0000000000000000
[ 2935.830136] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2935.830137] CR2: 0000000000000154 CR3: 00000002d3f53000 CR4: 00000000001406e0
[ 2935.830137] Call Trace:
[ 2935.830139] zram_meta_free+0x50/0x7e [zram]
[ 2935.830141] zram_reset_device+0xd2/0xe5 [zram]
[ 2935.830142] zram_remove+0x9f/0xf1 [zram]
[ 2935.830143] ? zram_remove+0xf1/0xf1 [zram]
[ 2935.830145] zram_remove_cb+0x11/0x15 [zram]
[ 2935.830148] idr_for_each+0x3b/0x87
[ 2935.830149] destroy_devices+0x2a/0x56 [zram]
[ 2935.830150] zram_exit+0x9/0x6f6 [zram]
[ 2935.830153] SyS_delete_module+0xf1/0x181
[ 2935.830156] entry_SYSCALL_64_fastpath+0x18/0xad
[ 2935.830157] RIP: 0033:0x7fb3502659b7
[ 2935.830158] RSP: 002b:00007fff90ca5468 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0
[ 2935.830159] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fb3502659b7
[ 2935.830159] RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000000002187208
[ 2935.830160] RBP: 0000000000000000 R08: 00007fff90ca43e1 R09: 000000000000000a
[ 2935.830161] R10: 0000000000000892 R11: 0000000000000202 R12: 00000000021871a0
[ 2935.830162] R13: 00007fff90ca4450 R14: 00000000021871a0 R15: 0000000002186010
[ 2935.830163] Code: 1e 6f fb ff 4c 89 e7 e8 9e 05 f5 e0 48 89 d8 5b 41 5c 41 5d 5d c3 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 41 55 49 89 f5 41 54 53 <83> bf 54 01 00 00 00 48 89 d3 75 0e 48 8b 7e 08 48 89 d6 e8 0b
[ 2935.830184] RIP: zram_entry_put+0x15/0xbb [zram] RSP: ffff8801f790bdc0
[ 2935.830184] CR2: 0000000000000154
[ 2935.838309] ---[ end trace f7f95fd0ed6c72a0 ]---

-ss

Minchan Kim

Mar 23, 2017, 9:00:05 PM
Hi Joonsoo,

On Thu, Mar 23, 2017 at 11:10:23AM +0900, Joonsoo Kim wrote:
> On Tue, Mar 21, 2017 at 08:10:05PM +0900, Minchan Kim wrote:
> > Hi Joonsoo,
> >
> > On Thu, Mar 16, 2017 at 11:46:35AM +0900, js1...@gmail.com wrote:
> > > From: Joonsoo Kim <iamjoon...@lge.com>
> > >
> > > Zspage is always movable and is used through zs_map_object() function
> > > which returns directly accessible pointer that contains content of
> > > zspage. It is independent on the user's allocation flag.
> > > Therefore, it's better to always set movable/highmem flag to the zspage.
> > > After that, we don't need __GFP_MOVABLE/__GFP_HIGHMEM clearing in
> > > cache_alloc_handle()/cache_alloc_zspage() since there is no zs_malloc
> > > caller who specifies __GFP_MOVABLE/__GFP_HIGHMEM.
> >
> > Hmm, I wanted this when you pointed out to me firstly but when I think
> > again, I don't see it's improvement. Sorry for that.
> > The zs_malloc is exported symbol and it has gfp_t argument so user can
> > do whatever he want with any zone modifiers flags. IOW, if someuser want
> > to allocate pages from {normal|dma} zone by whatever reason, he can
> > omit __GFP_HIGHMEM from the gfp flag to fullfill the goal.
>
> Hello,
>
> I don't think that such flexibility makes things better. User cannot
> fully understand what flags are the best since it highly depends on
> implementation detail. For example, __GFP_MOVABLE is needed to

zone modifiers (GFP_DMA|DMA32|HIGHMEM|MOVABLE|potentially CMA):
the user can select one of these zones for his goal according to S/W or H/W constraints.

> optimize memory fragmentation and user cannot know it and there is no
> reason that user need to know it. __GFP_HIGHMEM is the similar case.
> He cannot know that he can pass __GFP_HIGHMEM without knowing the
> implementation detail and he cannot know the impact of __GFP_HIGHMEM
> here. So, I think that adding these flags in zsmalloc can be justified.
>
> Anyway, this patch isn't so important for this series so if you don't
> like it, I will drop it.

Yes, I don't feel strongly we need it at this moment.

Thanks.

Minchan Kim

Mar 23, 2017, 9:00:06 PM
Great.

Please include your analysis in the patch description.
With that, we can recommend turning on the dedup feature to users who want to use
zram to hold a build output directory. Please mention it in the Kconfig help text.
One thing I forgot during the review: "add documentation about use_dedup to
zram.txt".
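For reference, a possible wording (hypothetical, to be adjusted to the final
interface) could be an entry like:

	use_dedup	RW	enable (1) or disable (0) deduplication of
				stored pages; can only be changed before the
				device is initialized via disksize.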

>
> >
> > >
> > > Anyway, if we can detect duplicated content and avoid to store duplicated
> > > content at different memory space, we can save memory. This patch
> > > tries to do that.
> > >
> > > Implementation is almost simple and intuitive but I should note
> > > one thing about implementation detail.
> > >
> > > To check duplication, this patch uses checksum of the page and
> > > collision of this checksum could be possible. There would be
> > > many choices to handle this situation but this patch chooses
> > > to allow entry with duplicated checksum to be added to the hash,
> > > but, not to compare all entries with duplicated checksum
> > > when checking duplication. I guess that checksum collision is quite
> >
> > Hmm, if there are many duplicated checksum, what a specific checksum
> > is compared among them?
>
> I implemented it to check just first one.
>
> > > rare event and we don't need to pay any attention to such a case.
> >
> > If it's rare event, why can't we iterate all of entries?
>
> It's possible. If you prefer it, I can do it.

Yes, please do. I want to give the expected behavior to the user unless it
adds big overhead or makes the code messy.
Thanks for the nice work!

Sergey Senozhatsky

Mar 27, 2017, 4:20:05 AM
On (03/23/17 12:05), Joonsoo Kim wrote:
> On Wed, Mar 22, 2017 at 09:00:59AM +0900, Minchan Kim wrote:
> > On Thu, Mar 16, 2017 at 11:46:38AM +0900, js1...@gmail.com wrote:
> > > From: Joonsoo Kim <iamjoon...@lge.com>
> > >
> > > Benefit of deduplication is dependent on the workload so it's not
> > > preferable to always enable. Therefore, make it optional.
> >
> > Please make it to Kconfig, too. And write down the description to impress
> > "help a lot for users who uses zram to build output directory"
> > And the feature should be disabled as default.
>
> Okay.

so I was thinking for a moment -- do we want to keep this
functionality in zram, or maybe it belongs in the allocator (zsmalloc)?
what do you think? just a question.

-ss

Joonsoo Kim

Mar 27, 2017, 8:30:06 PM
Hello,

Thanks for reporting.
I have noticed this bug and fixed it in version 2.

Thanks.

Joonsoo Kim

Mar 27, 2017, 9:10:05 PM
I think that zram is the more appropriate layer to implement this feature.
I may be wrong, so please let me know if I'm missing something.

First, I'd like to leave the allocator as just an allocator. If it is aware of
the contents, further improvement would be restricted. For example, we
would have to use map/unmap semantics to store contents, since, without them,
we can't know when the content changes and when the deduplication check
should be done. I know that zsmalloc is already implemented that
way, but I guess that a similar issue could arise in the future.

Second, we would always need to compress the page to check for duplication
if it were implemented in zsmalloc, since we store compressed pages in
zsmalloc. I guess that less compression would be better performance-wise.

Third, in the case of zsmalloc dedup, we would always need to allocate zs
memory before checking for duplication and free it if a duplicate is
found. That is also undesirable.

If you are okay with the above arguments, I will send v2 soon.

Sergey Senozhatsky

Mar 27, 2017, 10:40:05 PM
Cc Seth and Dan, just in case


Hello Joonsoo,

On (03/28/17 10:02), Joonsoo Kim wrote:
[..]
> > so I was thinking for a moment -- do we want to keep this
> > functionality in zram or may be it belongs to allocator (zsmalloc)?
> > what do you think? just a question.
>
> I think that zram is more appropriate layer to implement this feature.
> I may be wrong so please let me know if I'm missing something.
>
> First, I'd like to leave allocator to just allocator. If it awares the
> contents, further improvement would be restricted. For example, we
> should use map/unmap semantic to store contents, since, without them,
> we can't know when the content is changed and when deduplication check
> should be done. I know that zsmalloc is already implemented by that
> way but I guess that similar issue could happen in the future.
>
> Second, we always need to compress the page to check duplication
> if it is implemented in zsmalloc since we store compressed page to
> zsmalloc. I guess that less compression would be better in performance
> wise.
>
> Third, in case of zsmalloc dedup, we always need to allocate zs memory
> before checking duplication and need to free it if duplication is
> found. It's also undesirable.
>
> If you are okay with above arguments, I will send v2 soon.

thanks.
I'm OK with your arguments.


to explain my point a bit further (zsmalloc was a bad call,
I guess I meant zpool):

the reason I asked was that both zram and zswap are sort of trying to
have the same optimizations - zero-filled page handling, for example.
zram is a bit ahead now (to the best of my knowledge), because of
the recent 'same element' filled pages. zswap, probably, will have
something like this as well some day. or maybe it won't, up to Seth
and Dan. de-duplication definitely can improve both zram and zswap,
which, once again, suggests that at some point zswap will have its
own implementation. well, or it won't.

so I thought that maybe we could have zero-filled page handling/same-element
page handling/de-duplication somewhere in a "middle" layer,
like zpool for instance (zram does not support zpool as of now), so we
could unify things.

just an idea. no pressure.

-ss

Minchan Kim

Mar 27, 2017, 11:00:04 PM
Hi Sergey,
As I pointed out, at least, dedup was of no benefit for the swap case.
I don't want to disrupt zsmalloc without any *proven* benefit.
Even though it *might* have a benefit, it shouldn't be in the allocator
layer unless it's a really huge benefit like performance.
It makes it hard to change zram's allocator in the future.
And please consider that zswap was born for latency in server workloads
while zram is for memory efficiency in the embedded world.
The dedup feature is a trade-off between them, and zram is perfectly matched to it.

Sergey Senozhatsky

Mar 28, 2017, 1:20:05 AM
Hello Minchan,

On (03/28/17 11:50), Minchan Kim wrote:
[..]
> > the reason I asked was that both zram and zswap sort of trying to
> > have same optimizations - zero filled pages handling, for example.
> > zram is a bit ahead now (to the best of my knowledge), because of
> > the recent 'same element' filled pages. zswap, probably, will have
> > something like this as well some day. or may be it won't, up to Seth
> > and Dan. de-duplication definitely can improve both zram and zswap,
> > which, once again, suggests that at some point zswap will have its
> > own implementation. well, or it won't.
>
> As I pointed out, at least, dedup was no benefit for the swap case.
> I don't want to disrupt zsmalloc without any *proved* benefit.
> Even though it *might* have benefit, it shouldn't be in allocator
> layer unless it's really huge benefit like performance.

sure.

zpool, I meant zpool. I mistakenly used the word 'allocator'.

I meant some intermediate layer between zram and actual memory allocator,
a common layer which both zram and zswap can use and which can have
common functionality. just an idea. haven't really thought about it yet.

> It makes hard zram's allocator change in future.
> And please consider zswap is born for the latency in server workload
> while zram is memory efficiency in embedded world.

maybe. I do suspect zswap is used in embedded as well [1]. there is even
a brand new allocator that 'reportedly' uses less memory than zsmalloc
and outperforms zsmalloc in embedded setups [1] (once again, reportedly;
I haven't tried it).

if z3fold is actually this good (I'm not saying it is not, I haven't
tested it), then it makes sense to switch to the zpool API in zram and let
zram users select the allocator that fits their setups better.

just saying.


[1] http://events.linuxfoundation.org/sites/events/files/slides/zram1.pdf

-ss

Minchan Kim

Mar 28, 2017, 2:10:07 AM
I do not want to support multiple allocators in zram.
It's a real maintenance headache, as well as making zram's goal drift.
If a new allocator *saves* much memory compared to zsmalloc, it might be a
good candidate for replacing zsmalloc.

If so, feel free to send patches with a test workload without *any noise*.
Please do not say "it's good" based on just a simple test. What we need is
"why it's good", so that we can investigate what the current problem is, and
if it is caused by zsmalloc's design and is therefore hard to change, then
we might consider a new allocator seriously.

Anyway, this is off-topic for Joonsoo's patches.

js1...@gmail.com

Mar 30, 2017, 1:40:04 AM
From: Joonsoo Kim <iamjoon...@lge.com>

Changes from v1
o reorganize dedup-specific functions
o support Kconfig on/off
o update the zram documentation
o compare all the entries with the same checksum (patch #4)

This patchset implements a deduplication feature in zram. The motivation
is to reduce the memory used by zram. There is a detailed description
of the motivation and experimental results in patch #2, so please
refer to it.

Thanks.

Joonsoo Kim (4):
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional
zram: compare all the entries with same checksum for deduplication

Documentation/ABI/testing/sysfs-block-zram | 10 +
Documentation/blockdev/zram.txt | 3 +
drivers/block/zram/Kconfig | 14 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 288 +++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 56 ++++++
drivers/block/zram/zram_drv.c | 170 +++++++++++++----
drivers/block/zram/zram_drv.h | 25 ++-
8 files changed, 535 insertions(+), 34 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

--
2.7.4

js1...@gmail.com

Mar 30, 2017, 1:40:05 AM
From: Joonsoo Kim <iamjoon...@lge.com>

A following patch will implement deduplication functionality
in zram, and it requires an indirection layer to manage
the life cycle of the zsmalloc handle. To prepare for that, this patch
introduces zram_entry, which can be used to manage the life cycle
of the zsmalloc handle. Many lines are changed due to the rename, but
the core change is just the simple introduction of a new data structure.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 82 +++++++++++++++++++++++++++++--------------
drivers/block/zram/zram_drv.h | 6 +++-
2 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0194441..f3949da 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -419,6 +419,32 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

+static struct zram_entry *zram_entry_alloc(struct zram *zram,
+ unsigned int len, gfp_t flags)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry), flags);
+ if (!entry)
+ return NULL;
+
+ entry->handle = zs_malloc(meta->mem_pool, len, flags);
+ if (!entry->handle) {
+ kfree(entry);
+ return NULL;
+ }
+
+ return entry;
+}
+
+static inline void zram_entry_free(struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ zs_free(meta->mem_pool, entry->handle);
+ kfree(entry);
+}
+
static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
@@ -426,15 +452,15 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)

/* Free all pages that are still in this zram device */
for (index = 0; index < num_pages; index++) {
- unsigned long handle = meta->table[index].handle;
+ struct zram_entry *entry = meta->table[index].entry;
/*
* No memory is allocated for same element filled pages.
* Simply clear same page flag.
*/
- if (!handle || zram_test_flag(meta, index, ZRAM_SAME))
+ if (!entry || zram_test_flag(meta, index, ZRAM_SAME))
continue;

- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);
}

zs_destroy_pool(meta->mem_pool);
@@ -479,7 +505,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
static void zram_free_page(struct zram *zram, size_t index)
{
struct zram_meta *meta = zram->meta;
- unsigned long handle = meta->table[index].handle;
+ struct zram_entry *entry = meta->table[index].entry;

/*
* No memory is allocated for same element filled pages.
@@ -492,16 +518,16 @@ static void zram_free_page(struct zram *zram, size_t index)
return;
}

- if (!handle)
+ if (!entry)
return;

- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);

atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);

- meta->table[index].handle = 0;
+ meta->table[index].entry = NULL;
zram_set_obj_size(meta, index, 0);
}

@@ -510,20 +536,20 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
int ret = 0;
unsigned char *cmem;
struct zram_meta *meta = zram->meta;
- unsigned long handle;
+ struct zram_entry *entry;
unsigned int size;

bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- handle = meta->table[index].handle;
+ entry = meta->table[index].entry;
size = zram_get_obj_size(meta, index);

- if (!handle || zram_test_flag(meta, index, ZRAM_SAME)) {
+ if (!entry || zram_test_flag(meta, index, ZRAM_SAME)) {
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
zram_fill_page(mem, PAGE_SIZE, meta->table[index].element);
return 0;
}

- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
copy_page(mem, cmem);
} else {
@@ -532,7 +558,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
ret = zcomp_decompress(zstrm, cmem, size, mem);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, handle);
+ zs_unmap_object(meta->mem_pool, entry->handle);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

/* Should NEVER happen. Return bio error if it does. */
@@ -554,7 +580,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
page = bvec->bv_page;

bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
- if (unlikely(!meta->table[index].handle) ||
+ if (unlikely(!meta->table[index].entry) ||
zram_test_flag(meta, index, ZRAM_SAME)) {
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
handle_same_page(bvec, meta->table[index].element);
@@ -599,7 +625,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret = 0;
unsigned int clen;
- unsigned long handle = 0;
+ struct zram_entry *entry = NULL;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
@@ -670,34 +696,36 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
}

/*
- * handle allocation has 2 paths:
+ * entry allocation has 2 paths:
* a) fast path is executed with preemption disabled (for
* per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
* since we can't sleep;
* b) slow path enables preemption and attempts to allocate
* the page with __GFP_DIRECT_RECLAIM bit set. we have to
* put per-cpu compression stream and, thus, to re-do
- * the compression once handle is allocated.
+ * the compression once entry is allocated.
*
- * if we have a 'non-null' handle here then we are coming
- * from the slow path and handle has already been allocated.
+ * if we have a 'non-null' entry here then we are coming
+ * from the slow path and entry has already been allocated.
*/
- if (!handle)
- handle = zs_malloc(meta->mem_pool, clen,
+ if (!entry) {
+ entry = zram_entry_alloc(zram, clen,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
- if (!handle) {
+ }
+
+ if (!entry) {
zcomp_stream_put(zram->comp);
zstrm = NULL;

atomic64_inc(&zram->stats.writestall);

- handle = zs_malloc(meta->mem_pool, clen,
+ entry = zram_entry_alloc(zram, clen,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
- if (handle)
+ if (entry)
goto compress_again;

pr_err("Error allocating memory for compressed page: %u, size=%u\n",
@@ -710,12 +738,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
update_used_max(zram, alloced_pages);

if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zs_free(meta->mem_pool, handle);
+ zram_entry_free(meta, entry);
ret = -ENOMEM;
goto out;
}

- cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_WO);

if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
src = kmap_atomic(page);
@@ -727,7 +755,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

zcomp_stream_put(zram->comp);
zstrm = NULL;
- zs_unmap_object(meta->mem_pool, handle);
+ zs_unmap_object(meta->mem_pool, entry->handle);

/*
* Free memory associated with this sector
@@ -736,7 +764,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
zram_free_page(zram, index);

- meta->table[index].handle = handle;
+ meta->table[index].entry = entry;
zram_set_obj_size(meta, index, clen);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index caeff51..a7ae46c 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,10 +69,14 @@ enum zram_pageflags {

/*-- Data structures */

+struct zram_entry {
+ unsigned long handle;
+};
+
/* Allocated for each disk page */
struct zram_table_entry {
union {
- unsigned long handle;
+ struct zram_entry *entry;
unsigned long element;
};
unsigned long value;
--
2.7.4

js1...@gmail.com

unread,
Mar 30, 2017, 1:50:05 AM3/30/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

This patch implements a deduplication feature in zram. The purpose
of this work is, naturally, to reduce the amount of memory used by zram.

Android is one of the biggest users of zram as swap, and reducing
memory usage there is really important. There is a paper reporting
that the duplication ratio of Android's memory content is rather
high [1]. And there is similar work on zswap which reports that
experiments have shown that around 10-15% of the pages stored in
zswap are duplicates, and that deduplicating them provides some
benefits [2].

Also, there is a different kind of workload that uses zram as a blockdev
and stores build outputs on it to reduce the wear-out problem of a real
blockdev. In this workload, the deduplication hit rate is very high due to
temporary files and intermediate object files. A detailed analysis is
at the bottom of this description.

Anyway, if we can detect duplicated content and avoid storing the same
content in different memory locations, we can save memory. This patch
tries to do that.

The implementation is fairly simple and intuitive, but I should note
one implementation detail.

To check for duplication, this patch uses a checksum of the page, and
collisions of this checksum are possible. There would be many ways to
handle this situation, but this patch chooses to allow entries with a
duplicated checksum to be added to the hash, while not comparing all
entries with the same checksum when checking for duplication. I guess
that a checksum collision is a quite rare event and we don't need to
pay much attention to such a case.
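
In sketch form (names as in the code added by this patch; locking and
refcounting omitted):

	/* checksum of the uncompressed page selects a hash bucket */
	u32 checksum = jhash(mem, PAGE_SIZE, 0);
	struct zram_hash *hash = &meta->hash[checksum % meta->hash_size];

	/* the bucket's rb-tree is keyed by checksum and may hold several
	 * entries with the same key; lookup content-compares only the first
	 * matching entry (memcmp, decompressing first if it is not a full
	 * page).  On a real collision the page is simply stored as new data
	 * rather than scanning the remaining same-checksum entries. */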

I checked in detail where the savings come from on the kernel build
workload, and there are several cases where deduplication happens.

1) *.cmd
Build commands are usually similar within one directory, so the contents
of these files are very similar. On my system, more than 789 lines of
fs/ext4/.namei.o.cmd and fs/ext4/.inode.o.cmd are identical, out of 944
and 938 lines in each file, respectively.

2) intermediate object files
built-in.o and the temporary object files have similar contents. More than
50% of fs/ext4/ext4.o is the same as fs/ext4/built-in.o.

3) vmlinux
.tmp_vmlinux1, .tmp_vmlinux2 and arch/x86/boot/compressed/vmlinux.bin
have similar contents.

The Android test shows a similar case where some object files (.class
and .so) are similar to one another
(./host/linux-x86/lib/libartd.so and
./host/linux-x86-lib/libartd-compiler.so).

Anyway, the benefit seems to be largely dependent on the workload, so the
following patch will make this feature optional. However, this feature
can help some use cases, so it deserves to be merged.

[1]: MemScope: Analyzing Memory Duplication on Android Systems,
dl.acm.org/citation.cfm?id=2797023
[2]: zswap: Optimize compressed pool memory utilization,
lkml.kernel.org/r/1341407574.7551.1471584870761.JavaMail.weblogic@epwas3p2

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/blockdev/zram.txt | 2 +
drivers/block/zram/Makefile | 2 +-
drivers/block/zram/zram_dedup.c | 222 ++++++++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 25 +++++
drivers/block/zram/zram_drv.c | 62 ++++++++---
drivers/block/zram/zram_drv.h | 18 ++++
6 files changed, 314 insertions(+), 17 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ dup_data_size deduplicated data size
+ meta_data_size the amount of metadata allocated for deduplication feature

9) Deactivate:
swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y := zcomp.o zram_drv.o
+zram-y := zcomp.o zram_drv.o zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
new file mode 100644
index 0000000..d313fc8
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2017 Joonsoo Kim.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+
+#include "zram_drv.h"
+
+/* One slot will contain 128 pages theoretically */
+#define ZRAM_HASH_SHIFT 7
+#define ZRAM_HASH_SIZE_MIN (1 << 10)
+#define ZRAM_HASH_SIZE_MAX (1 << 31)
+
+u64 zram_dedup_dup_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.dup_data_size);
+}
+
+u64 zram_dedup_meta_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.meta_data_size);
+}
+
+static u32 zram_dedup_checksum(unsigned char *mem)
+{
+ return jhash(mem, PAGE_SIZE, 0);
+}
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+ rb_root = &hash->rb_root;
+
+ spin_lock(&hash->lock);
+ rb_node = &rb_root->rb_node;
+ while (*rb_node) {
+ parent = *rb_node;
+ entry = rb_entry(parent, struct zram_entry, rb_node);
+ if (checksum < entry->checksum)
+ rb_node = &parent->rb_left;
+ else if (checksum > entry->checksum)
+ rb_node = &parent->rb_right;
+ else
+ rb_node = &parent->rb_left;
+ }
+
+ rb_link_node(&new->rb_node, parent, rb_node);
+ rb_insert_color(&new->rb_node, rb_root);
+ spin_unlock(&hash->lock);
+}
+
+static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
+ unsigned char *mem)
+{
+ bool match = false;
+ unsigned char *cmem;
+ struct zram_meta *meta = zram->meta;
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(meta->mem_pool, entry->handle);
+
+ return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+ unsigned long refcount;
+
+ checksum = entry->checksum;
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ entry->refcount--;
+ refcount = entry->refcount;
+ if (!entry->refcount) {
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ RB_CLEAR_NODE(&entry->rb_node);
+ } else if (zram) {
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);
+ }
+ spin_unlock(&hash->lock);
+
+ return refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_meta *meta = zram->meta;
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &meta->hash[checksum % meta->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ zram_entry_free(zram, meta, entry);
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
+ u32 *checksum)
+{
+ *checksum = zram_dedup_checksum(mem);
+
+ return zram_dedup_get(zram, mem, *checksum);
+}
+
+struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len,
+ gfp_t flags)
+{
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry)
+ return NULL;
+
+ entry->handle = handle;
+ RB_CLEAR_NODE(&entry->rb_node);
+ entry->refcount = 1;
+ entry->len = len;
+
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
+ return entry;
+}
+
+unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
+{
+ unsigned long handle;
+
+ if (zram_dedup_put(zram, meta, entry))
+ return 0;
+
+ handle = entry->handle;
+ kfree(entry);
+
+ /* !zram happens when reset/fail and updating stat is useless */
+ if (zram)
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
+
+ return handle;
+}
+
+int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
+{
+ int i;
+ struct zram_hash *hash;
+
+ meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+ meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
+ meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
+ meta->hash = vzalloc(meta->hash_size * sizeof(struct zram_hash));
+ if (!meta->hash) {
+ pr_err("Error allocating zram entry hash\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < meta->hash_size; i++) {
+ hash = &meta->hash[i];
+ spin_lock_init(&hash->lock);
+ hash->rb_root = RB_ROOT;
+ }
+
+ return 0;
+}
+
+void zram_dedup_fini(struct zram_meta *meta)
+{
+ vfree(meta->hash);
+}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
new file mode 100644
index 0000000..7071f32
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.h
@@ -0,0 +1,25 @@
+#ifndef _ZRAM_DEDUP_H_
+#define _ZRAM_DEDUP_H_
+
+struct zram;
+struct zram_meta;
+struct zram_entry;
+
+u64 zram_dedup_dup_size(struct zram *zram);
+u64 zram_dedup_meta_size(struct zram *zram);
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum);
+struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
+ u32 *checksum);
+
+struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len,
+ gfp_t flags);
+unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry);
+
+int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
+void zram_dedup_fini(struct zram_meta *meta);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f3949da..15cecd6 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -385,14 +385,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);

ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ zram_dedup_dup_size(zram),
+ zram_dedup_meta_size(zram));
up_read(&zram->init_lock);

return ret;
@@ -424,25 +426,29 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
{
struct zram_meta *meta = zram->meta;
struct zram_entry *entry;
+ unsigned long handle;

- entry = kzalloc(sizeof(*entry), flags);
- if (!entry)
+ handle = zs_malloc(meta->mem_pool, len, flags);
+ if (!handle)
return NULL;

- entry->handle = zs_malloc(meta->mem_pool, len, flags);
- if (!entry->handle) {
- kfree(entry);
+ entry = zram_dedup_alloc(zram, handle, len, flags);
+ if (!entry) {
+ zs_free(meta->mem_pool, handle);
return NULL;
}

return entry;
}

-static inline void zram_entry_free(struct zram_meta *meta,
- struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry)
{
- zs_free(meta->mem_pool, entry->handle);
- kfree(entry);
+ unsigned long handle;
+
+ handle = zram_dedup_free(zram, meta, entry);
+ if (handle)
+ zs_free(meta->mem_pool, handle);
}

static void zram_meta_free(struct zram_meta *meta, u64 disksize)
@@ -460,10 +466,11 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
if (!entry || zram_test_flag(meta, index, ZRAM_SAME))
continue;

- zram_entry_free(meta, entry);
+ zram_entry_free(NULL, meta, entry);
}

zs_destroy_pool(meta->mem_pool);
+ zram_dedup_fini(meta);
vfree(meta->table);
kfree(meta);
}
@@ -471,7 +478,7 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
{
size_t num_pages;
- struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+ struct zram_meta *meta = kzalloc(sizeof(*meta), GFP_KERNEL);

if (!meta)
return NULL;
@@ -483,6 +490,11 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}

+ if (zram_dedup_init(meta, num_pages)) {
+ pr_err("Error initializing zram entry hash\n");
+ goto out_error;
+ }
+
meta->mem_pool = zs_create_pool(pool_name);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
@@ -492,6 +504,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
return meta;

out_error:
+ zram_dedup_fini(meta);
vfree(meta->table);
kfree(meta);
return NULL;
@@ -521,7 +534,7 @@ static void zram_free_page(struct zram *zram, size_t index)
if (!entry)
return;

- zram_entry_free(meta, entry);
+ zram_entry_free(zram, meta, entry);

atomic64_sub(zram_get_obj_size(meta, index),
&zram->stats.compr_data_size);
@@ -625,13 +638,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
{
int ret = 0;
unsigned int clen;
- struct zram_entry *entry = NULL;
+ struct zram_entry *entry = NULL, *found_entry;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
struct zram_meta *meta = zram->meta;
struct zcomp_strm *zstrm = NULL;
unsigned long alloced_pages;
unsigned long element;
+ u32 checksum;

page = bvec->bv_page;
if (is_partial_io(bvec)) {
@@ -675,6 +689,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

+ found_entry = zram_dedup_find(zram, uncmem, &checksum);
+ if (found_entry) {
+ if (!is_partial_io(bvec))
+ kunmap_atomic(user_mem);
+
+ if (entry)
+ zram_entry_free(zram, meta, entry);
+
+ entry = found_entry;
+ clen = entry->len;
+
+ goto found_dup;
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zcomp_compress(zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
@@ -738,7 +766,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
update_used_max(zram, alloced_pages);

if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zram_entry_free(meta, entry);
+ zram_entry_free(zram, meta, entry);
ret = -ENOMEM;
goto out;
}
@@ -756,7 +784,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
zcomp_stream_put(zram->comp);
zstrm = NULL;
zs_unmap_object(meta->mem_pool, entry->handle);
+ zram_dedup_insert(zram, entry, checksum);

+found_dup:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index a7ae46c..968e269 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>

#include "zcomp.h"
+#include "zram_dedup.h"

/*-- Configurable parameters */

@@ -70,6 +72,10 @@ enum zram_pageflags {
/*-- Data structures */

struct zram_entry {
+ struct rb_node rb_node;
+ u32 len;
+ u32 checksum;
+ unsigned long refcount;
unsigned long handle;
};

@@ -94,11 +100,20 @@ struct zram_stats {
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
+ atomic64_t dup_data_size; /* compressed size of pages duplicated */
+ atomic64_t meta_data_size; /* size of zram_entries */
+};
+
+struct zram_hash {
+ spinlock_t lock;
+ struct rb_root rb_root;
};

struct zram_meta {
struct zram_table_entry *table;
struct zs_pool *mem_pool;
+ struct zram_hash *hash;
+ size_t hash_size;
};

struct zram {
@@ -124,4 +139,7 @@ struct zram {
*/
bool claim; /* Protected by bdev->bd_mutex */
};
+
+void zram_entry_free(struct zram *zram, struct zram_meta *meta,
+ struct zram_entry *entry);
#endif
--
2.7.4

js1...@gmail.com

unread,
Mar 30, 2017, 1:50:05 AM3/30/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Until now, we compare just one entry with the same checksum when
checking for duplication, since that is the simplest way to implement it.
However, for completeness, checking all the entries is better, so this
patch compares all the entries with the same checksum. Since this event
should be rare, there should be no performance loss.
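
In sketch form, the lookup now rewinds to the left-most entry with the
matching checksum and walks right until the checksum changes (simplified;
the locking and refcount handling of the real code are omitted):

	struct rb_node *node = &entry->rb_node;

	/* rewind to the left-most node that still has this checksum */
	while (rb_prev(node) &&
	       rb_entry(rb_prev(node), struct zram_entry, rb_node)->checksum == checksum)
		node = rb_prev(node);

	/* content-compare every candidate with the same checksum */
	for (; node; node = rb_next(node)) {
		entry = rb_entry(node, struct zram_entry, rb_node);
		if (entry->checksum != checksum)
			break;
		if (zram_dedup_match(zram, entry, mem))
			return entry;		/* genuine duplicate */
	}
	return NULL;				/* only collisions; store the page as new */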

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_dedup.c | 59 +++++++++++++++++++++++++++++++++--------
1 file changed, 48 insertions(+), 11 deletions(-)

diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 1df1ce1..c4dfd21 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -125,6 +125,51 @@ static unsigned long zram_dedup_put(struct zram *zram, struct zram_meta *meta,
return refcount;
}

+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+ struct zram_hash *hash, unsigned char *mem,
+ struct zram_entry *entry)
+{
+ struct zram_entry *tmp, *prev = NULL;
+ struct rb_node *rb_node;
+
+ /* find left-most entry with same checksum */
+ while ((rb_node = rb_prev(&entry->rb_node))) {
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (tmp->checksum != entry->checksum)
+ break;
+
+ entry = tmp;
+ }
+
+again:
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (prev)
+ zram_entry_free(zram, zram->meta, prev);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ spin_lock(&hash->lock);
+ tmp = NULL;
+ rb_node = rb_next(&entry->rb_node);
+ if (rb_node)
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+
+ if (tmp && (tmp->checksum == entry->checksum)) {
+ prev = entry;
+ entry = tmp;
+ goto again;
+ }
+
+ spin_unlock(&hash->lock);
+ zram_entry_free(zram, zram->meta, entry);
+
+ return NULL;
+}
+
static struct zram_entry *zram_dedup_get(struct zram *zram,
unsigned char *mem, u32 checksum)
{
@@ -139,18 +184,10 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
rb_node = hash->rb_root.rb_node;
while (rb_node) {
entry = rb_entry(rb_node, struct zram_entry, rb_node);
- if (checksum == entry->checksum) {
- entry->refcount++;
- atomic64_add(entry->len, &zram->stats.dup_data_size);
- spin_unlock(&hash->lock);
-
- if (zram_dedup_match(zram, entry, mem))
- return entry;
-
- zram_entry_free(zram, meta, entry);

- return NULL;
- }
+ /* lock will be released in the following function */
+ if (checksum == entry->checksum)
+ return __zram_dedup_get(zram, hash, mem, entry);

if (checksum < entry->checksum)
rb_node = rb_node->rb_left;
--
2.7.4

js1...@gmail.com

unread,
Mar 30, 2017, 1:50:05 AM3/30/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The benefit of deduplication depends on the workload, so it's not
preferable to always enable it. Therefore, make it optional via Kconfig
and a device parameter. The default is 'off'. This option will be
beneficial for users who use zram as a blockdev and store build output
on it.
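
For example, on /dev/zram0 the knob has to be flipped before the device
is initialized (a sketch of the expected usage; once disksize is set,
writes to the attribute return -EBUSY):

	echo 1 > /sys/block/zram0/use_dedup
	echo 1G > /sys/block/zram0/disksize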

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/ABI/testing/sysfs-block-zram | 10 +++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 +++++++
drivers/block/zram/Makefile | 5 ++-
drivers/block/zram/zram_dedup.c | 31 ++++++++++++++-
drivers/block/zram/zram_dedup.h | 33 +++++++++++++++-
drivers/block/zram/zram_drv.c | 62 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 1 +
8 files changed, 146 insertions(+), 11 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d8..3c1945f 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,13 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
+
+What: /sys/block/zram<id>/use_dedup
+Date: March 2017
+Contact: Joonsoo Kim <iamjoon...@lge.com>
+Description:
+ The use_dedup file is read-write and specifies whether the
+ deduplication feature is used or not. If enabled, duplicated data
+ is managed by reference count and will not be stored in memory
+ twice. The benefit of this feature largely depends on the
+ workload, so pay attention when using it.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 2cdc303..cbbe39b 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
+use_dedup RW show and set deduplication feature


User space is advised to use the following files to read the device statistics.
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6..2f3dd1f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,17 @@ config ZRAM
disks and maybe many more.

See zram.txt for more information.
+
+config ZRAM_DEDUP
+ bool "Deduplication support for ZRAM data"
+ depends on ZRAM
+ default n
+ help
+ Deduplicate ZRAM data to reduce the amount of memory consumed.
+ The advantage largely depends on the workload. In some cases, this
+ option cuts memory usage in half. However, if there is no
+ duplicated data, memory consumption would actually increase due
+ to the additional metadata. There is also a computation time
+ trade-off. Please check the benefit before enabling this option.
+ Experiments show a positive effect when zram is used as a
+ blockdev and used to store build output.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 29cb008..1f6fecd 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y := zcomp.o zram_drv.o zram_dedup.o
+zram-y := zcomp.o zram_drv.o

-obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index d313fc8..1df1ce1 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -27,6 +27,19 @@ u64 zram_dedup_meta_size(struct zram *zram)
return (u64)atomic64_read(&zram->stats.meta_data_size);
}

+static inline bool zram_dedup_enabled(struct zram_meta *meta)
+{
+ return meta->hash;
+}
+
+unsigned long zram_dedup_handle(struct zram *zram, struct zram_entry *entry)
+{
+ if (!zram_dedup_enabled(zram->meta))
+ return (unsigned long)entry;
+
+ return entry->handle;
+}
+
static u32 zram_dedup_checksum(unsigned char *mem)
{
return jhash(mem, PAGE_SIZE, 0);
@@ -41,6 +54,9 @@ void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
struct rb_node **rb_node, *parent = NULL;
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram->meta))
+ return;
+
new->checksum = checksum;
hash = &meta->hash[checksum % meta->hash_size];
rb_root = &hash->rb_root;
@@ -149,6 +165,9 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
u32 *checksum)
{
+ if (!zram_dedup_enabled(zram->meta))
+ return NULL;
+
*checksum = zram_dedup_checksum(mem);

return zram_dedup_get(zram, mem, *checksum);
@@ -160,6 +179,9 @@ struct zram_entry *zram_dedup_alloc(struct zram *zram,
{
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram->meta))
+ return (struct zram_entry *)handle;
+
entry = kzalloc(sizeof(*entry),
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
if (!entry)
@@ -180,6 +202,9 @@ unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
{
unsigned long handle;

+ if (!zram_dedup_enabled(meta))
+ return (unsigned long)entry;
+
if (zram_dedup_put(zram, meta, entry))
return 0;

@@ -193,11 +218,14 @@ unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
return handle;
}

-int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
+int zram_dedup_init(struct zram *zram, struct zram_meta *meta, size_t num_pages)
{
int i;
struct zram_hash *hash;

+ if (!zram->use_dedup)
+ return 0;
+
meta->hash_size = num_pages >> ZRAM_HASH_SHIFT;
meta->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, meta->hash_size);
meta->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, meta->hash_size);
@@ -219,4 +247,5 @@ int zram_dedup_init(struct zram_meta *meta, size_t num_pages)
void zram_dedup_fini(struct zram_meta *meta)
{
vfree(meta->hash);
+ meta->hash = NULL;
}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
index 7071f32..470009a 100644
--- a/drivers/block/zram/zram_dedup.h
+++ b/drivers/block/zram/zram_dedup.h
@@ -5,9 +5,11 @@ struct zram;
struct zram_meta;
struct zram_entry;

+#ifdef CONFIG_ZRAM_DEDUP
u64 zram_dedup_dup_size(struct zram *zram);
u64 zram_dedup_meta_size(struct zram *zram);

+unsigned long zram_dedup_handle(struct zram *zram, struct zram_entry *entry);
void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
u32 checksum);
struct zram_entry *zram_dedup_find(struct zram *zram, unsigned char *mem,
@@ -19,7 +21,36 @@ struct zram_entry *zram_dedup_alloc(struct zram *zram,
unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
struct zram_entry *entry);

-int zram_dedup_init(struct zram_meta *meta, size_t num_pages);
+int zram_dedup_init(struct zram *zram, struct zram_meta *meta,
+ size_t num_pages);
void zram_dedup_fini(struct zram_meta *meta);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline unsigned long zram_dedup_handle(struct zram *zram,
+ struct zram_entry *entry) { return (unsigned long)entry; }
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+ unsigned char *mem, u32 *checksum) { return NULL; }
+
+static inline struct zram_entry *zram_dedup_alloc(struct zram *zram,
+ unsigned long handle, unsigned int len, gfp_t flags)
+{
+ return (struct zram_entry *)handle;
+}
+static inline unsigned long zram_dedup_free(struct zram *zram,
+ struct zram_meta *meta, struct zram_entry *entry)
+{
+ return (unsigned long)entry;
+}
+
+static inline int zram_dedup_init(struct zram *zram, struct zram_meta *meta,
+ size_t num_pages) { return 0; }
+static inline void zram_dedup_fini(struct zram_meta *meta) { }
+
+#endif

#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 15cecd6..016b2ee 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -327,6 +327,43 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}

+static ssize_t use_dedup_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ bool val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->use_dedup;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val);
+}
+
+static ssize_t use_dedup_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change dedup usage for initialized device\n");
+ return -EBUSY;
+ }
+ zram->use_dedup = val;
+ up_write(&zram->init_lock);
+ return len;
+#else
+ return -EINVAL;
+#endif
+}
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -421,6 +458,12 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
+{
+ return zram_dedup_handle(zram, entry);
+}
+
static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
@@ -475,9 +518,10 @@ static void zram_meta_free(struct zram_meta *meta, u64 disksize)
kfree(meta);
}

-static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
+static struct zram_meta *zram_meta_alloc(struct zram *zram, u64 disksize)
{
size_t num_pages;
+ char *pool_name = zram->disk->disk_name;
struct zram_meta *meta = kzalloc(sizeof(*meta), GFP_KERNEL);

if (!meta)
@@ -490,7 +534,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
goto out_error;
}

- if (zram_dedup_init(meta, num_pages)) {
+ if (zram_dedup_init(zram, meta, num_pages)) {
pr_err("Error initializing zram entry hash\n");
goto out_error;
}
@@ -562,7 +606,8 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
return 0;
}

- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_RO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (size == PAGE_SIZE) {
copy_page(mem, cmem);
} else {
@@ -571,7 +616,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
ret = zcomp_decompress(zstrm, cmem, size, mem);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

/* Should NEVER happen. Return bio error if it does. */
@@ -771,7 +816,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
goto out;
}

- cmem = zs_map_object(meta->mem_pool, entry->handle, ZS_MM_WO);
+ cmem = zs_map_object(meta->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_WO);

if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
src = kmap_atomic(page);
@@ -783,7 +829,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,

zcomp_stream_put(zram->comp);
zstrm = NULL;
- zs_unmap_object(meta->mem_pool, entry->handle);
+ zs_unmap_object(meta->mem_pool, zram_entry_handle(zram, entry));
zram_dedup_insert(zram, entry, checksum);

found_dup:
@@ -1055,7 +1101,7 @@ static ssize_t disksize_store(struct device *dev,
return -EINVAL;

disksize = PAGE_ALIGN(disksize);
- meta = zram_meta_alloc(zram->disk->disk_name, disksize);
+ meta = zram_meta_alloc(zram, disksize);
if (!meta)
return -ENOMEM;

@@ -1166,6 +1212,7 @@ static DEVICE_ATTR_WO(mem_limit);
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+static DEVICE_ATTR_RW(use_dedup);

static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1176,6 +1223,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 968e269..53be62a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -138,6 +138,7 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};

void zram_entry_free(struct zram *zram, struct zram_meta *meta,
--
2.7.4

Andrew Morton

unread,
Mar 30, 2017, 6:30:57 PM3/30/17
to
On Thu, 30 Mar 2017 14:38:05 +0900 js1...@gmail.com wrote:

> This patchset implements deduplication feature in zram. Motivation
> is to save memory usage by zram. There are detailed description
> about motivation and experimental results on patch #2 so please
> refer it.

I'm getting a lot of rejects against the current -mm tree. So this is
one of those patchsets which should be prepared against -mm, please.

I'll be attempting to release another mmotm in the next couple of hours.

Minchan Kim

unread,
Mar 30, 2017, 7:50:05 PM3/30/17
to
Hi Andrew and Joonsoo,

On Thu, Mar 30, 2017 at 03:25:02PM -0700, Andrew Morton wrote:
> On Thu, 30 Mar 2017 14:38:05 +0900 js1...@gmail.com wrote:
>
> > This patchset implements deduplication feature in zram. Motivation
> > is to save memory usage by zram. There are detailed description
> > about motivation and experimental results on patch #2 so please
> > refer it.
>
> I'm getting a lot of rejects against the current -mm tree. So this is
> one of those patchsets which should be prepared against -mm, please.

That's due to my partial IO refactoring. Sorry for the inconvenience.
It's still ongoing and I want to merge it before a new feature like
dedup.

Joonsoo,

I will send the formal partial-IO rework patches next week and start to
review your patchset, so I hope you can resend your patchset based on
them, with my comments addressed.

Thanks.

Joonsoo Kim

unread,
Apr 6, 2017, 10:00:04 PM4/6/17
to
No problem. I will wait. :)

Thanks.

Minchan Kim

unread,
Apr 16, 2017, 9:40:05 PM4/16/17
to
Hi Joonsoo,

I reviewed this patch and overall, looks great! Thanks.

However, as you know, recently, zram had lots of clean up so
this patchset should be rebased on it massively.
Sorry for the inconvenience.

And there are some minor in below. I hope you handle them in
next submit, please.
What's the purpose of this RB_CLEAR_NODE?
I think we don't need it.
zram_entry_alloc can mask off GFP_HIGHMEM|__GFP_MOVABLE and
it can pass filtered gfp flags to zs_malloc and zram_dedup_alloc.

> + if (!entry)
> + return NULL;
> +
> + entry->handle = handle;
> + RB_CLEAR_NODE(&entry->rb_node);

Ditto. I think we don't need to clear rb_node.

> + entry->refcount = 1;
> + entry->len = len;
> +
> + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
> +
> + return entry;
> +}
> +
> +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry)
> +{
> + unsigned long handle;
> +
> + if (zram_dedup_put(zram, meta, entry))
> + return 0;
> +
> + handle = entry->handle;
> + kfree(entry);
> +
> + /* !zram happens when reset/fail and updating stat is useless */

With recent version, it's not true so your code would be more clean. :)

I don't like hiding the zram_entry allocation behind zram_dedup_alloc.
It's not wrong, since the zram_entry allocation only happens with the
dedup feature, but the zram_entry is owned by zram_drv, so I want
zram_drv to control zram_entry alloc/free.

How about this?

zram_entry_alloc
unsigned long handle;
struct zram_entry *entry;

handle = zs_malloc;
if (zram_dedup_enabled())
return handle;

entry = kzalloc();
zram_dedup_init(entry, handle, len);

return entry;


> + if (!entry) {
> + zs_free(meta->mem_pool, handle);
> return NULL;
> }
>
> return entry;
> }
>
> -static inline void zram_entry_free(struct zram_meta *meta,
> - struct zram_entry *entry)
> +void zram_entry_free(struct zram *zram, struct zram_meta *meta,
> + struct zram_entry *entry)
> {
> - zs_free(meta->mem_pool, entry->handle);
> - kfree(entry);
> + unsigned long handle;
> +
> + handle = zram_dedup_free(zram, meta, entry);

Ditto.
unsigned long handle = entry->handle;

if (zram_dedup_put(zram, meta, entry))
kfree(entry);

zs_free(meta->mem_pool, handle);

With this, In [4/4], I want that zram_entry_handle
doesn't rely on zram_dedup_handle.

unsigned long zram_entry_handle(struct zram *zram, struct zram_entry *entry)
{
	unsigned long handle;

	if (!zram_dedup_enabled(zram->meta))
		handle = (unsigned long)entry;
	else
		handle = entry->handle;

	return handle;
}
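
Put together, the suggestion amounts to allocating the wrapper only when
dedup is enabled, roughly like this (a sketch of the idea only, reusing
the names from this thread; stats accounting and the exact gfp handling
are left out):

	static struct zram_entry *zram_entry_alloc(struct zram *zram,
					unsigned int len, gfp_t flags)
	{
		struct zram_meta *meta = zram->meta;
		struct zram_entry *entry;
		unsigned long handle;

		handle = zs_malloc(meta->mem_pool, len, flags);
		if (!handle)
			return NULL;

		if (!zram_dedup_enabled(meta))
			return (struct zram_entry *)handle;	/* raw handle, no wrapper */

		entry = kzalloc(sizeof(*entry), flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
		if (!entry) {
			zs_free(meta->mem_pool, handle);
			return NULL;
		}

		/* zram_dedup_init() is the helper suggested above */
		zram_dedup_init(entry, handle, len);
		return entry;
	}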

Joonsoo Kim

unread,
Apr 16, 2017, 10:30:05 PM4/16/17
to
On Mon, Apr 17, 2017 at 10:38:16AM +0900, Minchan Kim wrote:
> Hi Joonsoo,
>
> I reviewed this patch and overall, looks great! Thanks.

Thanks!

> However, as you know, recently, zram had lots of clean up so
> this patchset should be rebased on it massively.
> Sorry for the inconvenience.

No problem.

>
> And there are some minor in below. I hope you handle them in
> next submit, please.

Okay.
I thought it was needed to initialize the rb entry, but on re-checking
it seems to be unnecessary. I will remove it.
Okay.

> > + if (!entry)
> > + return NULL;
> > +
> > + entry->handle = handle;
> > + RB_CLEAR_NODE(&entry->rb_node);
>
> Ditto. I think we don't need to clear rb_node.

Okay.

> > + entry->refcount = 1;
> > + entry->len = len;
> > +
> > + atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
> > +
> > + return entry;
> > +}
> > +
> > +unsigned long zram_dedup_free(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry)
> > +{
> > + unsigned long handle;
> > +
> > + if (zram_dedup_put(zram, meta, entry))
> > + return 0;
> > +
> > + handle = entry->handle;
> > + kfree(entry);
> > +
> > + /* !zram happens when reset/fail and updating stat is useless */
>
> With recent version, it's not true so your code would be more clean. :)

Good!
Okay.

> How about this?
>
> zram_entry_alloc
> unsigned long handle;
> struct zram_entry *entry;
>
> handle = zs_malloc;
> if (zram_dedup_enabled())
> return handle;
>
> entry = kzalloc();
> zram_dedup_init(entry, handle, len);
>
> return entry;
>
>

I will change it as you suggested.

> > + if (!entry) {
> > + zs_free(meta->mem_pool, handle);
> > return NULL;
> > }
> >
> > return entry;
> > }
> >
> > -static inline void zram_entry_free(struct zram_meta *meta,
> > - struct zram_entry *entry)
> > +void zram_entry_free(struct zram *zram, struct zram_meta *meta,
> > + struct zram_entry *entry)
> > {
> > - zs_free(meta->mem_pool, entry->handle);
> > - kfree(entry);
> > + unsigned long handle;
> > +
> > + handle = zram_dedup_free(zram, meta, entry);
>
> Ditto.
> unsigned long handle = entry->handle;
>
> if (zram_dedup_put(zram, meta, entry))
> kfree(entry);
>
> zs_free(meta->mem_pool, handle);
>
> With this, In [4/4], I want that zram_entry_handle
> doesn't rely on zram_dedup_handle.

Okay.

Thanks.

js1...@gmail.com

unread,
Apr 20, 2017, 9:20:05 PM4/20/17
to
drivers/block/zram/zram_dedup.c | 204 ++++++++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 22 +++++
drivers/block/zram/zram_drv.c | 38 ++++++--
drivers/block/zram/zram_drv.h | 17 ++++
6 files changed, 278 insertions(+), 7 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ dup_data_size deduplicated data size
+ meta_data_size the amount of metadata allocated for deduplication feature

9) Deactivate:
swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y := zcomp.o zram_drv.o
+zram-y := zcomp.o zram_drv.o zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
new file mode 100644
index 0000000..a8427f7
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2017 Joonsoo Kim.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/highmem.h>
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(zram->mem_pool, entry->handle);
+
+ return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram,
+ struct zram_entry *entry)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+
+ checksum = entry->checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+
+ entry->refcount--;
+ if (!entry->refcount)
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ else
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);
+
+ spin_unlock(&hash->lock);
+
+ return entry->refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ zram_entry_free(zram, entry);
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+ u32 *checksum)
+{
+ void *mem;
+ struct zram_entry *entry;
+
+ mem = kmap_atomic(page);
+ *checksum = zram_dedup_checksum(mem);
+
+ entry = zram_dedup_get(zram, mem, *checksum);
+ kunmap_atomic(mem);
+
+ return entry;
+}
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len)
+{
+ entry->handle = handle;
+ entry->refcount = 1;
+ entry->len = len;
+}
+
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
+{
+ if (zram_dedup_put(zram, entry))
+ return false;
+
+ return true;
+}
+
+int zram_dedup_init(struct zram *zram, size_t num_pages)
+{
+ int i;
+ struct zram_hash *hash;
+
+ zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+ zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size);
+ zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size);
+ zram->hash = vzalloc(zram->hash_size * sizeof(struct zram_hash));
+ if (!zram->hash) {
+ pr_err("Error allocating zram entry hash\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < zram->hash_size; i++) {
+ hash = &zram->hash[i];
+ spin_lock_init(&hash->lock);
+ hash->rb_root = RB_ROOT;
+ }
+
+ return 0;
+}
+
+void zram_dedup_fini(struct zram *zram)
+{
+ vfree(zram->hash);
+ zram->hash = NULL;
+ zram->hash_size = 0;
+}
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
new file mode 100644
index 0000000..ebe6bff
--- /dev/null
+++ b/drivers/block/zram/zram_dedup.h
@@ -0,0 +1,22 @@
+#ifndef _ZRAM_DEDUP_H_
+#define _ZRAM_DEDUP_H_
+
+struct zram;
+struct zram_entry;
+
+u64 zram_dedup_dup_size(struct zram *zram);
+u64 zram_dedup_meta_size(struct zram *zram);
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum);
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+ u32 *checksum);
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len);
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);
+
+int zram_dedup_init(struct zram *zram, size_t num_pages);
+void zram_dedup_fini(struct zram *zram);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 26dc4e5..8eab8a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -389,14 +389,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);

ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ zram_dedup_dup_size(zram),
+ zram_dedup_meta_size(zram));
up_read(&zram->init_lock);

return ret;
@@ -481,26 +483,34 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
struct zram_entry *entry;
+ unsigned long handle;

entry = kzalloc(sizeof(*entry),
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
if (!entry)
return NULL;

- entry->handle = zs_malloc(zram->mem_pool, len, flags);
- if (!entry->handle) {
+ handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!handle) {
kfree(entry);
return NULL;
}

+ zram_dedup_init_entry(zram, entry, handle, len);
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
return entry;
}

-static inline void zram_entry_free(struct zram *zram,
- struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_entry *entry)
{
+ if (!zram_dedup_put_entry(zram, entry))
+ return;
+
zs_free(zram->mem_pool, entry->handle);
kfree(entry);
+
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
}

static void zram_meta_free(struct zram *zram, u64 disksize)
@@ -513,6 +523,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
zram_free_page(zram, index);

zs_destroy_pool(zram->mem_pool);
+ zram_dedup_fini(zram);
vfree(zram->table);
}

@@ -531,6 +542,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
return false;
}

+ if (zram_dedup_init(zram, num_pages)) {
+ vfree(zram->table);
+ zs_destroy_pool(zram->mem_pool);
+ return false;
+ }
+
return true;
}

@@ -715,10 +732,17 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
void *src, *dst;
struct zcomp_strm *zstrm;
struct page *page = bvec->bv_page;
+ u32 checksum;

if (zram_same_page_write(zram, index, page))
return 0;

+ entry = zram_dedup_find(zram, page, &checksum);
+ if (entry) {
+ comp_len = entry->len;
+ goto found_dup;
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
if (ret) {
@@ -737,7 +761,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)

zcomp_stream_put(zram->comp);
zs_unmap_object(zram->mem_pool, entry->handle);
+ zram_dedup_insert(zram, entry, checksum);

+found_dup:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index fe3d216..4b86921 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>

#include "zcomp.h"
+#include "zram_dedup.h"

/*-- Configurable parameters */

@@ -70,6 +72,10 @@ enum zram_pageflags {
/*-- Data structures */

struct zram_entry {
+ struct rb_node rb_node;
+ u32 len;
+ u32 checksum;
+ unsigned long refcount;
unsigned long handle;
};

@@ -94,6 +100,13 @@ struct zram_stats {
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
+ atomic64_t dup_data_size; /* compressed size of pages duplicated */
+ atomic64_t meta_data_size; /* size of zram_entries */
+};
+
+struct zram_hash {
+ spinlock_t lock;
+ struct rb_root rb_root;
};

struct zram {
@@ -101,6 +114,8 @@ struct zram {
struct zs_pool *mem_pool;
struct zcomp *comp;
struct gendisk *disk;
+ struct zram_hash *hash;
+ size_t hash_size;
/* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
/*
@@ -120,4 +135,6 @@ struct zram {
*/
bool claim; /* Protected by bdev->bd_mutex */
};
+
+void zram_entry_free(struct zram *zram, struct zram_entry *entry);
#endif
--
2.7.4

js1...@gmail.com

unread,
Apr 20, 2017, 9:20:06 PM4/20/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The benefit of deduplication depends on the workload, so it's not
preferable to always enable it. Therefore, make it optional via Kconfig
and a device parameter. The default is 'off'. This option will be
beneficial for users who use zram as a blockdev and store build output
on it.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/ABI/testing/sysfs-block-zram | 10 ++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 ++++++
drivers/block/zram/Makefile | 5 +-
drivers/block/zram/zram_dedup.c | 15 ++++++
drivers/block/zram/zram_dedup.h | 23 +++++++++
drivers/block/zram/zram_drv.c | 81 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 5 ++
8 files changed, 141 insertions(+), 13 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d8..3c1945f 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,13 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
+
+What: /sys/block/zram<id>/use_dedup
+Date: March 2017
+Contact: Joonsoo Kim <iamjoon...@lge.com>
+Description:
+ The use_dedup file is read-write and specifies whether the
+ deduplication feature is used or not. If enabled, duplicated data
+ is managed by reference count and will not be stored in memory
+ twice. The benefit of this feature largely depends on the
+ workload, so pay attention when using it.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 2cdc303..cbbe39b 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 29cb008..1f6fecd 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y := zcomp.o zram_drv.o zram_dedup.o
+zram-y := zcomp.o zram_drv.o

-obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM) += zram.o
+obj-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index a8427f7..560b1f5 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -41,6 +41,9 @@ void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
struct rb_node **rb_node, *parent = NULL;
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram))
+ return;
+
new->checksum = checksum;
hash = &zram->hash[checksum % zram->hash_size];
rb_root = &hash->rb_root;
@@ -148,6 +151,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
void *mem;
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram))
+ return NULL;
+
mem = kmap_atomic(page);
*checksum = zram_dedup_checksum(mem);

@@ -160,6 +166,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
unsigned long handle, unsigned int len)
{
+ if (!zram_dedup_enabled(zram))
+ return;
+
entry->handle = handle;
entry->refcount = 1;
entry->len = len;
@@ -167,6 +176,9 @@ void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,

bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
{
+ if (!zram_dedup_enabled(zram))
+ return true;
+
if (zram_dedup_put(zram, entry))
return false;

@@ -178,6 +190,9 @@ int zram_dedup_init(struct zram *zram, size_t num_pages)
int i;
struct zram_hash *hash;

+ if (!zram_dedup_enabled(zram))
+ return 0;
+
zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size);
zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size);
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
index ebe6bff..8ab267b 100644
--- a/drivers/block/zram/zram_dedup.h
+++ b/drivers/block/zram/zram_dedup.h
@@ -4,6 +4,8 @@
struct zram;
struct zram_entry;

+#ifdef CONFIG_ZRAM_DEDUP
+
u64 zram_dedup_dup_size(struct zram *zram);
u64 zram_dedup_meta_size(struct zram *zram);

@@ -18,5 +20,26 @@ bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);

int zram_dedup_init(struct zram *zram, size_t num_pages);
void zram_dedup_fini(struct zram *zram);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+ struct page *page, u32 *checksum) { return NULL; }
+
+static inline void zram_dedup_init_entry(struct zram *zram,
+ struct zram_entry *entry, unsigned long handle,
+ unsigned int len) { }
+static inline bool zram_dedup_put_entry(struct zram *zram,
+ struct zram_entry *entry) { return true; }
+
+static inline int zram_dedup_init(struct zram *zram,
+ size_t num_pages) { return 0; }
+static inline void zram_dedup_fini(struct zram *zram) { }
+
+#endif

#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8eab8a0..372602c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -333,6 +333,41 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}

+static ssize_t use_dedup_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ bool val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->use_dedup;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val);
+}
+
+#ifdef CONFIG_ZRAM_DEDUP
+static ssize_t use_dedup_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change dedup usage for initialized device\n");
+ return -EBUSY;
+ }
+ zram->use_dedup = val;
+ up_write(&zram->init_lock);
+ return len;
+}
+#endif
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -425,6 +460,15 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
+{
+ if (zram_dedup_enabled(zram))
+ return entry->handle;
+ else
+ return (unsigned long)entry;
+}
+
static void zram_slot_lock(struct zram *zram, u32 index)
{
bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
@@ -485,14 +529,17 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
struct zram_entry *entry;
unsigned long handle;

- entry = kzalloc(sizeof(*entry),
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
- if (!entry)
+ handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!handle)
return NULL;

- handle = zs_malloc(zram->mem_pool, len, flags);
- if (!handle) {
- kfree(entry);
+ if (!zram_dedup_enabled(zram))
+ return (struct zram_entry *)handle;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry) {
+ zs_free(zram->mem_pool, handle);
return NULL;
}

@@ -507,7 +554,11 @@ void zram_entry_free(struct zram *zram, struct zram_entry *entry)
if (!zram_dedup_put_entry(zram, entry))
return;

- zs_free(zram->mem_pool, entry->handle);
+ zs_free(zram->mem_pool, zram_entry_handle(zram, entry));
+
+ if (!zram_dedup_enabled(zram))
+ return;
+
kfree(entry);

atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
@@ -598,7 +649,8 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
entry = zram_get_entry(zram, index);
size = zram_get_obj_size(zram, index);

- src = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+ src = zs_map_object(zram->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (size == PAGE_SIZE) {
dst = kmap_atomic(page);
memcpy(dst, src, PAGE_SIZE);
@@ -612,7 +664,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
kunmap_atomic(dst);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(zram->mem_pool, entry->handle);
+ zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
zram_slot_unlock(zram, index);

/* Should NEVER happen. Return bio error if it does. */
@@ -750,7 +802,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
return ret;
}

- dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_WO);

src = zstrm->buffer;
if (comp_len == PAGE_SIZE)
@@ -760,7 +813,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
kunmap_atomic(src);

zcomp_stream_put(zram->comp);
- zs_unmap_object(zram->mem_pool, entry->handle);
+ zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
zram_dedup_insert(zram, entry, checksum);

found_dup:
@@ -1159,6 +1212,11 @@ static DEVICE_ATTR_WO(mem_limit);
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_DEDUP
+static DEVICE_ATTR_RW(use_dedup);
+#else
+static DEVICE_ATTR_RO(use_dedup);
+#endif

static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1169,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4b86921..3f7649a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,7 +134,12 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};

+static inline bool zram_dedup_enabled(struct zram *zram)
+{
+ return zram->use_dedup;
+}

js1...@gmail.com

unread,
Apr 20, 2017, 9:20:06 PM4/20/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Changes from v2
o rebase to latest zram code
o manage alloc/free of the zram_entry in zram_drv.c
o remove useless RB_CLEAR_NODE
o set RO permission for use_dedup sysfs entry if CONFIG_ZRAM_DEDUP=n

Changes from v1
o reorganize dedup-specific functions
o support Kconfig on/off
o update zram documents
o compare all the entries with same checksum (patch #4)

This patchset implements a deduplication feature in zram. The motivation
is to save memory used by zram. There is a detailed description of the
motivation and experimental results in patch #2, so please refer to it.

Thanks.

Joonsoo Kim (4):
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional
zram: compare all the entries with same checksum for deduplication

Documentation/ABI/testing/sysfs-block-zram | 10 ++
Documentation/blockdev/zram.txt | 3 +
drivers/block/zram/Kconfig | 14 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 254 +++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 45 +++++
drivers/block/zram/zram_drv.c | 184 +++++++++++++++++----
drivers/block/zram/zram_drv.h | 28 +++-
8 files changed, 503 insertions(+), 38 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

--
2.7.4

js1...@gmail.com

unread,
Apr 20, 2017, 9:20:06 PM4/20/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Until now, we compare just one entry with the same checksum when
checking for duplication, since that is the simplest way to implement it.
However, for completeness, checking all the entries is better, so this
patch implements comparing all the entries with the same checksum.
Since this event should be rare, there should be no performance loss.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_dedup.c | 59 ++++++++++++++++++++++++++++++++---------
1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 560b1f5..14c4988 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -109,6 +109,51 @@ static unsigned long zram_dedup_put(struct zram *zram,
return entry->refcount;
}

+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+ struct zram_hash *hash, unsigned char *mem,
+ struct zram_entry *entry)
+{
+ struct zram_entry *tmp, *prev = NULL;
+ struct rb_node *rb_node;
+
+ /* find left-most entry with same checksum */
+ while ((rb_node = rb_prev(&entry->rb_node))) {
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (tmp->checksum != entry->checksum)
+ break;
+
+ entry = tmp;
+ }
+
+again:
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (prev)
+ zram_entry_free(zram, prev);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ spin_lock(&hash->lock);
+ tmp = NULL;
+ rb_node = rb_next(&entry->rb_node);
+ if (rb_node)
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+
+ if (tmp && (tmp->checksum == entry->checksum)) {
+ prev = entry;
+ entry = tmp;
+ goto again;
+ }
+
+ spin_unlock(&hash->lock);
+ zram_entry_free(zram, entry);
+
+ return NULL;
+}
+
static struct zram_entry *zram_dedup_get(struct zram *zram,
unsigned char *mem, u32 checksum)
{
@@ -122,18 +167,8 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
rb_node = hash->rb_root.rb_node;
while (rb_node) {
entry = rb_entry(rb_node, struct zram_entry, rb_node);
- if (checksum == entry->checksum) {
- entry->refcount++;
- atomic64_add(entry->len, &zram->stats.dup_data_size);
- spin_unlock(&hash->lock);
-
- if (zram_dedup_match(zram, entry, mem))
- return entry;
-
- zram_entry_free(zram, entry);
-
- return NULL;
- }
+ if (checksum == entry->checksum)

Minchan Kim

unread,
Apr 25, 2017, 3:00:08 AM4/25/17
to
Hi Joonsoo,

On Fri, Apr 21, 2017 at 10:14:47AM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> Changes from v2
> o rebase to latest zram code
> o manage alloc/free of the zram_entry in zram_drv.c
> o remove useless RB_CLEAR_NODE
> o set RO permission for use_dedup sysfs entry if CONFIG_ZRAM_DEDUP=n
>
> Changes from v1
> o reorganize dedup-specific functions
> o support Kconfig on/off
> o update zram documents
> o compare all the entries with same checksum (patch #4)
>
> This patchset implements a deduplication feature in zram. The motivation
> is to save memory used by zram. There is a detailed description of the
> motivation and experimental results in patch #2, so please refer to it.
>
> Thanks.

To all patches:

Acked-by: Minchan Kim <min...@kernel.org>

If you send new version due to trivial stuff I mentioned,
feel free to add my Acked-by to the patchset.

Thanks!

Minchan Kim

unread,
Apr 25, 2017, 3:00:08 AM4/25/17
to
On Fri, Apr 21, 2017 at 10:14:50AM +0900, js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> The benefit of deduplication depends on the workload, so it's not
> preferable to always enable it. Therefore, make it optional in Kconfig
> and as a device parameter. The default is 'off'. This option will be
> beneficial for users who use zram as a block device and store build
> output on it.
>
> Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>

< snip >

>
> static struct attribute *zram_disk_attrs[] = {
> &dev_attr_disksize.attr,
> @@ -1169,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = {
> &dev_attr_mem_used_max.attr,
> &dev_attr_max_comp_streams.attr,
> &dev_attr_comp_algorithm.attr,
> + &dev_attr_use_dedup.attr,
> &dev_attr_io_stat.attr,
> &dev_attr_mm_stat.attr,
> &dev_attr_debug_stat.attr,
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 4b86921..3f7649a 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -134,7 +134,12 @@ struct zram {
> * zram is claimed so open request will be failed
> */
> bool claim; /* Protected by bdev->bd_mutex */
> + bool use_dedup;
> };
>
> +static inline bool zram_dedup_enabled(struct zram *zram)
> +{
> + return zram->use_dedup;

#ifdef CONFIG_ZRAM_DEDUP
return zram->use_dedup;
#else
return false;
#endif

Otherwise, looks good to me.

Joonsoo Kim

unread,
Apr 25, 2017, 4:20:05 AM4/25/17
to
Thanks!

Here goes new one.

Thanks.

------------------>8---------------------
From fe6e25890aae945964dc254442cd830e3005627e Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoon...@lge.com>
Date: Fri, 21 Apr 2017 09:50:42 +0900
Subject: [PATCH v3 3/4 -fix] zram: make deduplication feature optional

The benefit of deduplication depends on the workload, so it's not
preferable to always enable it. Therefore, make it optional in Kconfig
and as a device parameter. The default is 'off'. This option will be
beneficial for users who use zram as a block device and store build
output on it.

Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/ABI/testing/sysfs-block-zram | 10 ++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 ++++++
drivers/block/zram/Makefile | 5 +-
drivers/block/zram/zram_dedup.c | 15 ++++++
drivers/block/zram/zram_dedup.h | 23 +++++++++
drivers/block/zram/zram_drv.c | 81 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 9 ++++
8 files changed, 145 insertions(+), 13 deletions(-)
static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1169,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4b86921..102fb7a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,7 +134,16 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};

+static inline bool zram_dedup_enabled(struct zram *zram)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+ return zram->use_dedup;
+#else
+ return 0;
+#endif

Joonsoo Kim

unread,
Apr 25, 2017, 4:20:06 AM4/25/17
to
Hello, Andrew.

There is one trivial stuff so I sent the new 3/4 patch.
If you want to resend the full series, I will do it.
Please let me know your preference.

Thanks.

Sergey Senozhatsky

unread,
Apr 25, 2017, 6:30:05 AM4/25/17
to
Hello,

On (04/21/17 10:14), js1...@gmail.com wrote:
[..]
> int zram_dedup_init(struct zram *zram, size_t num_pages);
> void zram_dedup_fini(struct zram *zram);
> +#else
> +
> +static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
> +static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
> +
> +static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
> + u32 checksum) { }
> +static inline struct zram_entry *zram_dedup_find(struct zram *zram,
> + struct page *page, u32 *checksum) { return NULL; }
> +
> +static inline void zram_dedup_init_entry(struct zram *zram,
> + struct zram_entry *entry, unsigned long handle,
> + unsigned int len) { }
> +static inline bool zram_dedup_put_entry(struct zram *zram,
> + struct zram_entry *entry) { return true; }
> +
> +static inline int zram_dedup_init(struct zram *zram,
> + size_t num_pages) { return 0; }
> +static inline void zram_dedup_fini(struct zram *zram) { }
> +
> +#endif

doesn't compile on CONFIG_ZRAM=m config.

-ss

js1...@gmail.com

unread,
Apr 25, 2017, 9:00:07 PM4/25/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The benefit of deduplication depends on the workload, so it's not
preferable to always enable it. Therefore, make it optional in Kconfig
and as a device parameter. The default is 'off'. This option will be
beneficial for users who use zram as a block device and store build
output on it.

Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/ABI/testing/sysfs-block-zram | 10 ++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 ++++++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 15 ++++++
drivers/block/zram/zram_dedup.h | 23 +++++++++
drivers/block/zram/zram_drv.c | 81 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 9 ++++
8 files changed, 144 insertions(+), 12 deletions(-)
index 29cb008..d7204ef 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y := zcomp.o zram_drv.o zram_dedup.o
+zram-y := zcomp.o zram_drv.o
+zram-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
int zram_dedup_init(struct zram *zram, size_t num_pages);
void zram_dedup_fini(struct zram *zram);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+ struct page *page, u32 *checksum) { return NULL; }
+
+static inline void zram_dedup_init_entry(struct zram *zram,
+ struct zram_entry *entry, unsigned long handle,
+ unsigned int len) { }
+static inline bool zram_dedup_put_entry(struct zram *zram,
+ struct zram_entry *entry) { return true; }
+
+static inline int zram_dedup_init(struct zram *zram,
+ size_t num_pages) { return 0; }
+static inline void zram_dedup_fini(struct zram *zram) { }
+
+#endif

+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
index 4b86921..0091e23 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,7 +134,16 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};

+static inline bool zram_dedup_enabled(struct zram *zram)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+ return zram->use_dedup;
+#else
+ return false;

js1...@gmail.com

unread,
Apr 25, 2017, 9:00:07 PM4/25/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Until now, we compare just one entry with the same checksum when
checking for duplication, since that is the simplest way to implement it.
However, for completeness, checking all the entries is better, so this
patch implements comparing all the entries with the same checksum.
Since this event should be rare, there should be no performance loss.

Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_dedup.c | 59 ++++++++++++++++++++++++++++++++---------
1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 560b1f5..14c4988 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -109,6 +109,51 @@ static unsigned long zram_dedup_put(struct zram *zram,
return entry->refcount;
}

+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+ struct zram_hash *hash, unsigned char *mem,
+ struct zram_entry *entry)
+{

js1...@gmail.com

unread,
Apr 25, 2017, 9:00:07 PM4/25/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Changes from v3
o fix module build problem
o make zram_dedup_enabled() statically return false if CONFIG_ZRAM_DEDUP=n

Changes from v2
o rebase to latest zram code
o manage alloc/free of the zram_entry in zram_drv.c
o remove useless RB_CLEAR_NODE
o set RO permission for use_dedup sysfs entry if CONFIG_ZRAM_DEDUP=n

Changes from v1
o reorganize dedup-specific functions
o support Kconfig on/off
o update zram documents
o compare all the entries with same checksum (patch #4)

This patchset implements a deduplication feature in zram. The motivation
is to save memory used by zram. There is a detailed description of the
motivation and experimental results in patch #2, so please refer to it.

Thanks.

Joonsoo Kim (4):
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional
zram: compare all the entries with same checksum for deduplication

Documentation/ABI/testing/sysfs-block-zram | 10 ++
Documentation/blockdev/zram.txt | 3 +
drivers/block/zram/Kconfig | 14 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 254 +++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 45 +++++
drivers/block/zram/zram_drv.c | 184 +++++++++++++++++----
drivers/block/zram/zram_drv.h | 32 +++-
8 files changed, 507 insertions(+), 38 deletions(-)

js1...@gmail.com

unread,
Apr 25, 2017, 9:00:07 PM4/25/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The following patch will implement deduplication functionality
in zram, and it requires an indirection layer to manage
the life cycle of the zsmalloc handle. To prepare for that, this patch
introduces zram_entry, which can be used to manage the life cycle
of the zsmalloc handle. Many lines are changed due to the rename, but the
core change is just the simple introduction of the new data structure.

Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 95 +++++++++++++++++++++++++++----------------
drivers/block/zram/zram_drv.h | 6 ++-
2 files changed, 66 insertions(+), 35 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index debee95..26dc4e5 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -57,14 +57,15 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}

-static unsigned long zram_get_handle(struct zram *zram, u32 index)
+static struct zram_entry *zram_get_entry(struct zram *zram, u32 index)
{
- return zram->table[index].handle;
+ return zram->table[index].entry;
}

-static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+static void zram_set_entry(struct zram *zram, u32 index,
+ struct zram_entry *entry)
{
- zram->table[index].handle = handle;
+ zram->table[index].entry = entry;
}

/* flag operations require table entry bit_spin_lock() being held */
@@ -437,7 +438,7 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
unsigned int offset, unsigned int len)
{
zram_slot_lock(zram, index);
- if (unlikely(!zram_get_handle(zram, index) ||
+ if (unlikely(!zram_get_entry(zram, index) ||
zram_test_flag(zram, index, ZRAM_SAME))) {
void *mem;

@@ -476,6 +477,32 @@ static bool zram_same_page_write(struct zram *zram, u32 index,
return false;
}

+static struct zram_entry *zram_entry_alloc(struct zram *zram,
+ unsigned int len, gfp_t flags)
+{
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry)
+ return NULL;
+
+ entry->handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!entry->handle) {
+ kfree(entry);
+ return NULL;
+ }
+
+ return entry;
+}
+
+static inline void zram_entry_free(struct zram *zram,
+ struct zram_entry *entry)
+{
+ zs_free(zram->mem_pool, entry->handle);
+ kfree(entry);
+}
+
static void zram_meta_free(struct zram *zram, u64 disksize)
{
size_t num_pages = disksize >> PAGE_SHIFT;
@@ -514,7 +541,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
*/
static void zram_free_page(struct zram *zram, size_t index)
{
- unsigned long handle = zram_get_handle(zram, index);
+ struct zram_entry *entry = zram_get_entry(zram, index);

/*
* No memory is allocated for same element filled pages.
@@ -527,23 +554,23 @@ static void zram_free_page(struct zram *zram, size_t index)
return;
}

- if (!handle)
+ if (!entry)
return;

- zs_free(zram->mem_pool, handle);
+ zram_entry_free(zram, entry);

atomic64_sub(zram_get_obj_size(zram, index),
&zram->stats.compr_data_size);
atomic64_dec(&zram->stats.pages_stored);

- zram_set_handle(zram, index, 0);
+ zram_set_entry(zram, index, NULL);
zram_set_obj_size(zram, index, 0);
}

static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
{
int ret;
- unsigned long handle;
+ struct zram_entry *entry;
unsigned int size;
void *src, *dst;

@@ -551,10 +578,10 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
return 0;

zram_slot_lock(zram, index);
- handle = zram_get_handle(zram, index);
+ entry = zram_get_entry(zram, index);
size = zram_get_obj_size(zram, index);

- src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+ src = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
if (size == PAGE_SIZE) {
dst = kmap_atomic(page);
memcpy(dst, src, PAGE_SIZE);
@@ -568,7 +595,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
kunmap_atomic(dst);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(zram->mem_pool, handle);
+ zs_unmap_object(zram->mem_pool, entry->handle);
zram_slot_unlock(zram, index);

/* Should NEVER happen. Return bio error if it does. */
@@ -612,14 +639,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
}

static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
- struct page *page,
- unsigned long *out_handle, unsigned int *out_comp_len)
+ struct page *page, struct zram_entry **out_entry,
+ unsigned int *out_comp_len)
{
int ret;
unsigned int comp_len;
void *src;
unsigned long alloced_pages;
- unsigned long handle = 0;
+ struct zram_entry *entry = NULL;

compress_again:
src = kmap_atomic(page);
@@ -628,8 +655,8 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,

if (unlikely(ret)) {
pr_err("Compression failed! err=%d\n", ret);
- if (handle)
- zs_free(zram->mem_pool, handle);
+ if (entry)
+ zram_entry_free(zram, entry);
return ret;
}

@@ -637,32 +664,32 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
comp_len = PAGE_SIZE;

/*
- * handle allocation has 2 paths:
+ * entry allocation has 2 paths:
* a) fast path is executed with preemption disabled (for
* per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
* since we can't sleep;
* b) slow path enables preemption and attempts to allocate
* the page with __GFP_DIRECT_RECLAIM bit set. we have to
* put per-cpu compression stream and, thus, to re-do
- * the compression once handle is allocated.
+ * the compression once entry is allocated.
*
- * if we have a 'non-null' handle here then we are coming
- * from the slow path and handle has already been allocated.
+ * if we have a 'non-null' entry here then we are coming
+ * from the slow path and entry has already been allocated.
*/
- if (!handle)
- handle = zs_malloc(zram->mem_pool, comp_len,
+ if (!entry)
+ entry = zram_entry_alloc(zram, comp_len,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
__GFP_HIGHMEM |
__GFP_MOVABLE);
- if (!handle) {
+ if (!entry) {
zcomp_stream_put(zram->comp);
atomic64_inc(&zram->stats.writestall);
- handle = zs_malloc(zram->mem_pool, comp_len,
+ entry = zram_entry_alloc(zram, comp_len,
GFP_NOIO | __GFP_HIGHMEM |
__GFP_MOVABLE);
*zstrm = zcomp_stream_get(zram->comp);
- if (handle)
+ if (entry)
goto compress_again;
return -ENOMEM;
}
@@ -671,11 +698,11 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
update_used_max(zram, alloced_pages);

if (zram->limit_pages && alloced_pages > zram->limit_pages) {
- zs_free(zram->mem_pool, handle);
+ zram_entry_free(zram, entry);
return -ENOMEM;
}

- *out_handle = handle;
+ *out_entry = entry;
*out_comp_len = comp_len;
return 0;
}
@@ -683,7 +710,7 @@ static int zram_compress(struct zram *zram, struct zcomp_strm **zstrm,
static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
{
int ret;
- unsigned long handle;
+ struct zram_entry *entry;
unsigned int comp_len;
void *src, *dst;
struct zcomp_strm *zstrm;
@@ -693,13 +720,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
return 0;

zstrm = zcomp_stream_get(zram->comp);
- ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+ ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
if (ret) {
zcomp_stream_put(zram->comp);
return ret;
}

- dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);

src = zstrm->buffer;
if (comp_len == PAGE_SIZE)
@@ -709,7 +736,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
kunmap_atomic(src);

zcomp_stream_put(zram->comp);
- zs_unmap_object(zram->mem_pool, handle);
+ zs_unmap_object(zram->mem_pool, entry->handle);

/*
* Free memory associated with this sector
@@ -717,7 +744,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
*/
zram_slot_lock(zram, index);
zram_free_page(zram, index);
- zram_set_handle(zram, index, handle);
+ zram_set_entry(zram, index, entry);
zram_set_obj_size(zram, index, comp_len);
zram_slot_unlock(zram, index);

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d..fe3d216 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,10 +69,14 @@ enum zram_pageflags {

/*-- Data structures */

+struct zram_entry {
+ unsigned long handle;
+};
+

Joonsoo Kim

unread,
Apr 25, 2017, 9:00:07 PM4/25/17
to
2017-04-25 19:24 GMT+09:00 Sergey Senozhatsky
<sergey.seno...@gmail.com>:
Hello,

Good catch!
I fixed it and sent update version, v4.

Thanks.

js1...@gmail.com

unread,
Apr 25, 2017, 9:00:08 PM4/25/17
to
Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/blockdev/zram.txt | 2 +
drivers/block/zram/Makefile | 2 +-
drivers/block/zram/zram_dedup.c | 204 ++++++++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 22 +++++
drivers/block/zram/zram_drv.c | 38 ++++++--
drivers/block/zram/zram_drv.h | 17 ++++
6 files changed, 278 insertions(+), 7 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ dup_data_size deduplicated data size
+ meta_data_size the amount of metadata allocated for deduplication feature

9) Deactivate:
swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y := zcomp.o zram_drv.o
+zram-y := zcomp.o zram_drv.o zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
+u64 zram_dedup_dup_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.dup_data_size);
+}
+
+u64 zram_dedup_meta_size(struct zram *zram)
+{
+ return (u64)atomic64_read(&zram->stats.meta_data_size);
+}
+
+static u32 zram_dedup_checksum(unsigned char *mem)
+{
+ return jhash(mem, PAGE_SIZE, 0);
+}
+
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum)
+{
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+ rb_root = &hash->rb_root;
+
+ spin_lock(&hash->lock);
+ rb_node = &rb_root->rb_node;
+ while (*rb_node) {
+ parent = *rb_node;
+ entry = rb_entry(parent, struct zram_entry, rb_node);
+ if (checksum < entry->checksum)
+ rb_node = &parent->rb_left;
+ else if (checksum > entry->checksum)
+ rb_node = &parent->rb_right;
+ else
+ rb_node = &parent->rb_left;
+ }
+
+ rb_link_node(&new->rb_node, parent, rb_node);
+ rb_insert_color(&new->rb_node, rb_root);
+ spin_unlock(&hash->lock);
+}
+
+static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
+ unsigned char *mem)
+{
+ bool match = false;
+ unsigned char *cmem;
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(zram->mem_pool, entry->handle);
+
+ return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram,
+ struct zram_entry *entry)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+
+ checksum = entry->checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+
+ entry->refcount--;
+ if (!entry->refcount)
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ else
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);
+
+ spin_unlock(&hash->lock);
+
+ return entry->refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ zram_entry_free(zram, entry);
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+ u32 *checksum)
+{
+ void *mem;
+ struct zram_entry *entry;
+
+ mem = kmap_atomic(page);
+ *checksum = zram_dedup_checksum(mem);
+
+ entry = zram_dedup_get(zram, mem, *checksum);
+ kunmap_atomic(mem);
+
+ return entry;
+}
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len)
+{
+ entry->handle = handle;
+ entry->refcount = 1;
+ entry->len = len;
+}
+
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
+{
+ if (zram_dedup_put(zram, entry))
+ return false;
+
+ return true;
+}
+
+int zram_dedup_init(struct zram *zram, size_t num_pages)
+{
+ int i;
+ struct zram_hash *hash;
+
+ zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len);
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);
+
+int zram_dedup_init(struct zram *zram, size_t num_pages);
+void zram_dedup_fini(struct zram *zram);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 26dc4e5..8eab8a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -389,14 +389,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);

ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ zram_dedup_dup_size(zram),
+ zram_dedup_meta_size(zram));
up_read(&zram->init_lock);

return ret;
@@ -481,26 +483,34 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
struct zram_entry *entry;
+ unsigned long handle;

entry = kzalloc(sizeof(*entry),
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
if (!entry)
return NULL;

- entry->handle = zs_malloc(zram->mem_pool, len, flags);
- if (!entry->handle) {
+ handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!handle) {
kfree(entry);
return NULL;
}

+ zram_dedup_init_entry(zram, entry, handle, len);
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
return entry;
}

-static inline void zram_entry_free(struct zram *zram,
- struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_entry *entry)
{
+ if (!zram_dedup_put_entry(zram, entry))
+ return;
+
zs_free(zram->mem_pool, entry->handle);
kfree(entry);
+
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
}

static void zram_meta_free(struct zram *zram, u64 disksize)
@@ -513,6 +523,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
zram_free_page(zram, index);

zs_destroy_pool(zram->mem_pool);
+ zram_dedup_fini(zram);
vfree(zram->table);
}

@@ -531,6 +542,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
return false;
}

+ if (zram_dedup_init(zram, num_pages)) {
+ vfree(zram->table);
+ zs_destroy_pool(zram->mem_pool);
+ return false;
+ }
+
return true;
}

@@ -715,10 +732,17 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
void *src, *dst;
struct zcomp_strm *zstrm;
struct page *page = bvec->bv_page;
+ u32 checksum;

if (zram_same_page_write(zram, index, page))
return 0;

+ entry = zram_dedup_find(zram, page, &checksum);
+ if (entry) {
+ comp_len = entry->len;
+ goto found_dup;
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
if (ret) {
@@ -737,7 +761,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)

zcomp_stream_put(zram->comp);
zs_unmap_object(zram->mem_pool, entry->handle);
+ zram_dedup_insert(zram, entry, checksum);

+found_dup:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index fe3d216..4b86921 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>

#include "zcomp.h"
+#include "zram_dedup.h"

/*-- Configurable parameters */

@@ -70,6 +72,10 @@ enum zram_pageflags {
/*-- Data structures */
*/
bool claim; /* Protected by bdev->bd_mutex */
};
+
+void zram_entry_free(struct zram *zram, struct zram_entry *entry);
#endif
--
2.7.4

Sergey Senozhatsky

unread,
Apr 25, 2017, 10:20:04 PM4/25/17
to
Hello,

On (04/26/17 09:52), js1...@gmail.com wrote:
[..]
> ret = scnprintf(buf, PAGE_SIZE,
> - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> orig_size << PAGE_SHIFT,
> (u64)atomic64_read(&zram->stats.compr_data_size),
> mem_used << PAGE_SHIFT,
> zram->limit_pages << PAGE_SHIFT,
> max_used << PAGE_SHIFT,
> (u64)atomic64_read(&zram->stats.same_pages),
> - pool_stats.pages_compacted);
> + pool_stats.pages_compacted,
> + zram_dedup_dup_size(zram),
> + zram_dedup_meta_size(zram));

hm... shouldn't we subtract zram_dedup_dup_size(zram) from
->stats.compr_data_size? we don't use extra memory for dedupped
pages. or don't inc ->stats.compr_data_size for dedupped pages?

for instance, same_page_write() does not inc ->stats.compr_data_size,
while successful zram_dedup_find() does (in __zram_bvec_write()).
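
(as a rough illustration, using the dedup mm_stat posted in the cover letter:
compr_data_size 3657329322 currently includes dup_data_size 952568877, so the
compressed data actually held in zsmalloc is about 3657329322 - 952568877 =
2704760445 bytes; add meta_data_size 80880336 and you get close to the
reported mem_used 2832015360, the remainder being zsmalloc overhead.
subtracting the dup size, or not adding it in the first place, would make
compr_data_size reflect that directly.)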

-ss

Sergey Senozhatsky

unread,
Apr 25, 2017, 10:40:05 PM4/25/17
to
On (04/26/17 09:52), js1...@gmail.com wrote:
[..]
> struct zram_entry {
> + struct rb_node rb_node;
> + u32 len;
> + u32 checksum;
> + unsigned long refcount;

use refcount_t? what do you think?

-ss

Sergey Senozhatsky

unread,
Apr 26, 2017, 12:10:05 AM4/26/17
to
On (04/26/17 09:52), js1...@gmail.com wrote:
[..]
> <no-dedup>
> Elapsed time: out/host: 88 s
> mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
>
> <dedup>
> Elapsed time: out/host: 100 s
> mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
>
> It shows performance degradation roughly 13% and save 24% memory. Maybe,
> it is due to overhead of calculating checksum and comparison.

I like the patch set, and it makes sense. the benefit is, obviously,
case-by-case. on my system I've managed to save just 60MB on a 2.7G
data set, which is far less than I was hoping to save :)


I usually do DIRECT IO fio performance test. JFYI, the results
were as follows:

NO DEDUP DEDUP
#jobs1
WRITE: (586MB/s) (235MB/s)
WRITE: (535MB/s) (226MB/s)
READ: (415MB/s) (189MB/s)
WRITE: (416MB/s) (190MB/s)
READ: (382MB/s) (375MB/s)
READ: (312MB/s) (314MB/s)
READ: (197MB/s) (166MB/s)
WRITE: (197MB/s) (166MB/s)
#jobs2
WRITE: (1039MB/s) (447MB/s)
WRITE: (642MB/s) (262MB/s)
READ: (654MB/s) (337MB/s)
WRITE: (654MB/s) (337MB/s)
READ: (1444MB/s) (1184MB/s)
READ: (1291MB/s) (1140MB/s)
READ: (451MB/s) (223MB/s)
WRITE: (451MB/s) (224MB/s)
#jobs3
WRITE: (1175MB/s) (648MB/s)
WRITE: (678MB/s) (277MB/s)
READ: (921MB/s) (509MB/s)
WRITE: (921MB/s) (509MB/s)
READ: (1322MB/s) (1146MB/s)
READ: (1642MB/s) (1266MB/s)
READ: (583MB/s) (248MB/s)
WRITE: (582MB/s) (248MB/s)
#jobs4
WRITE: (1734MB/s) (878MB/s)
WRITE: (712MB/s) (291MB/s)
READ: (1298MB/s) (695MB/s)
WRITE: (1300MB/s) (695MB/s)
READ: (1885MB/s) (2197MB/s)
READ: (2908MB/s) (2256MB/s)
READ: (621MB/s) (268MB/s)
WRITE: (622MB/s) (268MB/s)
#jobs5
WRITE: (2260MB/s) (1104MB/s)
WRITE: (740MB/s) (296MB/s)
READ: (1578MB/s) (900MB/s)
WRITE: (1579MB/s) (901MB/s)
READ: (2378MB/s) (1969MB/s)
READ: (2573MB/s) (1753MB/s)
READ: (650MB/s) (275MB/s)
WRITE: (650MB/s) (275MB/s)
#jobs6
WRITE: (2907MB/s) (1331MB/s)
WRITE: (773MB/s) (299MB/s)
READ: (1733MB/s) (1010MB/s)
WRITE: (1734MB/s) (1011MB/s)
READ: (3010MB/s) (2465MB/s)
READ: (2715MB/s) (1772MB/s)
READ: (672MB/s) (280MB/s)
WRITE: (672MB/s) (280MB/s)
#jobs7
WRITE: (3098MB/s) (1526MB/s)
WRITE: (779MB/s) (299MB/s)
READ: (1800MB/s) (1151MB/s)
WRITE: (1800MB/s) (1151MB/s)
READ: (3294MB/s) (2765MB/s)
READ: (2861MB/s) (2031MB/s)
READ: (693MB/s) (283MB/s)
WRITE: (693MB/s) (283MB/s)
#jobs8
WRITE: (3425MB/s) (1739MB/s)
WRITE: (792MB/s) (302MB/s)
READ: (2003MB/s) (1289MB/s)
WRITE: (2004MB/s) (1289MB/s)
READ: (3901MB/s) (3218MB/s)
READ: (2954MB/s) (2297MB/s)
READ: (714MB/s) (285MB/s)
WRITE: (714MB/s) (286MB/s)
#jobs9
WRITE: (3744MB/s) (1944MB/s)
WRITE: (794MB/s) (303MB/s)
READ: (2038MB/s) (1400MB/s)
WRITE: (2040MB/s) (1401MB/s)
READ: (4159MB/s) (3583MB/s)
READ: (2963MB/s) (2425MB/s)
READ: (700MB/s) (286MB/s)
WRITE: (700MB/s) (286MB/s)
#jobs10
WRITE: (3863MB/s) (2122MB/s)
WRITE: (780MB/s) (304MB/s)
READ: (2059MB/s) (1551MB/s)
WRITE: (2060MB/s) (1552MB/s)
READ: (4573MB/s) (4004MB/s)
READ: (3025MB/s) (2271MB/s)
READ: (706MB/s) (287MB/s)
WRITE: (706MB/s) (287MB/s)

NO DEDUP DEDUP
jobs1 perfstat
stalled-cycles-frontend 86,881,019,525 ( 45.68%) 95,848,055,454 ( 44.71%)
stalled-cycles-backend 41,714,207,854 ( 21.93%) 44,958,946,213 ( 20.97%)
instructions 179,619,952,260 ( 0.94) 208,777,551,585 ( 0.97)
branches 40,332,931,254 ( 669.275) 46,951,257,852 ( 680.888)
branch-misses 211,977,694 ( 0.53%) 227,694,865 ( 0.48%)
jobs2 perfstat
stalled-cycles-frontend 126,409,074,631 ( 49.54%) 151,381,853,397 ( 50.04%)
stalled-cycles-backend 63,694,911,449 ( 24.96%) 75,662,875,621 ( 25.01%)
instructions 237,127,811,965 ( 0.93) 278,316,276,728 ( 0.92)
branches 51,822,635,738 ( 597.433) 60,986,851,952 ( 590.588)
branch-misses 315,755,016 ( 0.61%) 347,424,093 ( 0.57%)
jobs3 perfstat
stalled-cycles-frontend 191,089,532,462 ( 51.36%) 231,335,643,820 ( 52.60%)
stalled-cycles-backend 97,686,908,504 ( 26.26%) 120,989,629,180 ( 27.51%)
instructions 331,602,259,024 ( 0.89) 381,383,314,335 ( 0.87)
branches 72,970,225,738 ( 539.030) 84,708,904,219 ( 536.393)
branch-misses 507,931,782 ( 0.70%) 518,364,954 ( 0.61%)
jobs4 perfstat
stalled-cycles-frontend 223,445,892,011 ( 52.52%) 249,360,340,619 ( 53.35%)
stalled-cycles-backend 114,943,710,906 ( 27.02%) 128,993,039,366 ( 27.60%)
instructions 380,626,111,963 ( 0.89) 414,605,021,230 ( 0.89)
branches 82,445,041,251 ( 536.219) 89,259,825,600 ( 535.932)
branch-misses 583,497,374 ( 0.71%) 588,281,124 ( 0.66%)
jobs5 perfstat
stalled-cycles-frontend 263,619,571,807 ( 52.85%) 321,088,781,157 ( 54.21%)
stalled-cycles-backend 134,179,697,076 ( 26.90%) 167,682,863,875 ( 28.31%)
instructions 443,228,511,108 ( 0.89) 507,399,916,840 ( 0.86)
branches 95,466,677,594 ( 532.379) 110,214,238,221 ( 523.279)
branch-misses 672,890,143 ( 0.70%) 730,685,294 ( 0.66%)
jobs6 perfstat
stalled-cycles-frontend 291,216,196,229 ( 53.06%) 368,134,629,095 ( 54.77%)
stalled-cycles-backend 148,091,553,338 ( 26.98%) 196,277,284,992 ( 29.20%)
instructions 490,912,765,147 ( 0.89) 568,687,905,057 ( 0.85)
branches 105,042,602,744 ( 532.627) 123,581,871,771 ( 515.815)
branch-misses 714,275,025 ( 0.68%) 795,739,846 ( 0.64%)
jobs7 perfstat
stalled-cycles-frontend 330,653,507,547 ( 53.29%) 408,847,632,764 ( 54.82%)
stalled-cycles-backend 173,091,173,960 ( 27.90%) 225,113,359,025 ( 30.18%)
instructions 551,239,096,548 ( 0.89) 634,980,768,831 ( 0.85)
branches 117,955,044,019 ( 531.343) 137,597,472,747 ( 516.418)
branch-misses 803,168,054 ( 0.68%) 890,836,684 ( 0.65%)
jobs8 perfstat
stalled-cycles-frontend 356,648,221,633 ( 53.60%) 430,630,870,321 ( 54.78%)
stalled-cycles-backend 181,355,532,439 ( 27.26%) 225,833,853,276 ( 28.73%)
instructions 591,082,455,684 ( 0.89) 675,551,813,173 ( 0.86)
branches 125,997,333,678 ( 530.684) 145,611,739,589 ( 519.054)
branch-misses 823,592,583 ( 0.65%) 934,474,711 ( 0.64%)
jobs9 perfstat
stalled-cycles-frontend 390,346,479,419 ( 53.77%) 464,669,227,714 ( 54.83%)
stalled-cycles-backend 194,450,562,714 ( 26.79%) 231,567,413,546 ( 27.32%)
instructions 645,446,764,508 ( 0.89) 732,189,754,115 ( 0.86)
branches 137,219,009,187 ( 529.491) 157,387,961,718 ( 520.710)
branch-misses 898,414,437 ( 0.65%) 990,118,876 ( 0.63%)
jobs10 perfstat
stalled-cycles-frontend 424,813,625,460 ( 54.03%) 504,229,836,920 ( 54.93%)
stalled-cycles-backend 216,498,718,346 ( 27.54%) 264,019,747,607 ( 28.76%)
instructions 698,139,050,494 ( 0.89) 791,700,475,394 ( 0.86)
branches 148,117,364,023 ( 527.254) 170,053,726,692 ( 520.519)
branch-misses 962,807,047 ( 0.65%) 1,065,430,885 ( 0.63%)


NO DEDUP DEDUP
seconds elapsed 63.480344573 97.971439666
seconds elapsed 58.655823065 110.635292879
seconds elapsed 73.470677655 139.499601409
seconds elapsed 78.538327853 150.636241458
seconds elapsed 87.074751505 177.352050355
seconds elapsed 96.067528245 200.932336337
seconds elapsed 107.100552618 223.780631440
seconds elapsed 114.803603478 244.200975111
seconds elapsed 127.158686582 267.434811513
seconds elapsed 139.436204098 291.221754828

-ss

Sergey Senozhatsky

unread,
Apr 26, 2017, 12:30:05 AM4/26/17
to
On (04/26/17 09:52), js1...@gmail.com wrote:
[..]
> +struct zram_hash {
> + spinlock_t lock;
> + struct rb_root rb_root;
> };

just a note.

we can easily have N CPUs spinning on ->lock for __zram_dedup_get() lookup,
which can involve a potentially slow zcomp_decompress() [zlib, for example,
with 64k pages] and memcmp(). the larger PAGE_SHIFT is, the more serialized
IOs become. in theory, at least.

CPU0 CPU1 ... CPUN

__zram_bvec_write() __zram_bvec_write() __zram_bvec_write()
zram_dedup_find() zram_dedup_find() zram_dedup_find()
spin_lock(&hash->lock);
spin_lock(&hash->lock); spin_lock(&hash->lock);
__zram_dedup_get()
zcomp_decompress()
...


so maybe there is a way to use a read-write lock instead of a spinlock for the hash
and reduce write/read IO serialization.
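
a rough sketch of that idea, just to illustrate (hypothetical, not part of
the posted patches; it glosses over the entry->refcount update done under
the hash lock in zram_dedup_get(), which is what makes a plain rwlock swap
not a complete solution):

struct zram_hash {
	rwlock_t lock;			/* was: spinlock_t */
	struct rb_root rb_root;
};

/* lookup-only walk: parallel lookups no longer serialize on the hash */
static struct zram_entry *zram_dedup_lookup(struct zram_hash *hash,
					    u32 checksum)
{
	struct rb_node *rb_node;
	struct zram_entry *entry;

	read_lock(&hash->lock);
	rb_node = hash->rb_root.rb_node;
	while (rb_node) {
		entry = rb_entry(rb_node, struct zram_entry, rb_node);
		if (checksum == entry->checksum) {
			read_unlock(&hash->lock);
			return entry;	/* refcount/accounting omitted */
		}
		rb_node = checksum < entry->checksum ?
				rb_node->rb_left : rb_node->rb_right;
	}
	read_unlock(&hash->lock);
	return NULL;
}

insert/erase would take write_lock(&hash->lock), so writers still serialize.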

-ss

Joonsoo Kim

unread,
Apr 26, 2017, 2:00:05 AM4/26/17
to
On Wed, Apr 26, 2017 at 11:14:52AM +0900, Sergey Senozhatsky wrote:
> Hello,
>
> On (04/26/17 09:52), js1...@gmail.com wrote:
> [..]
> > ret = scnprintf(buf, PAGE_SIZE,
> > - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
> > + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
> > orig_size << PAGE_SHIFT,
> > (u64)atomic64_read(&zram->stats.compr_data_size),
> > mem_used << PAGE_SHIFT,
> > zram->limit_pages << PAGE_SHIFT,
> > max_used << PAGE_SHIFT,
> > (u64)atomic64_read(&zram->stats.same_pages),
> > - pool_stats.pages_compacted);
> > + pool_stats.pages_compacted,
> > + zram_dedup_dup_size(zram),
> > + zram_dedup_meta_size(zram));
>
> hm... should't we subtract zram_dedup_dup_size(zram) from
> ->stats.compr_data_size? we don't use extra memory for dedupped
> pages. or don't inc ->stats.compr_data_size for dedupped pages?

Hmm... My intention is to keep the previous stats as much as possible. Users
can notice the saving just by checking mem_used.

However, it's also odd that compr_data_size doesn't show the actual
compressed data size.

Minchan, what do you think about it?

Thanks.

Sergey Senozhatsky

unread,
Apr 26, 2017, 2:10:05 AM4/26/17
to
On (04/26/17 14:59), Joonsoo Kim wrote:
> We don't need an atomic operation for the refcount, but refcount_t uses one.
> The refcount_t API provides an additional guarantee that the refcount will
> not overflow, but I'm not sure that this overhead is needed here.

refcount_t has some additional debugging features which probably
could be helpful one day, but not necessarily needed in current
zram_dedup implementation, agree. asked just in case.

-ss

Joonsoo Kim

unread,
Apr 26, 2017, 2:10:06 AM4/26/17
to
On Wed, Apr 26, 2017 at 01:02:43PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 09:52), js1...@gmail.com wrote:
> [..]
> > <no-dedup>
> > Elapsed time: out/host: 88 s
> > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> >
> > <dedup>
> > Elapsed time: out/host: 100 s
> > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> >
> > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > it is due to overhead of calculating checksum and comparison.
>
> I like the patch set, and it makes sense. the benefit is, obviously,
> case-by-case. on my system I've managed to save just 60MB on a 2.7G
> data set, which is far less than I was hoping to save :)
>
>
> I usually do DIRECT IO fio performance test. JFYI, the results
> were as follows:

Could you share your fio test settings? I will try to regenerate the
results and analyze them.

I guess that the contention happens due to the same data page. Could you
check it?

Thanks.

Joonsoo Kim

unread,
Apr 26, 2017, 2:10:06 AM4/26/17
to
On Wed, Apr 26, 2017 at 11:37:18AM +0900, Sergey Senozhatsky wrote:
We don't need an atomic operation for the refcount, but refcount_t uses one.
The refcount_t API provides an additional guarantee that the refcount will
not overflow, but I'm not sure that this overhead is needed here.
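
For reference, a rough sketch of what the refcount_t variant would look
like (hypothetical, not a patch; the rb-tree manipulation would still need
hash->lock, so the atomic operations mostly add overhead here):

#include <linux/refcount.h>

struct zram_entry {
	struct rb_node rb_node;
	u32 len;
	u32 checksum;
	refcount_t refcount;		/* instead of "unsigned long" */
	unsigned long handle;
};

static void zram_entry_get(struct zram_entry *entry)
{
	refcount_inc(&entry->refcount);		/* atomic, saturating */
}

static bool zram_entry_put_last(struct zram_entry *entry)
{
	/* returns true for the last reference; warns on underflow */
	return refcount_dec_and_test(&entry->refcount);
}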

Thanks.

Joonsoo Kim

unread,
Apr 26, 2017, 2:10:07 AM4/26/17
to
In fact, dedup releases hash->lock before doing zcomp_decompress(), so
the above contention cannot happen.

However, contention is still possible when traversing the rb_tree. If
your fio shows that contention, I will change it to a read-write lock.

Thanks.

Sergey Senozhatsky

unread,
Apr 26, 2017, 2:20:05 AM4/26/17
to
On (04/26/17 15:08), Joonsoo Kim wrote:
> > > +struct zram_hash {
> > > + spinlock_t lock;
> > > + struct rb_root rb_root;
> > > };
> >
> > just a note.
> >
> > we can easily have N CPUs spinning on ->lock for __zram_dedup_get() lookup,
> > which can invole a potentially slow zcomp_decompress() [zlib, for example,
> > with 64k pages] and memcmp(). the larger PAGE_SHIFT is, the more serialized
> > IOs become. in theory, at least.
> >
> > CPU0 CPU1 ... CPUN
> >
> > __zram_bvec_write() __zram_bvec_write() __zram_bvec_write()
> > zram_dedup_find() zram_dedup_find() zram_dedup_find()
> > spin_lock(&hash->lock);
> > spin_lock(&hash->lock); spin_lock(&hash->lock);
> > __zram_dedup_get()
> > zcomp_decompress()
> > ...
> >
> >
> > so may be there is a way to use read-write lock instead on spinlock for hash
> > and reduce write/read IO serialization.
>
> In fact, dedup releases hash->lock before doing zcomp_decompress(), so
> the above contention cannot happen.

oh, my bad. you are right. somehow I didn't spot the unlock.

-ss

Sergey Senozhatsky

unread,
Apr 26, 2017, 2:30:05 AM4/26/17
to
sure.

I think I used this one: https://github.com/sergey-senozhatsky/zram-perf-test

// hm... may be slightly modified on my box.

I'll run more tests.



what I did:

#0
ZRAM_SIZE=2G ZRAM_COMP_ALG=lzo LOG_SUFFIX=NO-DEDUP FIO_LOOPS=2 ./zram-fio-test.sh

#1
add `echo 1 > /sys/block/zram0/use_dedup` to create_zram
ZRAM_SIZE=2G ZRAM_COMP_ALG=lzo LOG_SUFFIX=DEDUP FIO_LOOPS=2 ./zram-fio-test.sh


both in ./conf/fio-template-static-buffer fio config.

and then

#2
./fio-perf-o-meter.sh /tmp/test-fio-zram-NO-DEDUP /tmp/test-fio-zram-DEDUP > /tmp/RES

-ss

Minchan Kim

unread,
Apr 26, 2017, 2:30:05 AM4/26/17
to
Hi Sergey and Joonsoo,
Actually, I found it during the last review cycle but intentionally didn't
mention it, because it is also odd to me that pages_stored isn't
increased for same_pages, so I thought we could fix it all.

I mean:

* normal page
inc pages_stored
inc compr_data_size
* same_page
inc pages_stored
inc same_pages
* dedup_page
inc pages_stored
inc dup_data_size

IOW, pages_stored should be increased for every write IO.
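
Roughly, as a sketch of the idea only (the enum and helper below are made
up for illustration, this is not a patch):

enum zram_wr_kind { ZRAM_WR_NORMAL, ZRAM_WR_SAME, ZRAM_WR_DEDUP };

static void zram_account_write(struct zram *zram, enum zram_wr_kind kind,
			       unsigned int comp_len)
{
	atomic64_inc(&zram->stats.pages_stored);	/* every write IO */

	switch (kind) {
	case ZRAM_WR_NORMAL:
		atomic64_add(comp_len, &zram->stats.compr_data_size);
		break;
	case ZRAM_WR_SAME:
		atomic64_inc(&zram->stats.same_pages);
		break;
	case ZRAM_WR_DEDUP:
		atomic64_add(comp_len, &zram->stats.dup_data_size);
		break;
	}
}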
But the concern is that we have said in zram.txt:

orig_data_size uncompressed size of data stored in this disk.
This excludes same-element-filled pages (same_pages) since
no memory is allocated for them.

So, we might be too late. :-(
What do you think about it?
If anyone doesn't have any objection, I want to correct it all.

Thanks.

Joonsoo Kim

unread,
Apr 26, 2017, 3:10:06 AM4/26/17
to
I have no objection.
If so, do I need to postpone this patchset until others are fixed?

Thanks.

Sergey Senozhatsky

unread,
Apr 26, 2017, 3:20:06 AM4/26/17
to
On (04/26/17 15:29), Minchan Kim wrote:
[..]
> Actually, I found it for the last review cycle but didn't say that
> intentionally. Because it is also odd to me that pages_stored isn't
> increased for same_pages so I thought we can fix it all.
>
> I mean:
>
> * normal page
> inc pages_stored
> inc compr_data_size
> * same_page
> inc pages_stored
> inc same_pages
> * dedup_page
> inc pages_stored
> inc dup_data_size
>
> IOW, pages_stored should be increased for every write IO.

looks good to me.

> But the concern is we have said in zram.txt
>
> orig_data_size uncompressed size of data stored in this disk.
> This excludes same-element-filled pages (same_pages) since
> no memory is allocated for them.
>
> So, we might be too late. :-(
> What do you think about it?

personally, I don't think anyone cares :) the numbers are different
(and, apparently, not quite accurate) all the time. my point was that
stats were not represented in a very convenient way and I agree with
the proposed change.

-ss

Sergey Senozhatsky

unread,
Apr 26, 2017, 3:30:06 AM4/26/17
to
On (04/26/17 15:59), Joonsoo Kim wrote:
[..]
> > Actually, I found it for the last review cycle but didn't say that
> > intentionally. Because it is also odd to me that pages_stored isn't
> > increased for same_pages so I thought we can fix it all.
> >
> > I mean:
> >
> > * normal page
> > inc pages_stored
> > inc compr_data_size
> > * same_page
> > inc pages_stored
> > inc same_pages
> > * dedup_page
> > inc pages_stored
> > inc dup_data_size
> >
> > IOW, pages_stored should be increased for every write IO.
> > But the concern is we have said in zram.txt
> >
> > orig_data_size uncompressed size of data stored in this disk.
> > This excludes same-element-filled pages (same_pages) since
> > no memory is allocated for them.
> >
> > So, we might be too late. :-(
> > What do you think about it?
> > If anyone doesn't have any objection, I want to correct it all.
>
> I have no objection.
> If so, do I need to postpone this patchset until others are fixed?

this probably will mess with your series a lot. so I don't mind if you or
Minchan send a stats-fixup patch after the dedup series, maybe/preferably
as the last patch in the series. but if you or Minchan want to fix the stats
first, then I wouldn't mind either. I just don't make a big deal out of those
stats, a bunch of fun-to-know numbers. my 5 cents.

-ss

Minchan Kim

unread,
Apr 26, 2017, 3:50:05 AM4/26/17
to
After Andrew takes the dedup patchset, I will fix it later.
Thanks.

Joonsoo Kim

unread,
Apr 27, 2017, 3:00:07 AM4/27/17
to
On Wed, Apr 26, 2017 at 03:21:04PM +0900, Sergey Senozhatsky wrote:
> On (04/26/17 15:04), Joonsoo Kim wrote:
> > On Wed, Apr 26, 2017 at 01:02:43PM +0900, Sergey Senozhatsky wrote:
> > > On (04/26/17 09:52), js1...@gmail.com wrote:
> > > [..]
> > > > <no-dedup>
> > > > Elapsed time: out/host: 88 s
> > > > mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0
> > > >
> > > > <dedup>
> > > > Elapsed time: out/host: 100 s
> > > > mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336
> > > >
> > > > It shows performance degradation roughly 13% and save 24% memory. Maybe,
> > > > it is due to overhead of calculating checksum and comparison.
> > >
> > > I like the patch set, and it makes sense. the benefit is, obviously,
> > > case-by-case. on my system I've managed to save just 60MB on a 2.7G
> > > data set, which is far less than I was hoping to save :)
> > >
> > >
> > > I usually do DIRECT IO fio performance test. JFYI, the results
> > > were as follows:
> >
> > Could you share your fio test setting? I will try to re-generate the
> > result and analyze it.
>
> sure.
>
> I think I used this one: https://github.com/sergey-senozhatsky/zram-perf-test
>
> // hm... may be slightly modified on my box.
>
> I'll run more tests.
>
>

Hello,

I tested with your benchmark and found that contention happens
because every data page is exactly the same. All the written data (2GB)
is de-duplicated.

I tried to optimize it with a read-write lock but failed, since
there is another contention point which cannot be fixed simply: zsmalloc.
We need to map the object and compare the content of the
compressed page to check for duplication. Zsmalloc pins the object
with a bit spinlock when mapping, so parallel readers of the same
object contend there.
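
To spell out the sequence that every writer of an identical page goes
through in this test (a simplified sketch, not the exact code from the
series):

        checksum = zram_dedup_checksum(mem);    /* jhash over the whole page */

        /* first serialization point: per-bucket lock while taking a ref */
        spin_lock(&hash->lock);
        entry->refcount++;
        spin_unlock(&hash->lock);

        /*
         * second serialization point: zs_map_object() pins the object with
         * a bit spinlock, so all the parallel writers mapping the same
         * deduplicated object queue up here while each one memcmp()s or
         * decompresses the data.
         */
        cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
        match = !memcmp(mem, cmem, PAGE_SIZE);  /* or decompress + memcmp */
        zs_unmap_object(zram->mem_pool, entry->handle);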

I think this case is quite artificial; in practice, there would be
no case where the same data page is repeatedly written in parallel
like this. So, I'd like to keep the current code. What do you
think about it, Sergey?

Just note, if we do parallel reads (direct-io) to the same offset,
the zsmalloc contention would happen regardless of the deduplication
feature. It seems to be a fundamental issue in zsmalloc.

Thanks.

Sergey Senozhatsky

unread,
Apr 27, 2017, 3:50:08 AM4/27/17
to
Hello,

On (04/27/17 15:57), Joonsoo Kim wrote:
[..]
> I tested with your benchmark and found that contention happens
> since the data page is perfectly the same. All the written data (2GB)
> is de-duplicated.

yes, a statically filled buffer to guarantee that
compression/decompression numbers/impact will be stable.
otherwise the test results are "apples vs oranges" :)

> I tried to optimize it with read-write lock but I failed since
> there is another contention, which cannot be fixed simply. That is
> zsmalloc. We need to map the object and compare the content of the
> compressed page to check de-duplication. Zsmalloc pins the object
> by using bit spinlock when mapping. So, parallel readers to the same
> object contend here.
>
> I think that this case is so artificial and, in practice, there
> would be no case that the same data page is repeatedly and parallel
> written as like this. So, I'd like to keep current code. How do you
> think about it, Sergey?

I agree. thanks for taking a look!
I see no blockers for the patch set.


<off topic>

ok, in general, seems that (correct me if I'm wrong)

a) the higher the dedup ratio the slower zram _can_ perform.

because dedup can create parallel access scenarios where none previously
existed: writes to different offsets can now compete for the same dedupped
zsmalloc object.

and... tricky and probably over exaggerated

b) the lower the dedup ratio the slower zram _can_ perform.

think of an almost full zram device with a dedup ratio of just 3-5%. tree lookups
are serialized by the hash->lock. a balanced tree gives us slow lookup
complexity growth, it's still there but we can live with it. at the same time
a low dedup ratio means that we have wasted CPU cycles on checksum calculation
(potentially for millions of pages if the zram device in question is X gigabytes
in size -- back of the envelope, a 4G device with 4K pages is ~1 million
full-page jhash calls), and this can't go unnoticed.


it's just I was slightly confused by the performance numbers that you
have observed. some tests were

: It shows performance degradation roughly 13% and save 24% memory. Maybe,
: it is due to overhead of calculating checksum and comparison.

while others were

: There is no performance degradation and save 23% memory.


I understand that you didn't perform direct io, flush, fsync, etc. and
there is a whole bunch of factors that could have affected your tests,
e.g. write back, etc. etc. but the numbers are still very unstable.
maybe now we will have a bit better understanding :)

</off topic>

> Just note, if we do parallel read (direct-io) to the same offset,
> zsmalloc contention would happen regardless deduplication feature.
> It seems that it's fundamental issue in zsmalloc.

that's a good find.

-ss

Sergey Senozhatsky

unread,
Apr 27, 2017, 4:00:06 AM4/27/17
to
On (04/26/17 09:52), js1...@gmail.com wrote:
> From: Joonsoo Kim <iamjoon...@lge.com>
>
> Changes from v3
> o fix module build problem
> o make zram_dedup_enabled() statically return false if CONFIG_ZRAM_DEDUP=n

the entire patch set

Reviewed-by: Sergey Senozhatsky <sergey.se...@gmail.com>

-ss

Joonsoo Kim

unread,
May 2, 2017, 1:40:05 AM5/2/17
to
On Thu, Apr 27, 2017 at 04:46:22PM +0900, Sergey Senozhatsky wrote:
> Hello,
>
> On (04/27/17 15:57), Joonsoo Kim wrote:
> [..]
> > I tested with your benchmark and found that contention happens
> > since the data page is perfectly the same. All the written data (2GB)
> > is de-duplicated.
>
> yes, a statically filled buffer to guarantee that
> compression/decompression numbers/impact will be stable.
> otherwise the test results are "apples vs oranges" :)

Yes, but we can maintain a buffer set, and using it in the test will
ensure an "apples vs apples" comparison while being a less artificial
test. A compression algorithm's effect also cannot be measured with a
statically filled buffer.
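
For example, a tiny generator along these lines (just an illustration; the
ratios, names and sizes are arbitrary and it is not part of the series)
would emit the same mixed data set on every run and could be piped onto the
zram device:

/* gen_dataset.c - illustration only, not part of this series.
 * Writes a reproducible 2GB mix of duplicate / compressible / random
 * pages to stdout, e.g.:
 *     ./gen_dataset | dd of=/dev/zram0 bs=1M oflag=direct
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ         4096
#define NR_PAGES        (512UL * 1024)          /* 2GB / 4KB */
#define NR_TEMPLATES    8                       /* duplicates copy one of these */

static unsigned char templates[NR_TEMPLATES][PAGE_SZ];

static void gen_page(unsigned char *page, unsigned long idx)
{
        int kind = rand() % 10;

        if (kind < 2) {
                /* ~20%: exact duplicates of a few template pages (dedup-able) */
                memcpy(page, templates[rand() % NR_TEMPLATES], PAGE_SZ);
        } else if (kind < 6) {
                /* ~40%: highly compressible but unique per page */
                memcpy(page, &idx, sizeof(idx));
                for (int j = sizeof(idx); j < PAGE_SZ; j++)
                        page[j] = (unsigned char)(j % 61);
        } else {
                /* ~40%: pseudo-random, hard to compress and to dedup */
                for (int j = 0; j < PAGE_SZ; j++)
                        page[j] = (unsigned char)rand();
        }
}

int main(void)
{
        static unsigned char page[PAGE_SZ];

        srand(1234);    /* fixed seed: identical data set on every run */
        for (int t = 0; t < NR_TEMPLATES; t++)
                for (int j = 0; j < PAGE_SZ; j++)
                        templates[t][j] = (unsigned char)rand();

        for (unsigned long i = 0; i < NR_PAGES; i++) {
                gen_page(page, i);
                if (fwrite(page, 1, PAGE_SZ, stdout) != PAGE_SZ)
                        return 1;
        }
        return 0;
}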

>
> > I tried to optimize it with read-write lock but I failed since
> > there is another contention, which cannot be fixed simply. That is
> > zsmalloc. We need to map the object and compare the content of the
> > compressed page to check de-duplication. Zsmalloc pins the object
> > by using bit spinlock when mapping. So, parallel readers to the same
> > object contend here.
> >
> > I think that this case is so artificial and, in practice, there
> > would be no case that the same data page is repeatedly and parallel
> > written as like this. So, I'd like to keep current code. How do you
> > think about it, Sergey?
>
> I agree. thanks for taking a look!
> I see no blockers for the patch set.

Thanks!

>
>
> <off topic>
>
> ok, in general, seems that (correct me if I'm wrong)
>
> a) the higher the dedup ratio the slower zram _can_ perform.
>
> because dedup can create parallel access scenarios where they previously
> never existed: different offset writes now can compete for the same dedupped
> zsmalloc object.

However, a higher dedup ratio doesn't necessarily mean that all
the data is the same. The important thing is the distribution of the data
rather than the dedup ratio. And parallelism is also important.

> and... tricky and probably over exaggerated
>
> b) the lower the dedup ratio the slower zram _can_ perform.
>
> think of almost full zram device with dedup ratio of just 3-5%. tree lookups
> are serialized by the hash->lock. a balanced tree gives us slow lookup
> complexity growth, it's still there but can leave with it. at the same time
> low dedup ratio means that we have wasted CPU cycles on checksum calculation
> (potentially for millions of pages if zram device in question is X gigabytes
> in size), this can't go unnoticed.
>
>
> it's just I was slightly confused by the performance numbers that you
> have observed. some tests were
>
> : It shows performance degradation roughly 13% and save 24% memory. Maybe,
> : it is due to overhead of calculating checksum and comparison.
>
> while others were
>
> : There is no performance degradation and save 23% memory.

As I said above, the dedup ratio itself doesn't say everything. And they are
quite different tests (kernel build vs file copy, multi-thread vs
single-thread), so I cannot conclude anything from the fact that their
saving ratios are similar. :)

>
> I understand that you didn't perform direct io, flush, fsync, etc. and
> there is a whole bunch of factors that could have affected your tests,
> e.g. write back, etc. etc. but the numbers are still very unstable.
> may be now we will have a bit better understanding :)

I also hope so.

Thanks.

js1...@gmail.com

unread,
May 11, 2017, 10:40:05 PM5/11/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Until now, we compare just one entry with the same checksum when
checking for duplication, since that is the simplest way to implement it.
However, for completeness, checking all the entries is better, so this
patch implements comparing all the entries with the same checksum.
Since this case would be rare, there should be no performance loss.

Note that zram_dedup_insert() descends to the left on an equal checksum,
so all entries with the same checksum are adjacent in the tree's in-order
sequence: walking rb_prev() from any matching node reaches the left-most
one, and rb_next() then visits the remaining candidates until the
checksum changes.

Reviewed-by: Sergey Senozhatsky <sergey.se...@gmail.com>
Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_dedup.c | 59 ++++++++++++++++++++++++++++++++---------
1 file changed, 47 insertions(+), 12 deletions(-)

diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index 560b1f5..14c4988 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -109,6 +109,51 @@ static unsigned long zram_dedup_put(struct zram *zram,
return entry->refcount;
}

+static struct zram_entry *__zram_dedup_get(struct zram *zram,
+ struct zram_hash *hash, unsigned char *mem,
+ struct zram_entry *entry)
+{
+ struct zram_entry *tmp, *prev = NULL;
+ struct rb_node *rb_node;
+
+ /* find left-most entry with same checksum */
+ while ((rb_node = rb_prev(&entry->rb_node))) {
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (tmp->checksum != entry->checksum)
+ break;
+
+ entry = tmp;
+ }
+
+again:
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (prev)
+ zram_entry_free(zram, prev);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ spin_lock(&hash->lock);
+ tmp = NULL;
+ rb_node = rb_next(&entry->rb_node);
+ if (rb_node)
+ tmp = rb_entry(rb_node, struct zram_entry, rb_node);
+
+ if (tmp && (tmp->checksum == entry->checksum)) {
+ prev = entry;
+ entry = tmp;
+ goto again;
+ }
+
+ spin_unlock(&hash->lock);
+ zram_entry_free(zram, entry);
+
+ return NULL;
+}
+
static struct zram_entry *zram_dedup_get(struct zram *zram,
unsigned char *mem, u32 checksum)
{
@@ -122,18 +167,8 @@ static struct zram_entry *zram_dedup_get(struct zram *zram,
rb_node = hash->rb_root.rb_node;
while (rb_node) {
entry = rb_entry(rb_node, struct zram_entry, rb_node);
- if (checksum == entry->checksum) {
- entry->refcount++;
- atomic64_add(entry->len, &zram->stats.dup_data_size);
- spin_unlock(&hash->lock);
-
- if (zram_dedup_match(zram, entry, mem))
- return entry;
-
- zram_entry_free(zram, entry);
-
- return NULL;
- }
+ if (checksum == entry->checksum)

js1...@gmail.com

unread,
May 11, 2017, 10:40:05 PM5/11/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The following patch will implement deduplication functionality
in zram, and it requires an indirection layer to manage
the life cycle of the zsmalloc handle. To prepare for that, this patch
introduces zram_entry, which can be used to manage the life cycle
of the zsmalloc handle. Many lines are changed due to the rename, but the
core change is just the simple introduction of the new data structure.

Reviewed-by: Sergey Senozhatsky <sergey.se...@gmail.com>
Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
drivers/block/zram/zram_drv.c | 95 +++++++++++++++++++++++++++----------------
drivers/block/zram/zram_drv.h | 6 ++-
2 files changed, 66 insertions(+), 35 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index debee95..26dc4e5 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -57,14 +57,15 @@ static inline struct zram *dev_to_zram(struct device *dev)
return (struct zram *)dev_to_disk(dev)->private_data;
}

-static unsigned long zram_get_handle(struct zram *zram, u32 index)
+static struct zram_entry *zram_get_entry(struct zram *zram, u32 index)
{
- return zram->table[index].handle;
+ return zram->table[index].entry;
}

-static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+static void zram_set_entry(struct zram *zram, u32 index,
+ struct zram_entry *entry)
{
- zram->table[index].handle = handle;
+ zram->table[index].entry = entry;
}

/* flag operations require table entry bit_spin_lock() being held */
@@ -437,7 +438,7 @@ static bool zram_same_page_read(struct zram *zram, u32 index,
unsigned int offset, unsigned int len)
{
zram_slot_lock(zram, index);
- if (unlikely(!zram_get_handle(zram, index) ||
+ if (unlikely(!zram_get_entry(zram, index) ||
zram_test_flag(zram, index, ZRAM_SAME))) {
void *mem;

@@ -476,6 +477,32 @@ static bool zram_same_page_write(struct zram *zram, u32 index,
return false;
}

+static struct zram_entry *zram_entry_alloc(struct zram *zram,
+ unsigned int len, gfp_t flags)
+{
+ struct zram_entry *entry;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry)
+ return NULL;
+
+ entry->handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!entry->handle) {
+ kfree(entry);
+ return NULL;
+ }
+
+ return entry;
+}
+
+static inline void zram_entry_free(struct zram *zram,
+ struct zram_entry *entry)
+{
+ zs_free(zram->mem_pool, entry->handle);
+ kfree(entry);
+}
+
static void zram_meta_free(struct zram *zram, u64 disksize)
+ zs_unmap_object(zram->mem_pool, entry->handle);
static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
{
int ret;
- unsigned long handle;
+ struct zram_entry *entry;
unsigned int comp_len;
void *src, *dst;
struct zcomp_strm *zstrm;
@@ -693,13 +720,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
return 0;

zstrm = zcomp_stream_get(zram->comp);
- ret = zram_compress(zram, &zstrm, page, &handle, &comp_len);
+ ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
if (ret) {
zcomp_stream_put(zram->comp);
return ret;
}

- dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);

src = zstrm->buffer;
if (comp_len == PAGE_SIZE)
@@ -709,7 +736,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
kunmap_atomic(src);

zcomp_stream_put(zram->comp);
- zs_unmap_object(zram->mem_pool, handle);
+ zs_unmap_object(zram->mem_pool, entry->handle);

/*
* Free memory associated with this sector
@@ -717,7 +744,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
*/
zram_slot_lock(zram, index);
zram_free_page(zram, index);
- zram_set_handle(zram, index, handle);
+ zram_set_entry(zram, index, entry);
zram_set_obj_size(zram, index, comp_len);
zram_slot_unlock(zram, index);

diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index e34e44d..fe3d216 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -69,10 +69,14 @@ enum zram_pageflags {

/*-- Data structures */

js1...@gmail.com

unread,
May 11, 2017, 10:40:05 PM5/11/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

Changes from v4
o rebase on next-20170511
o collect Reviewed-by from Sergey

Changes from v3
o fix module build problem
o make zram_dedup_enabled() statically return false if CONFIG_ZRAM_DEDUP=n

Changes from v2
o rebase to latest zram code
o manage alloc/free of the zram_entry in zram_drv.c
o remove useless RB_CLEAR_NODE
o set RO permission for use_dedup sysfs entry if CONFIG_ZRAM_DEDUP=n

Changes from v1
o reorganize dedup-specific functions
o support Kconfig on/off
o update zram documents
o compare all the entries with same checksum (patch #4)

This patchset implements a deduplication feature in zram. The motivation
is to reduce the memory used by zram. There is a detailed description
of the motivation and experimental results in patch #2, so please
refer to it.

Thanks.

Joonsoo Kim (4):
zram: introduce zram_entry to prepare dedup functionality
zram: implement deduplication in zram
zram: make deduplication feature optional
zram: compare all the entries with same checksum for deduplication

Documentation/ABI/testing/sysfs-block-zram | 10 ++
Documentation/blockdev/zram.txt | 3 +
drivers/block/zram/Kconfig | 14 ++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 254 +++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 45 +++++
drivers/block/zram/zram_drv.c | 184 +++++++++++++++++----
drivers/block/zram/zram_drv.h | 32 +++-
8 files changed, 507 insertions(+), 38 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

--
2.7.4

js1...@gmail.com

unread,
May 11, 2017, 10:40:05 PM5/11/17
to
From: Joonsoo Kim <iamjoon...@lge.com>

The benefit of deduplication is dependent on the workload, so it's not
preferable to always enable it. Therefore, make it optional via Kconfig
and a device parameter. The default is 'off'. This option will be beneficial
for users who use zram as a block device and store build output on it.

Reviewed-by: Sergey Senozhatsky <sergey.se...@gmail.com>
Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/ABI/testing/sysfs-block-zram | 10 ++++
Documentation/blockdev/zram.txt | 1 +
drivers/block/zram/Kconfig | 14 ++++++
drivers/block/zram/Makefile | 3 +-
drivers/block/zram/zram_dedup.c | 15 ++++++
drivers/block/zram/zram_dedup.h | 23 +++++++++
drivers/block/zram/zram_drv.c | 81 ++++++++++++++++++++++++++----
drivers/block/zram/zram_drv.h | 9 ++++
8 files changed, 144 insertions(+), 12 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 451b6d8..3c1945f 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -90,3 +90,13 @@ Description:
device's debugging info useful for kernel developers. Its
format is not documented intentionally and may change
anytime without any notice.
+
+What: /sys/block/zram<id>/use_dedup
+Date: March 2017
+Contact: Joonsoo Kim <iamjoon...@lge.com>
+Description:
+ The use_dedup file is read-write and specifies whether the
+ deduplication feature is used or not. If enabled, duplicated data is
+ managed by reference count and will not be stored in memory
+ twice. The benefit of this feature largely depends on the workload,
+ so use it with care.
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 2cdc303..cbbe39b 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -168,6 +168,7 @@ max_comp_streams RW the number of possible concurrent compress operations
comp_algorithm RW show and change the compression algorithm
compact WO trigger memory compaction
debug_stat RO this file is used for zram debugging purposes
+use_dedup RW show and set deduplication feature


User space is advised to use the following files to read the device statistics.
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6..2f3dd1f 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,17 @@ config ZRAM
disks and maybe many more.

See zram.txt for more information.
+
+config ZRAM_DEDUP
+ bool "Deduplication support for ZRAM data"
+ depends on ZRAM
+ default n
+ help
+ Deduplicate ZRAM data to reduce the amount of memory consumed.
+ The advantage largely depends on the workload. In some cases, this
+ option reduces memory usage by half. However, if there is no
+ duplicated data, memory consumption would increase due to the
+ additional metadata usage. There is also a computation time
+ trade-off. Please check the benefit before enabling this option.
+ Experiments show a positive effect when zram is used as a block
+ device that stores build output.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 29cb008..d7204ef 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,4 @@
-zram-y := zcomp.o zram_drv.o zram_dedup.o
+zram-y := zcomp.o zram_drv.o
+zram-$(CONFIG_ZRAM_DEDUP) += zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
index a8427f7..560b1f5 100644
--- a/drivers/block/zram/zram_dedup.c
+++ b/drivers/block/zram/zram_dedup.c
@@ -41,6 +41,9 @@ void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
struct rb_node **rb_node, *parent = NULL;
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram))
+ return;
+
new->checksum = checksum;
hash = &zram->hash[checksum % zram->hash_size];
rb_root = &hash->rb_root;
@@ -148,6 +151,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
void *mem;
struct zram_entry *entry;

+ if (!zram_dedup_enabled(zram))
+ return NULL;
+
mem = kmap_atomic(page);
*checksum = zram_dedup_checksum(mem);

@@ -160,6 +166,9 @@ struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
unsigned long handle, unsigned int len)
{
+ if (!zram_dedup_enabled(zram))
+ return;
+
entry->handle = handle;
entry->refcount = 1;
entry->len = len;
@@ -167,6 +176,9 @@ void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,

bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
{
+ if (!zram_dedup_enabled(zram))
+ return true;
+
if (zram_dedup_put(zram, entry))
return false;

@@ -178,6 +190,9 @@ int zram_dedup_init(struct zram *zram, size_t num_pages)
int i;
struct zram_hash *hash;

+ if (!zram_dedup_enabled(zram))
+ return 0;
+
zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
zram->hash_size = min_t(size_t, ZRAM_HASH_SIZE_MAX, zram->hash_size);
zram->hash_size = max_t(size_t, ZRAM_HASH_SIZE_MIN, zram->hash_size);
diff --git a/drivers/block/zram/zram_dedup.h b/drivers/block/zram/zram_dedup.h
index ebe6bff..8ab267b 100644
--- a/drivers/block/zram/zram_dedup.h
+++ b/drivers/block/zram/zram_dedup.h
@@ -4,6 +4,8 @@
struct zram;
struct zram_entry;

+#ifdef CONFIG_ZRAM_DEDUP
+
u64 zram_dedup_dup_size(struct zram *zram);
u64 zram_dedup_meta_size(struct zram *zram);

@@ -18,5 +20,26 @@ bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);

int zram_dedup_init(struct zram *zram, size_t num_pages);
void zram_dedup_fini(struct zram *zram);
+#else
+
+static inline u64 zram_dedup_dup_size(struct zram *zram) { return 0; }
+static inline u64 zram_dedup_meta_size(struct zram *zram) { return 0; }
+
+static inline void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum) { }
+static inline struct zram_entry *zram_dedup_find(struct zram *zram,
+ struct page *page, u32 *checksum) { return NULL; }
+
+static inline void zram_dedup_init_entry(struct zram *zram,
+ struct zram_entry *entry, unsigned long handle,
+ unsigned int len) { }
+static inline bool zram_dedup_put_entry(struct zram *zram,
+ struct zram_entry *entry) { return true; }
+
+static inline int zram_dedup_init(struct zram *zram,
+ size_t num_pages) { return 0; }
+static inline void zram_dedup_fini(struct zram *zram) { }
+
+#endif

#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8eab8a0..372602c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -333,6 +333,41 @@ static ssize_t comp_algorithm_store(struct device *dev,
return len;
}

+static ssize_t use_dedup_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ bool val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->use_dedup;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (int)val);
+}
+
+#ifdef CONFIG_ZRAM_DEDUP
+static ssize_t use_dedup_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ if (kstrtoint(buf, 10, &val) || (val != 0 && val != 1))
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change dedup usage for initialized device\n");
+ return -EBUSY;
+ }
+ zram->use_dedup = val;
+ up_write(&zram->init_lock);
+ return len;
+}
+#endif
+
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
@@ -425,6 +460,15 @@ static DEVICE_ATTR_RO(io_stat);
static DEVICE_ATTR_RO(mm_stat);
static DEVICE_ATTR_RO(debug_stat);

+static unsigned long zram_entry_handle(struct zram *zram,
+ struct zram_entry *entry)
+{
+ if (zram_dedup_enabled(zram))
+ return entry->handle;
+ else
+ return (unsigned long)entry;
+}
+
static void zram_slot_lock(struct zram *zram, u32 index)
{
bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value);
@@ -485,14 +529,17 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
struct zram_entry *entry;
unsigned long handle;

- entry = kzalloc(sizeof(*entry),
- flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
- if (!entry)
+ handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!handle)
return NULL;

- handle = zs_malloc(zram->mem_pool, len, flags);
- if (!handle) {
- kfree(entry);
+ if (!zram_dedup_enabled(zram))
+ return (struct zram_entry *)handle;
+
+ entry = kzalloc(sizeof(*entry),
+ flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+ if (!entry) {
+ zs_free(zram->mem_pool, handle);
return NULL;
}

@@ -507,7 +554,11 @@ void zram_entry_free(struct zram *zram, struct zram_entry *entry)
if (!zram_dedup_put_entry(zram, entry))
return;

- zs_free(zram->mem_pool, entry->handle);
+ zs_free(zram->mem_pool, zram_entry_handle(zram, entry));
+
+ if (!zram_dedup_enabled(zram))
+ return;
+
kfree(entry);

atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
@@ -598,7 +649,8 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
entry = zram_get_entry(zram, index);
size = zram_get_obj_size(zram, index);

- src = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+ src = zs_map_object(zram->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_RO);
if (size == PAGE_SIZE) {
dst = kmap_atomic(page);
memcpy(dst, src, PAGE_SIZE);
@@ -612,7 +664,7 @@ static int zram_decompress_page(struct zram *zram, struct page *page, u32 index)
kunmap_atomic(dst);
zcomp_stream_put(zram->comp);
}
- zs_unmap_object(zram->mem_pool, entry->handle);
+ zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
zram_slot_unlock(zram, index);

/* Should NEVER happen. Return bio error if it does. */
@@ -750,7 +802,8 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
return ret;
}

- dst = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_WO);
+ dst = zs_map_object(zram->mem_pool,
+ zram_entry_handle(zram, entry), ZS_MM_WO);

src = zstrm->buffer;
if (comp_len == PAGE_SIZE)
@@ -760,7 +813,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
kunmap_atomic(src);

zcomp_stream_put(zram->comp);
- zs_unmap_object(zram->mem_pool, entry->handle);
+ zs_unmap_object(zram->mem_pool, zram_entry_handle(zram, entry));
zram_dedup_insert(zram, entry, checksum);

found_dup:
@@ -1159,6 +1212,11 @@ static DEVICE_ATTR_WO(mem_limit);
static DEVICE_ATTR_WO(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_DEDUP
+static DEVICE_ATTR_RW(use_dedup);
+#else
+static DEVICE_ATTR_RO(use_dedup);
+#endif

static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1169,6 +1227,7 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+ &dev_attr_use_dedup.attr,
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 4b86921..0091e23 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,7 +134,16 @@ struct zram {
* zram is claimed so open request will be failed
*/
bool claim; /* Protected by bdev->bd_mutex */
+ bool use_dedup;
};

+static inline bool zram_dedup_enabled(struct zram *zram)
+{
+#ifdef CONFIG_ZRAM_DEDUP
+ return zram->use_dedup;
+#else
+ return false;
+#endif
+}

js1...@gmail.com

unread,
May 11, 2017, 10:40:06 PM5/11/17
to
mm_stat: 430845952 191014886 196898816 0 196898816 28320 0 0 0

<dedup>
Elapsed time: 250 s
mm_stat: 430505984 190971334 148365312 0 148365312 28404 0 47287038 3945792

There is no performance degradation, and it saves 23% memory (the last two
mm_stat columns are the new dup_data_size and meta_data_size counters).

Test result #3 (Blockdev):
copy android build output dir(out/host) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 88 s
mm_stat: 8834420736 3658184579 3834208256 0 3834208256 32889 0 0 0

<dedup>
Elapsed time: out/host: 100 s
mm_stat: 8832929792 3657329322 2832015360 0 2832015360 32609 0 952568877 80880336

It shows roughly 13% performance degradation and saves 24% memory. Maybe
it is due to the overhead of checksum calculation and comparison.

Test result #4 (Blockdev):
copy android build output dir(out/target/common) to ext4 FS on zram

<no-dedup>
Elapsed time: out/host: 203 s
mm_stat: 4041678848 2310355010 2346577920 0 2346582016 500 4 0 0

<dedup>
Reviewed-by: Sergey Senozhatsky <sergey.se...@gmail.com>
Acked-by: Minchan Kim <min...@kernel.org>
Signed-off-by: Joonsoo Kim <iamjoon...@lge.com>
---
Documentation/blockdev/zram.txt | 2 +
drivers/block/zram/Makefile | 2 +-
drivers/block/zram/zram_dedup.c | 204 ++++++++++++++++++++++++++++++++++++++++
drivers/block/zram/zram_dedup.h | 22 +++++
drivers/block/zram/zram_drv.c | 38 ++++++--
drivers/block/zram/zram_drv.h | 17 ++++
6 files changed, 278 insertions(+), 7 deletions(-)
create mode 100644 drivers/block/zram/zram_dedup.c
create mode 100644 drivers/block/zram/zram_dedup.h

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 4fced8a..2cdc303 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -217,6 +217,8 @@ line of text and contains the following stats separated by whitespace:
same_pages the number of same element filled pages written to this disk.
No memory is allocated for such pages.
pages_compacted the number of pages freed during compaction
+ dup_data_size deduplicated data size
+ meta_data_size the amount of metadata allocated for deduplication feature

9) Deactivate:
swapoff /dev/zram0
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index 9e2b79e..29cb008 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,3 +1,3 @@
-zram-y := zcomp.o zram_drv.o
+zram-y := zcomp.o zram_drv.o zram_dedup.o

obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zram_dedup.c b/drivers/block/zram/zram_dedup.c
+void zram_dedup_insert(struct zram *zram, struct zram_entry *new,
+ u32 checksum)
+{
+ struct zram_hash *hash;
+ struct rb_root *rb_root;
+ struct rb_node **rb_node, *parent = NULL;
+ struct zram_entry *entry;
+
+ new->checksum = checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+ rb_root = &hash->rb_root;
+
+ spin_lock(&hash->lock);
+ rb_node = &rb_root->rb_node;
+ while (*rb_node) {
+ parent = *rb_node;
+ entry = rb_entry(parent, struct zram_entry, rb_node);
+ if (checksum < entry->checksum)
+ rb_node = &parent->rb_left;
+ else if (checksum > entry->checksum)
+ rb_node = &parent->rb_right;
+ else
+ rb_node = &parent->rb_left;
+ }
+
+ rb_link_node(&new->rb_node, parent, rb_node);
+ rb_insert_color(&new->rb_node, rb_root);
+ spin_unlock(&hash->lock);
+}
+
+static bool zram_dedup_match(struct zram *zram, struct zram_entry *entry,
+ unsigned char *mem)
+{
+ bool match = false;
+ unsigned char *cmem;
+ struct zcomp_strm *zstrm;
+
+ cmem = zs_map_object(zram->mem_pool, entry->handle, ZS_MM_RO);
+ if (entry->len == PAGE_SIZE) {
+ match = !memcmp(mem, cmem, PAGE_SIZE);
+ } else {
+ zstrm = zcomp_stream_get(zram->comp);
+ if (!zcomp_decompress(zstrm, cmem, entry->len, zstrm->buffer))
+ match = !memcmp(mem, zstrm->buffer, PAGE_SIZE);
+ zcomp_stream_put(zram->comp);
+ }
+ zs_unmap_object(zram->mem_pool, entry->handle);
+
+ return match;
+}
+
+static unsigned long zram_dedup_put(struct zram *zram,
+ struct zram_entry *entry)
+{
+ struct zram_hash *hash;
+ u32 checksum;
+
+ checksum = entry->checksum;
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+
+ entry->refcount--;
+ if (!entry->refcount)
+ rb_erase(&entry->rb_node, &hash->rb_root);
+ else
+ atomic64_sub(entry->len, &zram->stats.dup_data_size);
+
+ spin_unlock(&hash->lock);
+
+ return entry->refcount;
+}
+
+static struct zram_entry *zram_dedup_get(struct zram *zram,
+ unsigned char *mem, u32 checksum)
+{
+ struct zram_hash *hash;
+ struct zram_entry *entry;
+ struct rb_node *rb_node;
+
+ hash = &zram->hash[checksum % zram->hash_size];
+
+ spin_lock(&hash->lock);
+ rb_node = hash->rb_root.rb_node;
+ while (rb_node) {
+ entry = rb_entry(rb_node, struct zram_entry, rb_node);
+ if (checksum == entry->checksum) {
+ entry->refcount++;
+ atomic64_add(entry->len, &zram->stats.dup_data_size);
+ spin_unlock(&hash->lock);
+
+ if (zram_dedup_match(zram, entry, mem))
+ return entry;
+
+ zram_entry_free(zram, entry);
+
+ return NULL;
+ }
+
+ if (checksum < entry->checksum)
+ rb_node = rb_node->rb_left;
+ else
+ rb_node = rb_node->rb_right;
+ }
+ spin_unlock(&hash->lock);
+
+ return NULL;
+}
+
+struct zram_entry *zram_dedup_find(struct zram *zram, struct page *page,
+ u32 *checksum)
+{
+ void *mem;
+ struct zram_entry *entry;
+
+ mem = kmap_atomic(page);
+ *checksum = zram_dedup_checksum(mem);
+
+ entry = zram_dedup_get(zram, mem, *checksum);
+ kunmap_atomic(mem);
+
+ return entry;
+}
+
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len)
+{
+ entry->handle = handle;
+ entry->refcount = 1;
+ entry->len = len;
+}
+
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry)
+{
+ if (zram_dedup_put(zram, entry))
+ return false;
+
+ return true;
+}
+
+int zram_dedup_init(struct zram *zram, size_t num_pages)
+{
+ int i;
+ struct zram_hash *hash;
+
+ zram->hash_size = num_pages >> ZRAM_HASH_SHIFT;
+void zram_dedup_init_entry(struct zram *zram, struct zram_entry *entry,
+ unsigned long handle, unsigned int len);
+bool zram_dedup_put_entry(struct zram *zram, struct zram_entry *entry);
+
+int zram_dedup_init(struct zram *zram, size_t num_pages);
+void zram_dedup_fini(struct zram *zram);
+
+#endif /* _ZRAM_DEDUP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 26dc4e5..8eab8a0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -389,14 +389,16 @@ static ssize_t mm_stat_show(struct device *dev,
max_used = atomic_long_read(&zram->stats.max_used_pages);

ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.same_pages),
- pool_stats.pages_compacted);
+ pool_stats.pages_compacted,
+ zram_dedup_dup_size(zram),
+ zram_dedup_meta_size(zram));
up_read(&zram->init_lock);

return ret;
@@ -481,26 +483,34 @@ static struct zram_entry *zram_entry_alloc(struct zram *zram,
unsigned int len, gfp_t flags)
{
struct zram_entry *entry;
+ unsigned long handle;

entry = kzalloc(sizeof(*entry),
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
if (!entry)
return NULL;

- entry->handle = zs_malloc(zram->mem_pool, len, flags);
- if (!entry->handle) {
+ handle = zs_malloc(zram->mem_pool, len, flags);
+ if (!handle) {
kfree(entry);
return NULL;
}

+ zram_dedup_init_entry(zram, entry, handle, len);
+ atomic64_add(sizeof(*entry), &zram->stats.meta_data_size);
+
return entry;
}

-static inline void zram_entry_free(struct zram *zram,
- struct zram_entry *entry)
+void zram_entry_free(struct zram *zram, struct zram_entry *entry)
{
+ if (!zram_dedup_put_entry(zram, entry))
+ return;
+
zs_free(zram->mem_pool, entry->handle);
kfree(entry);
+
+ atomic64_sub(sizeof(*entry), &zram->stats.meta_data_size);
}

static void zram_meta_free(struct zram *zram, u64 disksize)
@@ -513,6 +523,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
zram_free_page(zram, index);

zs_destroy_pool(zram->mem_pool);
+ zram_dedup_fini(zram);
vfree(zram->table);
}

@@ -531,6 +542,12 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
return false;
}

+ if (zram_dedup_init(zram, num_pages)) {
+ vfree(zram->table);
+ zs_destroy_pool(zram->mem_pool);
+ return false;
+ }
+
return true;
}

@@ -715,10 +732,17 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)
void *src, *dst;
struct zcomp_strm *zstrm;
struct page *page = bvec->bv_page;
+ u32 checksum;

if (zram_same_page_write(zram, index, page))
return 0;

+ entry = zram_dedup_find(zram, page, &checksum);
+ if (entry) {
+ comp_len = entry->len;
+ goto found_dup;
+ }
+
zstrm = zcomp_stream_get(zram->comp);
ret = zram_compress(zram, &zstrm, page, &entry, &comp_len);
if (ret) {
@@ -737,7 +761,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index)

zcomp_stream_put(zram->comp);
zs_unmap_object(zram->mem_pool, entry->handle);
+ zram_dedup_insert(zram, entry, checksum);

+found_dup:
/*
* Free memory associated with this sector
* before overwriting unused sectors.
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index fe3d216..4b86921 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -18,8 +18,10 @@
#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
#include <linux/crypto.h>
+#include <linux/spinlock.h>

#include "zcomp.h"
+#include "zram_dedup.h"

/*-- Configurable parameters */

@@ -70,6 +72,10 @@ enum zram_pageflags {
/*-- Data structures */

struct zram_entry {
+ struct rb_node rb_node;
+ u32 len;
+ u32 checksum;
+ unsigned long refcount;
unsigned long handle;
};

@@ -94,6 +100,13 @@ struct zram_stats {
atomic64_t pages_stored; /* no. of pages currently stored */
atomic_long_t max_used_pages; /* no. of maximum pages stored */
atomic64_t writestall; /* no. of write slow paths */
+ atomic64_t dup_data_size; /* compressed size of pages duplicated */
+ atomic64_t meta_data_size; /* size of zram_entries */
+};
+
+struct zram_hash {
+ spinlock_t lock;
+ struct rb_root rb_root;
};

struct zram {
@@ -101,6 +114,8 @@ struct zram {
struct zs_pool *mem_pool;
struct zcomp *comp;
struct gendisk *disk;
+ struct zram_hash *hash;
+ size_t hash_size;
/* Prevent concurrent execution of device init */
struct rw_semaphore init_lock;
/*
@@ -120,4 +135,6 @@ struct zram {
*/
bool claim; /* Protected by bdev->bd_mutex */
};
+
+void zram_entry_free(struct zram *zram, struct zram_entry *entry);
#endif
--
2.7.4