[RFC] [PATCH 0/2] memcg: per cgroup dirty limit

Andrea Righi

Feb 21, 2010, 10:30:01 AM
Control the maximum amount of dirty pages a cgroup can have at any given time.

A per-cgroup dirty limit fixes the maximum amount of dirty (hard to reclaim)
page cache that any cgroup may use. So, in the case of multiple cgroup writers, they
will not be able to consume more than their designated share of dirty pages and
will be forced to perform write-out if they cross that limit.

The overall design is the following:

- account dirty pages per cgroup
- limit the number of dirty pages via memory.dirty_bytes in cgroupfs
- start to write-out in balance_dirty_pages() when the cgroup or global limit
is exceeded

This feature is meant to work closely with any underlying IO controller
implementation, so that we can stop increasing dirty pages in the VM layer
and enforce a write-out before any single cgroup consumes the global amount of
dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes limit.
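To give an idea of how the pieces fit together, the check that
balance_dirty_pages() ends up doing with these patches boils down to
something like the following (simplified sketch, not the exact code; the
real path also still considers the per-bdi thresholds):

	/* simplified sketch of the per-cgroup check in balance_dirty_pages() */
	unsigned long dirty_bytes = mem_cgroup_dirty_bytes(); /* cgroup value, or vm_dirty_bytes */
	unsigned long thresh, nr_dirty;

	if (dirty_bytes)
		thresh = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
	else
		thresh = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;

	nr_dirty = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY) +
		   mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS) +
		   mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
	if (nr_dirty > thresh)
		/* start write-out */;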

TODO:
- handle the migration of tasks across different cgroups (a page may be set
dirty when a task runs in a cgroup and cleared after the task is moved to
another cgroup).
- provide appropriate documentation (in Documentation/cgroups/memory.txt)

-Andrea
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Andrea Righi

Feb 21, 2010, 10:30:01 AM
Infrastructure to account dirty pages per cgroup + add memory.dirty_bytes limit
in cgroupfs.

Signed-off-by: Andrea Righi <ari...@develer.com>
---
include/linux/memcontrol.h | 31 ++++++
mm/memcontrol.c | 218 +++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 248 insertions(+), 1 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119..ba3fe0d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,6 +25,16 @@ struct page_cgroup;
struct page;
struct mm_struct;

+/* Cgroup memory statistics items exported to the kernel */
+enum memcg_page_stat_item {
+ MEMCG_NR_FREE_PAGES,
+ MEMCG_NR_RECLAIMABLE_PAGES,
+ MEMCG_NR_FILE_DIRTY,
+ MEMCG_NR_WRITEBACK,
+ MEMCG_NR_WRITEBACK_TEMP,
+ MEMCG_NR_UNSTABLE_NFS,
+};
+
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
/*
* All "charge" functions with gfp_mask should use GFP_KERNEL or
@@ -48,6 +58,8 @@ extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr);

extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask);
+extern void mem_cgroup_charge_dirty(struct page *page,
+ enum zone_stat_item idx, int charge);
extern void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru);
extern void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru);
extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
@@ -117,6 +129,10 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
extern int do_swap_account;
#endif

+extern unsigned long mem_cgroup_dirty_bytes(void);
+
+extern u64 mem_cgroup_page_state(enum memcg_page_stat_item item);
+
static inline bool mem_cgroup_disabled(void)
{
if (mem_cgroup_subsys.disabled)
@@ -144,6 +160,11 @@ static inline int mem_cgroup_cache_charge(struct page *page,
return 0;
}

+static inline void mem_cgroup_charge_dirty(struct page *page,
+ enum zone_stat_item idx, int charge)
+{
+}
+
static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page, gfp_t gfp_mask, struct mem_cgroup **ptr)
{
@@ -312,6 +333,16 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return 0;
}

+static inline unsigned long mem_cgroup_dirty_bytes(void)
+{
+ return vm_dirty_bytes;
+}
+
+static inline u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
+{
+ return 0;
+}
+
#endif /* CONFIG_CGROUP_MEM_CONT */

#endif /* _LINUX_MEMCONTROL_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b..288b9a4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -64,13 +64,18 @@ enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_FILE_DIRTY, /* # of dirty pages in page cache */
+ MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
+ MEM_CGROUP_STAT_WRITEBACK_TEMP, /* # of pages under writeback using
+ temporary buffers */
+ MEM_CGROUP_STAT_UNSTABLE_NFS, /* # of NFS unstable pages */

MEM_CGROUP_STAT_NSTATS,
};
@@ -225,6 +230,9 @@ struct mem_cgroup {
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;

+ /* control memory cgroup dirty pages */
+ unsigned long dirty_bytes;
+
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -519,6 +527,67 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
put_cpu();
}

+static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem = NULL;
+
+ pc = lookup_page_cgroup(page);
+ if (unlikely(!pc))
+ return NULL;
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ mem = pc->mem_cgroup;
+ if (mem)
+ css_get(&mem->css);
+ }
+ unlock_page_cgroup(pc);
+ return mem;
+}
+
+void mem_cgroup_charge_dirty(struct page *page,
+ enum zone_stat_item idx, int charge)
+{
+ struct mem_cgroup *mem;
+ struct mem_cgroup_stat_cpu *cpustat;
+ unsigned long flags;
+ int cpu;
+
+ if (mem_cgroup_disabled())
+ return;
+ /* Translate the zone_stat_item into a mem_cgroup_stat_index */
+ switch (idx) {
+ case NR_FILE_DIRTY:
+ idx = MEM_CGROUP_STAT_FILE_DIRTY;
+ break;
+ case NR_WRITEBACK:
+ idx = MEM_CGROUP_STAT_WRITEBACK;
+ break;
+ case NR_WRITEBACK_TEMP:
+ idx = MEM_CGROUP_STAT_WRITEBACK_TEMP;
+ break;
+ case NR_UNSTABLE_NFS:
+ idx = MEM_CGROUP_STAT_UNSTABLE_NFS;
+ break;
+ default:
+ return;
+ }
+ /* Charge the memory cgroup statistics */
+ mem = get_mem_cgroup_from_page(page);
+ if (!mem) {
+ mem = root_mem_cgroup;
+ css_get(&mem->css);
+ }
+
+ local_irq_save(flags);
+ cpu = get_cpu();
+ cpustat = &mem->stat.cpustat[cpu];
+ __mem_cgroup_stat_add_safe(cpustat, idx, charge);
+ put_cpu();
+ local_irq_restore(flags);
+ css_put(&mem->css);
+}
+
static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
enum lru_list idx)
{
@@ -992,6 +1061,97 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}

+static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
+{
+ struct cgroup *cgrp = memcg->css.cgroup;
+ unsigned long dirty_bytes;
+
+ /* root ? */
+ if (cgrp->parent == NULL)
+ return vm_dirty_bytes;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ dirty_bytes = memcg->dirty_bytes;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return dirty_bytes;
+}
+
+unsigned long mem_cgroup_dirty_bytes(void)
+{
+ struct mem_cgroup *memcg;
+ unsigned long dirty_bytes;
+
+ if (mem_cgroup_disabled())
+ return vm_dirty_bytes;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg == NULL)
+ dirty_bytes = vm_dirty_bytes;
+ else
+ dirty_bytes = get_dirty_bytes(memcg);
+ rcu_read_unlock();
+
+ return dirty_bytes;
+}
+
+u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
+{
+ struct mem_cgroup *memcg;
+ struct cgroup *cgrp;
+ u64 ret = 0;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg == NULL)
+ goto out;
+ cgrp = memcg->css.cgroup;
+ /* Use system-wide statistics for the root cgroup */
+ if (cgrp->parent == NULL)
+ goto out;
+ switch (item) {
+ case MEMCG_NR_FREE_PAGES:
+ ret = res_counter_read_u64(&memcg->res, RES_LIMIT) -
+ res_counter_read_u64(&memcg->res, RES_USAGE);
+ /*
+ * Translate free memory in pages and ensure we never return 0.
+ */
+ ret = (ret >> PAGE_SHIFT) + 1;
+ break;
+ case MEMCG_NR_RECLAIMABLE_PAGES:
+ ret = mem_cgroup_read_stat(&memcg->stat, LRU_ACTIVE_ANON) +
+ mem_cgroup_read_stat(&memcg->stat, LRU_ACTIVE_FILE) +
+ mem_cgroup_read_stat(&memcg->stat, LRU_INACTIVE_ANON) +
+ mem_cgroup_read_stat(&memcg->stat, LRU_INACTIVE_FILE);
+ break;
+ case MEMCG_NR_FILE_DIRTY:
+ ret = mem_cgroup_read_stat(&memcg->stat,
+ MEM_CGROUP_STAT_FILE_DIRTY);
+ break;
+ case MEMCG_NR_WRITEBACK:
+ ret = mem_cgroup_read_stat(&memcg->stat,
+ MEM_CGROUP_STAT_WRITEBACK);
+ break;
+ case MEMCG_NR_WRITEBACK_TEMP:
+ ret = mem_cgroup_read_stat(&memcg->stat,
+ MEM_CGROUP_STAT_WRITEBACK_TEMP);
+ break;
+ case MEMCG_NR_UNSTABLE_NFS:
+ ret = mem_cgroup_read_stat(&memcg->stat,
+ MEM_CGROUP_STAT_UNSTABLE_NFS);
+ break;
+ default:
+ WARN_ON(1);
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -2874,6 +3034,10 @@ enum {
MCS_PGPGIN,
MCS_PGPGOUT,
MCS_SWAP,
+ MCS_FILE_DIRTY,
+ MCS_WRITEBACK,
+ MCS_WRITEBACK_TEMP,
+ MCS_UNSTABLE_NFS,
MCS_INACTIVE_ANON,
MCS_ACTIVE_ANON,
MCS_INACTIVE_FILE,
@@ -2896,6 +3060,10 @@ struct {
{"pgpgin", "total_pgpgin"},
{"pgpgout", "total_pgpgout"},
{"swap", "total_swap"},
+ {"filedirty", "dirty_pages"},
+ {"writeback", "writeback_pages"},
+ {"writeback_tmp", "writeback_temp_pages"},
+ {"nfs", "nfs_unstable"},
{"inactive_anon", "total_inactive_anon"},
{"active_anon", "total_active_anon"},
{"inactive_file", "total_inactive_file"},
@@ -2924,6 +3092,14 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_DIRTY);
+ s->stat[MCS_FILE_DIRTY] += val;
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_WRITEBACK);
+ s->stat[MCS_WRITEBACK] += val;
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_WRITEBACK_TEMP);
+ s->stat[MCS_WRITEBACK_TEMP] += val;
+ val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_UNSTABLE_NFS);
+ s->stat[MCS_UNSTABLE_NFS] += val;

/* per zone stat */
val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3049,6 +3225,41 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return 0;
}

+static u64 mem_cgroup_dirty_bytes_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_dirty_bytes(memcg);
+}
+
+static int mem_cgroup_dirty_bytes_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+ struct mem_cgroup *parent;
+
+ if (cgrp->parent == NULL)
+ return -EINVAL;
+
+ parent = mem_cgroup_from_cont(cgrp->parent);
+
+ cgroup_lock();
+
+ /* If under hierarchy, only empty-root can set this value */
+ if ((parent->use_hierarchy) ||
+ (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
+ cgroup_unlock();
+ return -EINVAL;
+ }
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->dirty_bytes = val;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ cgroup_unlock();
+
+ return 0;
+}

static struct cftype mem_cgroup_files[] = {
{
@@ -3098,6 +3309,11 @@ static struct cftype mem_cgroup_files[] = {
.read_u64 = mem_cgroup_swappiness_read,
.write_u64 = mem_cgroup_swappiness_write,
},
+ {
+ .name = "dirty_bytes",
+ .read_u64 = mem_cgroup_dirty_bytes_read,
+ .write_u64 = mem_cgroup_dirty_bytes_write,
+ },
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
--
1.6.3.3

Andrea Righi

Feb 21, 2010, 10:30:02 AM
Apply the cgroup dirty pages accounting and limiting infrastructure to
the relevant kernel functions.

Signed-off-by: Andrea Righi <ari...@develer.com>
---

fs/fuse/file.c | 3 ++
fs/nfs/write.c | 3 ++
fs/nilfs2/segment.c | 8 ++++-
mm/filemap.c | 1 +
mm/page-writeback.c | 69 ++++++++++++++++++++++++++++++++++++--------------
mm/truncate.c | 1 +
6 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e13..357632a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
+#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/module.h>

@@ -1129,6 +1130,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)

list_del(&req->writepages_entry);
dec_bdi_stat(bdi, BDI_WRITEBACK);
+ mem_cgroup_charge_dirty(req->pages[0], NR_WRITEBACK_TEMP, -1);
dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
bdi_writeout_inc(bdi);
wake_up(&fi->page_waitq);
@@ -1240,6 +1242,7 @@ static int fuse_writepage_locked(struct page *page)
req->inode = inode;

inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ mem_cgroup_charge_dirty(tmp_page, NR_WRITEBACK_TEMP, 1);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
end_page_writeback(page);

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index d63d964..3d9de01 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -439,6 +439,7 @@ nfs_mark_request_commit(struct nfs_page *req)
req->wb_index,
NFS_PAGE_TAG_COMMIT);
spin_unlock(&inode->i_lock);
+ mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, 1);
inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -450,6 +451,7 @@ nfs_clear_request_commit(struct nfs_page *req)
struct page *page = req->wb_page;

if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ mem_cgroup_charge_dirty(page, NR_UNSTABLE_NFS, -1);
dec_zone_page_state(page, NR_UNSTABLE_NFS);
dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
return 1;
@@ -1320,6 +1322,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
nfs_mark_request_commit(req);
+ mem_cgroup_charge_dirty(req->wb_page, NR_UNSTABLE_NFS, -1);
dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
BDI_RECLAIMABLE);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 105b508..b9ffac5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1660,8 +1660,10 @@ nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
kunmap_atomic(kaddr, KM_USER0);

- if (!TestSetPageWriteback(clone_page))
+ if (!TestSetPageWriteback(clone_page)) {
+ mem_cgroup_charge_dirty(clone_page, NR_WRITEBACK, 1);
inc_zone_page_state(clone_page, NR_WRITEBACK);
+ }
unlock_page(clone_page);

return 0;
@@ -1788,8 +1790,10 @@ static void __nilfs_end_page_io(struct page *page, int err)
}

if (buffer_nilfs_allocated(page_buffers(page))) {
- if (TestClearPageWriteback(page))
+ if (TestClearPageWriteback(page)) {
+ mem_cgroup_charge_dirty(page, NR_WRITEBACK, -1);
dec_zone_page_state(page, NR_WRITEBACK);
+ }
} else
end_page_writeback(page);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80..c19d809 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -135,6 +135,7 @@ void __remove_from_page_cache(struct page *page)
* having removed the page entirely.
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943..c9ff1cd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
*/
static int calc_period_shift(void)
{
- unsigned long dirty_total;
+ unsigned long dirty_total, dirty_bytes;

- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
+ dirty_bytes = mem_cgroup_dirty_bytes();
+ if (dirty_bytes)
+ dirty_total = dirty_bytes / PAGE_SIZE;
else
dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
100;
@@ -406,14 +407,20 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
*/
unsigned long determine_dirtyable_memory(void)
{
- unsigned long x;
-
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
-
+ unsigned long memcg_memory, memory;
+
+ memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
+ if (memcg_memory > 0) {
+ memcg_memory +=
+ mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
+ if (memcg_memory < memory)
+ return memcg_memory;
+ }
if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
+ memory -= highmem_dirtyable_memory(memory);

- return x + 1; /* Ensure that we never return 0 */
+ return memory + 1; /* Ensure that we never return 0 */
}

void
@@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
{
unsigned long background;
- unsigned long dirty;
+ unsigned long dirty, dirty_bytes;
unsigned long available_memory = determine_dirtyable_memory();
struct task_struct *tsk;

- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ dirty_bytes = mem_cgroup_dirty_bytes();
+ if (dirty_bytes)
+ dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
else {
int dirty_ratio;

@@ -505,9 +513,17 @@ static void balance_dirty_pages(struct address_space *mapping,
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);

- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+ nr_reclaimable = mem_cgroup_page_state(MEMCG_NR_FILE_DIRTY);
+ if (nr_reclaimable == 0) {
+ nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_writeback = global_page_state(NR_WRITEBACK);
+ } else {
+ nr_reclaimable +=
+ mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
+ nr_writeback =
+ mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
+ }

bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
@@ -660,6 +676,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;

for ( ; ; ) {
+ unsigned long dirty;
+
get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

/*
@@ -668,10 +686,15 @@ void throttle_vm_writeout(gfp_t gfp_mask)
*/
dirty_thresh += dirty_thresh / 10; /* wheeee... */

- if (global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK) <= dirty_thresh)
- break;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ dirty = mem_cgroup_page_state(MEMCG_NR_WRITEBACK);
+ if (dirty < 0)
+ dirty = global_page_state(NR_UNSTABLE_NFS) +
+ global_page_state(NR_WRITEBACK);
+ else
+ dirty += mem_cgroup_page_state(MEMCG_NR_UNSTABLE_NFS);
+ if (dirty <= dirty_thresh)
+ break;
+ congestion_wait(BLK_RW_ASYNC, HZ/10);

/*
* The caller might hold locks which can prevent IO completion
@@ -1078,6 +1101,7 @@ int __set_page_dirty_no_writeback(struct page *page)
void account_page_dirtied(struct page *page, struct address_space *mapping)
{
if (mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, 1);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
task_dirty_inc(current);
@@ -1253,6 +1277,7 @@ int clear_page_dirty_for_io(struct page *page)
* for more comments.
*/
if (TestClearPageDirty(page)) {
+ mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
@@ -1288,8 +1313,10 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
+ mem_cgroup_charge_dirty(page, NR_WRITEBACK, -1);
dec_zone_page_state(page, NR_WRITEBACK);
+ }
return ret;
}

@@ -1319,8 +1346,10 @@ int test_set_page_writeback(struct page *page)
} else {
ret = TestSetPageWriteback(page);
}
- if (!ret)
+ if (!ret) {
+ mem_cgroup_charge_dirty(page, NR_WRITEBACK, 1);
inc_zone_page_state(page, NR_WRITEBACK);
+ }
return ret;

}
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e372..f44e370 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -73,6 +73,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
if (TestClearPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_charge_dirty(page, NR_FILE_DIRTY, -1);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info,
BDI_RECLAIMABLE);
--
1.6.3.3

David Rientjes

Feb 21, 2010, 4:30:01 PM

Is it possible to merge this with try_get_mem_cgroup_from_page()?

WARN()? We don't want to silently leak counters.

> + }
> + /* Charge the memory cgroup statistics */
> + mem = get_mem_cgroup_from_page(page);
> + if (!mem) {
> + mem = root_mem_cgroup;
> + css_get(&mem->css);
> + }

get_mem_cgroup_from_page() should probably handle the root_mem_cgroup case
and return a reference from it.

> +
> + local_irq_save(flags);
> + cpu = get_cpu();
> + cpustat = &mem->stat.cpustat[cpu];
> + __mem_cgroup_stat_add_safe(cpustat, idx, charge);

get_cpu()? Preemption is already disabled, just use smp_processor_id().

The rcu_read_lock() isn't protecting anything here.

> +
> + return dirty_bytes;
> +}
> +
> +u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
> +{
> + struct mem_cgroup *memcg;
> + struct cgroup *cgrp;
> + u64 ret = 0;
> +
> + if (mem_cgroup_disabled())
> + return 0;
> +
> + rcu_read_lock();

Again, this isn't necessary.

David Rientjes

Feb 21, 2010, 4:40:02 PM
On Sun, 21 Feb 2010, Andrea Righi wrote:

> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0b19943..c9ff1cd 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
> */
> static int calc_period_shift(void)
> {
> - unsigned long dirty_total;
> + unsigned long dirty_total, dirty_bytes;
>
> - if (vm_dirty_bytes)
> - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> + dirty_bytes = mem_cgroup_dirty_bytes();
> + if (dirty_bytes)
> + dirty_total = dirty_bytes / PAGE_SIZE;
> else
> dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> 100;

This needs a comment since mem_cgroup_dirty_bytes() doesn't imply that it
is responsible for returning the global vm_dirty_bytes when that's
actually what it does (both for CONFIG_CGROUP_MEM_RES_CTRL=n and root
cgroup).

Andrea Righi

Feb 21, 2010, 5:20:02 PM
On Sun, Feb 21, 2010 at 01:28:35PM -0800, David Rientjes wrote:

[snip]

> > +static struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
> > +{
> > + struct page_cgroup *pc;
> > + struct mem_cgroup *mem = NULL;
> > +
> > + pc = lookup_page_cgroup(page);
> > + if (unlikely(!pc))
> > + return NULL;
> > + lock_page_cgroup(pc);
> > + if (PageCgroupUsed(pc)) {
> > + mem = pc->mem_cgroup;
> > + if (mem)
> > + css_get(&mem->css);
> > + }
> > + unlock_page_cgroup(pc);
> > + return mem;
> > +}
>
> Is it possible to merge this with try_get_mem_cgroup_from_page()?

Agreed.

Agreed.

>
> > + }
> > + /* Charge the memory cgroup statistics */
> > + mem = get_mem_cgroup_from_page(page);
> > + if (!mem) {
> > + mem = root_mem_cgroup;
> > + css_get(&mem->css);
> > + }
>
> get_mem_cgroup_from_page() should probably handle the root_mem_cgroup case
> and return a reference from it.

Right. But I'd prefer to use try_get_mem_cgroup_from_page() without
changing the behaviour of this function.

>
> > +
> > + local_irq_save(flags);
> > + cpu = get_cpu();
> > + cpustat = &mem->stat.cpustat[cpu];
> > + __mem_cgroup_stat_add_safe(cpustat, idx, charge);
>
> get_cpu()? Preemption is already disabled, just use smp_processor_id().

mmmh... actually, we can just copy the code from
mem_cgroup_charge_statistics(), so local_irq_save/restore are not
necessarily needed and we can just use get_cpu()/put_cpu().

Right!

>
> > +
> > + return dirty_bytes;
> > +}
> > +
> > +u64 mem_cgroup_page_state(enum memcg_page_stat_item item)
> > +{
> > + struct mem_cgroup *memcg;
> > + struct cgroup *cgrp;
> > + u64 ret = 0;
> > +
> > + if (mem_cgroup_disabled())
> > + return 0;
> > +
> > + rcu_read_lock();
>
> Again, this isn't necessary.

OK. I'll apply your changes to the next version of this patch.

Thanks for reviewing!

-Andrea

Andrea Righi

Feb 21, 2010, 5:40:02 PM
On Sun, Feb 21, 2010 at 01:38:28PM -0800, David Rientjes wrote:
> On Sun, 21 Feb 2010, Andrea Righi wrote:
>
> > diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> > index 0b19943..c9ff1cd 100644
> > --- a/mm/page-writeback.c
> > +++ b/mm/page-writeback.c
> > @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
> > */
> > static int calc_period_shift(void)
> > {
> > - unsigned long dirty_total;
> > + unsigned long dirty_total, dirty_bytes;
> >
> > - if (vm_dirty_bytes)
> > - dirty_total = vm_dirty_bytes / PAGE_SIZE;
> > + dirty_bytes = mem_cgroup_dirty_bytes();
> > + if (dirty_bytes)
> > + dirty_total = dirty_bytes / PAGE_SIZE;
> > else
> > dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> > 100;
>
> This needs a comment since mem_cgroup_dirty_bytes() doesn't imply that it
> is responsible for returning the global vm_dirty_bytes when that's
> actually what it does (both for CONFIG_CGROUP_MEM_RES_CTRL=n and root
> cgroup).

Fair enough.

Thanks,
-Andrea

KAMEZAWA Hiroyuki

Feb 21, 2010, 7:00:01 PM
On Sun, 21 Feb 2010 16:18:43 +0100
Andrea Righi <ari...@develer.com> wrote:

> Control the maximum amount of dirty pages a cgroup can have at any given time.
>
> Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> page cache used by any cgroup. So, in case of multiple cgroup writers, they
> will not be able to consume more than their designated share of dirty pages and
> will be forced to perform write-out if they cross that limit.
>
> The overall design is the following:
>
> - account dirty pages per cgroup
> - limit the number of dirty pages via memory.dirty_bytes in cgroupfs
> - start to write-out in balance_dirty_pages() when the cgroup or global limit
> is exceeded
>
> This feature is supposed to be strictly connected to any underlying IO
> controller implementation, so we can stop increasing dirty pages in VM layer
> and enforce a write-out before any cgroup will consume the global amount of
> dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes limit.
>
> TODO:
> - handle the migration of tasks across different cgroups (a page may be set
> dirty when a task runs in a cgroup and cleared after the task is moved to
> another cgroup).
> - provide an appropriate documentation (in Documentation/cgroups/memory.txt)
>

Thank you, this was a long concern in memcg.

Regards,
-Kame

KAMEZAWA Hiroyuki

Feb 21, 2010, 7:30:02 PM
On Sun, 21 Feb 2010 16:18:44 +0100
Andrea Righi <ari...@develer.com> wrote:

> Infrastructure to account dirty pages per cgroup + add memory.dirty_bytes limit
> in cgroupfs.
>
> Signed-off-by: Andrea Righi <ari...@develer.com>

Looks clean in general. But there are some conflicts with memcg in mmotm.
And please CC linu...@kvack.org when you modify memcg.

?

Hmm. do we need to get refcnt for dirty page accounting ?

This patch will conflict with mmotm; the per_cpu stats are all rewritten there.
My first impression is that you should add DIRTY/WRITEBACK/UNSTABLE flags to
page_cgroup. If not, you can never handle migration.

And css_get()/css_put() are very heavy operations. (You can see it with perf, I think.)

So,
	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	mem = pc->mem_cgroup;
	if (mem && PageCgroupUsed(pc)) {
		/* account */
	}
	unlock_page_cgroup(pc);

is better, rather than adding an accessor function.
memcg is stable while the USED page_cgroup is held under its lock.

> +
> static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> enum lru_list idx)
> {
> @@ -992,6 +1061,97 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
> return swappiness;
> }
>
> +static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
> +{
> + struct cgroup *cgrp = memcg->css.cgroup;
> + unsigned long dirty_bytes;
> +
> + /* root ? */
> + if (cgrp->parent == NULL)
> + return vm_dirty_bytes;

We have mem_cgroup_is_root() macro.

> +
> + spin_lock(&memcg->reclaim_param_lock);
> + dirty_bytes = memcg->dirty_bytes;
> + spin_unlock(&memcg->reclaim_param_lock);
> +
> + return dirty_bytes;
> +}

Hmm... do we need a spinlock? You use "unsigned long", so reads and writes
are always atomic as long as there is no read-modify-write.

mem_cgroup_is_root() again.

Hmm. A concern here is whether you have to consider
"hierarchical accounting" or not. It's a design decision, but you have
to describe "dirty limit doesn't handle hierarchical accounting"
somewhere. But I should read the 2nd patch before I comment more.

nitpick: when I add a "per-memcg, not for hierarchy" function, I tend to
add _local_ to the function name, as in mem_cgroup_get_local_stat().

Okay, then, only the hierarchy root can set the value.
Please describe this kind of important thing in the patch description.

Thanks.
-Kame

KAMEZAWA Hiroyuki

Feb 21, 2010, 7:40:01 PM
On Sun, 21 Feb 2010 16:18:45 +0100
Andrea Righi <ari...@develer.com> wrote:

> Apply the cgroup dirty pages accounting and limiting infrastructure to
> the opportune kernel functions.
>
> Signed-off-by: Andrea Righi <ari...@develer.com>

I think there is some design confusion with the 1st patch.


> ---
> fs/fuse/file.c | 3 ++
> fs/nfs/write.c | 3 ++
> fs/nilfs2/segment.c | 8 ++++-
> mm/filemap.c | 1 +
> mm/page-writeback.c | 69 ++++++++++++++++++++++++++++++++++++--------------
> mm/truncate.c | 1 +
> 6 files changed, 63 insertions(+), 22 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index a9f5e13..357632a 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -11,6 +11,7 @@
> #include <linux/pagemap.h>
> #include <linux/slab.h>
> #include <linux/kernel.h>
> +#include <linux/memcontrol.h>
> #include <linux/sched.h>
> #include <linux/module.h>
>
> @@ -1129,6 +1130,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
>
> list_del(&req->writepages_entry);
> dec_bdi_stat(bdi, BDI_WRITEBACK);
> + mem_cgroup_charge_dirty(req->pages[0], NR_WRITEBACK_TEMP, -1);

Here, you account dirty pages to the memcg which the page_cgroup belongs to,
not to the root cgroup of the hierarchical accounting.


> dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
> bdi_writeout_inc(bdi);
> wake_up(&fi->page_waitq);
> @@ -1240,6 +1242,7 @@ static int fuse_writepage_locked(struct page *page)
> req->inode = inode;
>
> inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
> + mem_cgroup_charge_dirty(tmp_page, NR_WRITEBACK_TEMP, 1);

here too....

Here, you get the dirty pages of the memcg which the task belongs to,
not from the root memcg of the memcg hierarchy which the task belongs to.

> + if (dirty_bytes)
> + dirty_total = dirty_bytes / PAGE_SIZE;
> else
> dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
> 100;
> @@ -406,14 +407,20 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
> */
> unsigned long determine_dirtyable_memory(void)
> {
> - unsigned long x;
> -
> - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> -
> + unsigned long memcg_memory, memory;
> +
> + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> + if (memcg_memory > 0) {
> + memcg_memory +=
> + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> + if (memcg_memory < memory)
> + return memcg_memory;
> + }

Here, you get local usage.


> if (!vm_highmem_is_dirtyable)
> - x -= highmem_dirtyable_memory(x);
> + memory -= highmem_dirtyable_memory(memory);
>
> - return x + 1; /* Ensure that we never return 0 */
> + return memory + 1; /* Ensure that we never return 0 */
> }
>
> void
> @@ -421,12 +428,13 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
> unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
> {
> unsigned long background;
> - unsigned long dirty;
> + unsigned long dirty, dirty_bytes;
> unsigned long available_memory = determine_dirtyable_memory();
> struct task_struct *tsk;
>
> - if (vm_dirty_bytes)
> - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> + dirty_bytes = mem_cgroup_dirty_bytes();
> + if (dirty_bytes)
> + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
> else {
> int dirty_ratio;

You use the local value. But if hierarchical accounting is used, memcg->dirty_bytes
should be taken from the root-of-hierarchy memcg.

I have no objection if you add a pointer such as
memcg->subhierarchy_root
to get the root of hierarchical accounting. But please check the hierarchy problem again.
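For example, even without adding a new field, the root could be found with
something like this (just a sketch, the helper name is made up):

static struct mem_cgroup *mem_cgroup_hierarchy_root(struct mem_cgroup *memcg)
{
	struct cgroup *cgrp = memcg->css.cgroup;

	/* walk up while the parent still uses hierarchical accounting */
	while (cgrp->parent && mem_cgroup_from_cont(cgrp->parent)->use_hierarchy)
		cgrp = cgrp->parent;

	return mem_cgroup_from_cont(cgrp);
}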

Thanks,
-Kame

Vivek Goyal

Feb 22, 2010, 9:30:03 AM
On Sun, Feb 21, 2010 at 04:18:43PM +0100, Andrea Righi wrote:
> Control the maximum amount of dirty pages a cgroup can have at any given time.
>
> Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> page cache used by any cgroup. So, in case of multiple cgroup writers, they
> will not be able to consume more than their designated share of dirty pages and
> will be forced to perform write-out if they cross that limit.
>
> The overall design is the following:
>
> - account dirty pages per cgroup
> - limit the number of dirty pages via memory.dirty_bytes in cgroupfs
> - start to write-out in balance_dirty_pages() when the cgroup or global limit
> is exceeded
>
> This feature is supposed to be strictly connected to any underlying IO
> controller implementation, so we can stop increasing dirty pages in VM layer
> and enforce a write-out before any cgroup will consume the global amount of
> dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes limit.
>

Thanks Andrea. I had been thinking about looking into it from IO
controller perspective so that we can control async IO (buffered writes
also).

Before I dive into patches, two quick things.

- IIRC, last time you had implemented a per memory cgroup "dirty_ratio" and
not "dirty_bytes". Why this change? To begin with, a per-memcg
configurable dirty ratio also makes sense; by default it can be the
global dirty ratio for each cgroup.

- Looks like we will start writeout from the memory cgroup once we cross the
dirty ratio, but still there is no guarantee that we will be writing pages
belonging to the cgroup which crossed the dirty ratio and triggered the
writeout.

This behavior is not very good, at least from the IO controller perspective:
if two dd threads are dirtying memory in two cgroups and one crosses its
dirty ratio, it should perform writeouts of its own pages and not other
cgroups' pages. Otherwise we will probably again introduce serialization
among the two writers and will not see service differentiation.

Maybe we can modify writeback_inodes_wbc() to check the first dirty page
of the inode, and if it does not belong to the same memcg as the task that
is performing balance_dirty_pages(), skip that inode (a sketch of such a
check follows below).

This does not handle the problem of shared files, where processes from
two different cgroups are dirtying the same file, but it will at least cover
other cases without introducing too much complexity?
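Something along these lines, for example (just a sketch, assuming it lives
in mm/memcontrol.c so that the mem_cgroup internals are visible, and looking
only at the first dirty page of the mapping):

/* return 0 if the inode's first dirty page belongs to another memcg */
static int inode_in_current_memcg(struct inode *inode)
{
	struct mem_cgroup *page_memcg, *task_memcg;
	struct page *page;
	pgoff_t index = 0;
	int ret = 1;

	if (!find_get_pages_tag(inode->i_mapping, &index,
				PAGECACHE_TAG_DIRTY, 1, &page))
		return 1;	/* no dirty pages, nothing to skip */

	page_memcg = try_get_mem_cgroup_from_page(page);
	rcu_read_lock();
	task_memcg = mem_cgroup_from_task(current);
	if (page_memcg && page_memcg != task_memcg)
		ret = 0;
	rcu_read_unlock();
	if (page_memcg)
		css_put(&page_memcg->css);
	page_cache_release(page);

	return ret;
}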

Thanks
Vivek

Vivek Goyal

Feb 22, 2010, 11:10:02 AM

We seem to be doing the same operation as the existing "mem_cgroup_update_file_mapped"
function does to update some stats. Can we just reuse that? We
can probably create one core function which takes the index of the stat to update,
and update_file_mapped and other variants for the memcg dirty ratio can make
use of it.

In fact, a single charge_dirty() function accounting for
WRITEBACK as well as other states like UNSTABLE_NFS is not very intuitive.
Maybe we can have individual functions (see the sketch after the list below).

mem_cgroup_update_dirty()
mem_cgroup_update_writeback()
mem_cgroup_update_unstable_nfs() etc.
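For example (sketch only, reusing the same lookup/locking pattern as
mem_cgroup_update_file_mapped() and the per-cpu stats shown in patch 1):

static void __mem_cgroup_update_page_stat(struct page *page,
					  enum mem_cgroup_stat_index idx, int val)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu;

	pc = lookup_page_cgroup(page);
	if (unlikely(!pc))
		return;

	lock_page_cgroup(pc);
	mem = pc->mem_cgroup;
	if (mem && PageCgroupUsed(pc)) {
		cpu = get_cpu();
		cpustat = &mem->stat.cpustat[cpu];
		__mem_cgroup_stat_add_safe(cpustat, idx, val);
		put_cpu();
	}
	unlock_page_cgroup(pc);
}

void mem_cgroup_update_dirty(struct page *page, int val)
{
	__mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_DIRTY, val);
}

The writeback and unstable-NFS wrappers would be identical apart from the stat index.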

Thanks
Vivek

Balbir Singh

Feb 22, 2010, 11:20:03 AM
* Andrea Righi <ari...@develer.com> [2010-02-21 16:18:44]:

Sounds like a reusable routine, have I seen it before?

Kamezawa is in the process of changing these, so you might want to
look at and integrate with those patches when they are ready.

> + cpustat = &mem->stat.cpustat[cpu];
> + __mem_cgroup_stat_add_safe(cpustat, idx, charge);
> + put_cpu();
> + local_irq_restore(flags);
> + css_put(&mem->css);
> +}
> +

I would not recommend introducing a function that is not used in the
patch.

> static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
> enum lru_list idx)
> {
> @@ -992,6 +1061,97 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
> return swappiness;
> }
>
> +static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
> +{
> + struct cgroup *cgrp = memcg->css.cgroup;
> + unsigned long dirty_bytes;
> +
> + /* root ? */
> + if (cgrp->parent == NULL)
> + return vm_dirty_bytes;
> +
> + spin_lock(&memcg->reclaim_param_lock);
> + dirty_bytes = memcg->dirty_bytes;
> + spin_unlock(&memcg->reclaim_param_lock);

Can't the read be RCU protected? Do we need a spin lock here?

> _______________________________________________
> Containers mailing list
> Conta...@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

--
Three Cheers,
Balbir

Vivek Goyal

Feb 22, 2010, 12:00:02 PM

it could be just

if (memcg_memory) {
}

> + memcg_memory +=
> + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> + if (memcg_memory < memory)
> + return memcg_memory;
> + }
> if (!vm_highmem_is_dirtyable)
> - x -= highmem_dirtyable_memory(x);
> + memory -= highmem_dirtyable_memory(memory);
>

If vm_highmem_is_dirtyable=0, we can still return "memcg_memory", which can
be more than "memory". IOW, highmem is not dirtyable system wide, but we can
still potentially return back saying that this cgroup can dirty more pages,
which may actually be more than what is allowed system wide?

Because you have modified dirtyable_memory() and made it per cgroup, I
think it automatically takes care of the per-cgroup dirty ratio cases
I mentioned in my previous mail. So we will use the system wide dirty ratio
to calculate the allowed dirty pages in this cgroup (dirty_ratio *
available_memory()), and if this cgroup wrote too many pages, start
writeout?

dirty is unsigned long. Will the above condition ever be true?

Are you expecting that NR_WRITEBACK can go negative?

Vivek

Balbir Singh

Feb 22, 2010, 12:30:02 PM
* Vivek Goyal <vgo...@redhat.com> [2010-02-22 10:58:40]:

>
> We seem to be doing same operation as existing "mem_cgroup_update_file_mapped"
> function is doing to udpate some stats. Can we just reuse that? We
> probably can create one core function which take index of stat to update
> and update_file_mapped and other variants for memcg dirty ratio can make
> use of it.
>

Good Point, it can be easily extended to be generic



> In fact instead of single function charge_dirty() accounting for
> WRITEBACK, we well as other states like UNSTABLE_NFS is not very intutive.
> May be we can have indivdual functions.
>
> mem_cgroup_update_dirty()
> mem_cgroup_update_writeback()
> mem_cgroup_update_unstable_nfs() etc.
>

Hmm.. probably yes.

--
Three Cheers,
Balbir

Balbir Singh

Feb 22, 2010, 12:40:02 PM
* Vivek Goyal <vgo...@redhat.com> [2010-02-22 09:27:45]:

I thought that the I/O controller would eventually provide hooks to do
this.. no?

>
> May be we can modify writeback_inodes_wbc() to check first dirty page
> of the inode. And if it does not belong to same memcg as the task who
> is performing balance_dirty_pages(), then skip that inode.

Do you expect all pages of an inode to be paged in by the same cgroup?


--
Three Cheers,
Balbir

Andrea Righi

Feb 22, 2010, 1:00:03 PM
On Mon, Feb 22, 2010 at 09:32:21AM +0900, KAMEZAWA Hiroyuki wrote:
> > - if (vm_dirty_bytes)
> > - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
> > + dirty_bytes = mem_cgroup_dirty_bytes();
> > + if (dirty_bytes)
> > + dirty = DIV_ROUND_UP(dirty_bytes, PAGE_SIZE);
> > else {
> > int dirty_ratio;
>
> you use local value. But, if hierarchila accounting used, memcg->dirty_bytes
> should be got from root-of-hierarchy memcg.
>
> I have no objection if you add a pointer as
> memcg->subhierarchy_root
> to get root of hierarchical accounting. But please check problem of hierarchy, again.

Right, it won't work with hierarchy. I'll fix also considering the
hierarchy case.

Thanks for your review.

-Andrea

Vivek Goyal

Feb 22, 2010, 1:10:02 PM

Are we not protecting "memcg" pointer using rcu here?

Thanks
Vivek

Vivek Goyal

Feb 22, 2010, 1:10:02 PM

Actually no. This belongs to the writeback logic which selects the inode to
write from. Ideally, like the reclaim logic, we need to flush out the pages
from the memory cgroup which is being throttled, so that we can create
parallel buffered write paths at a higher layer, and the rate of IO allowed on
these paths can be controlled by the IO controller (either proportional BW or
max BW etc).

Currently the issue is that everything in the page cache is common and there
is no means in the writeout path to create a service differentiation. This is
where per memory cgroup dirty_ratio/dirty_bytes can be useful: writeouts
from a cgroup are not throttled until it hits its own dirty limits.

Also we need to make sure that, in case of throttling, we are submitting pages
for writeout from our own cgroup and not from another cgroup, otherwise we
are back to the same situation.

>
> >
> > May be we can modify writeback_inodes_wbc() to check first dirty page
> > of the inode. And if it does not belong to same memcg as the task who
> > is performing balance_dirty_pages(), then skip that inode.
>
> Do you expect all pages of an inode to be paged in by the same cgroup?

I guess at least in simple cases. Not sure whether it will cover the majority
of usage or not, and to what extent that matters.

If we start doing background writeout on a per-page basis (like memory reclaim),
then it will probably be slower, and hence flushing out pages sequentially
from an inode makes sense.

At one point I was thinking: like pages, can we have an inode list per
memory cgroup so that the writeback logic can traverse that inode list to
determine which inodes need to be cleaned? But associating inodes with a
memory cgroup is not very intuitive, and at the same time we again have the
issue of shared file pages from two different cgroups.

But I guess a simpler scheme would be to just check the first dirty page of an
inode, and if it does not belong to the memory cgroup of the task being
throttled, skip it.

It will not cover the case of shared file pages across memory cgroups, but it is
at least something relatively simple to begin with. Do you have more ideas
on how it can be handled better?

Vivek

Andrea Righi

Feb 22, 2010, 1:10:03 PM
On Mon, Feb 22, 2010 at 09:22:42AM +0900, KAMEZAWA Hiroyuki wrote:

[snip]

> > +static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
> > +{
> > + struct cgroup *cgrp = memcg->css.cgroup;
> > + unsigned long dirty_bytes;
> > +
> > + /* root ? */
> > + if (cgrp->parent == NULL)
> > + return vm_dirty_bytes;
>
> We have mem_cgroup_is_root() macro.
>
> > +
> > + spin_lock(&memcg->reclaim_param_lock);
> > + dirty_bytes = memcg->dirty_bytes;
> > + spin_unlock(&memcg->reclaim_param_lock);
> > +
> > + return dirty_bytes;
> > +}
> Hmm...do we need spinlock ? You use "unsigned long", then, read-write
> is always atomic if not read-modify-write.

I think I simply copy&paste the memcg->swappiness case. But I agree,
read-write should be atomic.
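i.e. the getter could become as simple as (sketch):

static unsigned long get_dirty_bytes(struct mem_cgroup *memcg)
{
	/* plain word-sized load, no reclaim_param_lock needed */
	if (mem_cgroup_is_root(memcg))
		return vm_dirty_bytes;

	return memcg->dirty_bytes;
}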

-Andrea

Andrea Righi

Feb 22, 2010, 1:20:01 PM
On Mon, Feb 22, 2010 at 09:27:45AM -0500, Vivek Goyal wrote:
> On Sun, Feb 21, 2010 at 04:18:43PM +0100, Andrea Righi wrote:
> > Control the maximum amount of dirty pages a cgroup can have at any given time.
> >
> > Per cgroup dirty limit is like fixing the max amount of dirty (hard to reclaim)
> > page cache used by any cgroup. So, in case of multiple cgroup writers, they
> > will not be able to consume more than their designated share of dirty pages and
> > will be forced to perform write-out if they cross that limit.
> >
> > The overall design is the following:
> >
> > - account dirty pages per cgroup
> > - limit the number of dirty pages via memory.dirty_bytes in cgroupfs
> > - start to write-out in balance_dirty_pages() when the cgroup or global limit
> > is exceeded
> >
> > This feature is supposed to be strictly connected to any underlying IO
> > controller implementation, so we can stop increasing dirty pages in VM layer
> > and enforce a write-out before any cgroup will consume the global amount of
> > dirty pages defined by the /proc/sys/vm/dirty_ratio|dirty_bytes limit.
> >
>
> Thanks Andrea. I had been thinking about looking into it from IO
> controller perspective so that we can control async IO (buffered writes
> also).
>
> Before I dive into patches, two quick things.
>
> - IIRC, last time you had implemented per memory cgroup "dirty_ratio" and
> not "dirty_bytes". Why this change? To begin with either per memcg
> configurable dirty ratio also makes sense? By default it can be the
> global dirty ratio for each cgroup.

I wouldn't like to add many different interfaces to do the same thing.
I'd prefer to choose just one interface and always use it. We just have
to decide which is the best one. IMHO dirty_bytes is more generic. If
we want to define the limit as a %, we can always do that in userspace.

>
> - Looks like we will start writeout from memory cgroup once we cross the
> dirty ratio, but still there is no gurantee that we be writting pages
> belonging to cgroup which crossed the dirty ratio and triggered the
> writeout.
>
> This behavior is not very good at least from IO controller perspective
> where if two dd threads are dirtying memory in two cgroups, then if
> one crosses it dirty ratio, it should perform writeouts of its own pages
> and not other cgroups pages. Otherwise we probably will again introduce
> serialization among two writers and will not see service differentation.

Right, but I'd prefer to start with a simple solution. Handling this
per page is too costly and not good for overall I/O for now. We can
always improve service differentiation and fairness in a second
step, I think.

>
> May be we can modify writeback_inodes_wbc() to check first dirty page
> of the inode. And if it does not belong to same memcg as the task who
> is performing balance_dirty_pages(), then skip that inode.
>
> This does not handle the problem of shared files where processes from
> two different cgroups are dirtying same file but it will atleast cover
> other cases without introducing too much of complexity?

Yes, if we want to take care of that, at least we could start with a
per-inode solution. It will probably introduce less overhead and will
work for most of the cases (except the case where multiple
cgroups write to the same file/inode).

Peter Zijlstra

Feb 22, 2010, 1:30:03 PM


This stuff looks really rather horrible.

Relying on these cgroup functions returning 0 seems fragile; some of
them can really be 0. Also, sprinkling all that if-cgroup foo all over
the place leads to these ugly indentation problems you have.

How about pulling all these things into separate functions, and using a
proper mem_cgroup_has_dirty() function to select on?
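Something like the following, for instance (sketch; mem_cgroup_has_dirty(),
memcg_dirtyable_memory() and global_dirtyable_memory() are made-up names
for the split-out pieces):

unsigned long determine_dirtyable_memory(void)
{
	/* all three helpers are hypothetical names for the split-out code */
	if (mem_cgroup_has_dirty())
		return memcg_dirtyable_memory();

	return global_dirtyable_memory();
}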

Vivek Goyal

Feb 22, 2010, 1:40:02 PM

dirty_ratio is easy to configure: one system wide default value works for
all newly created cgroups. For dirty_bytes, you will have to
configure each individual cgroup with a specific value depending on
the upper limit of memory for that cgroup.

Secondly, the memory cgroup kind of partitions the global memory resource per
cgroup. So as long as we have global dirty ratio knobs, it makes sense
to have a per cgroup dirty ratio knob also.

But I guess we can introduce that later and use the global dirty ratio for
all the memory cgroups (instead of each cgroup having a separate dirty
ratio). The only thing is that we need to enforce this dirty ratio on the
cgroup, and if I am reading the code correctly, your modification of
calculating available_memory() per cgroup should take care of that.

> >
> > - Looks like we will start writeout from memory cgroup once we cross the
> > dirty ratio, but still there is no gurantee that we be writting pages
> > belonging to cgroup which crossed the dirty ratio and triggered the
> > writeout.
> >
> > This behavior is not very good at least from IO controller perspective
> > where if two dd threads are dirtying memory in two cgroups, then if
> > one crosses it dirty ratio, it should perform writeouts of its own pages
> > and not other cgroups pages. Otherwise we probably will again introduce
> > serialization among two writers and will not see service differentation.
>
> Right, but I'd prefer to start with a simple solution. Handling this
> per-page is too costly and not good for entire I/O for now. We can
> always improve service differentiation and fairness in a second
> step, I think.
>
> >
> > May be we can modify writeback_inodes_wbc() to check first dirty page
> > of the inode. And if it does not belong to same memcg as the task who
> > is performing balance_dirty_pages(), then skip that inode.
> >
> > This does not handle the problem of shared files where processes from
> > two different cgroups are dirtying same file but it will atleast cover
> > other cases without introducing too much of complexity?
>
> Yes, if we want to take care ot that, at least we could start with a
> per-inode solution. It will probably introduce less overhead and will
> work the the most part of the cases (except the case when multiple
> cgroups write to the same file/inode).

Fair enough. In the first round we can take care of enforcing the dirty ratio per
cgroup and a configurable dirty_bytes per cgroup. Once that is in, we can
look into doing writeout from the inodes of the memory cgroup being throttled.

Thanks
Vivek

Vivek Goyal

Feb 22, 2010, 2:40:02 PM
On Mon, Feb 22, 2010 at 09:22:42AM +0900, KAMEZAWA Hiroyuki wrote:

[..]


> > +static int mem_cgroup_dirty_bytes_write(struct cgroup *cgrp, struct cftype *cft,
> > + u64 val)
> > +{
> > + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
> > + struct mem_cgroup *parent;
> > +
> > + if (cgrp->parent == NULL)
> > + return -EINVAL;
> > +
> > + parent = mem_cgroup_from_cont(cgrp->parent);
> > +
> > + cgroup_lock();
> > +
> > + /* If under hierarchy, only empty-root can set this value */
> > + if ((parent->use_hierarchy) ||
> > + (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
> > + cgroup_unlock();
> > + return -EINVAL;
> > + }
>
> Okay, then, only hierarchy root can set the value.
> Please descirbe this kind of important things in patch description.
>

Hi Andrea,

Why can only the root of the hierarchy set the dirty_bytes value? In this
case, will a child cgroup's amount of dirty pages be controlled by
dirty_ratio?

Thanks
Vivek

David Rientjes

Feb 22, 2010, 4:20:02 PM
On Mon, 22 Feb 2010, Vivek Goyal wrote:

> dirty_ratio is easy to configure. One system wide default value works for
> all the newly created cgroups. For dirty_bytes, you shall have to
> configure each and individual cgroup with a specific value depneding on
> what is the upper limit of memory for that cgroup.
>

Agreed, it makes sense for each memcg to have a dirty_ratio that defaults
to whatever vm_dirty_ratio does, and to export that constant via
linux/writeback.h. dirty_bytes would then use the same semantics as
globally, so that if it is set to 0, the finer granularity is disabled by
default and we use memcg->dirty_ratio instead.
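In other words, something like (sketch; memcg->dirty_ratio is the field
being proposed here and does not exist yet):

/* dirty_bytes wins when set, otherwise fall back to the per-memcg ratio */
static unsigned long memcg_dirty_limit(struct mem_cgroup *memcg,
				       unsigned long available_memory)
{
	if (memcg->dirty_bytes)
		return DIV_ROUND_UP(memcg->dirty_bytes, PAGE_SIZE);

	return (memcg->dirty_ratio * available_memory) / 100;
}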

> Secondly, memory cgroup kind of partitions global memory resource per
> cgroup. So if as long as we have global dirty ratio knobs, it makes sense
> to have per cgroup dirty ratio knob also.
>

It has a good default, too: whatever ratio of memory was allowed to
be dirty before the memcg limit was set is still allowed by default.

David Rientjes

Feb 22, 2010, 4:30:01 PM
On Mon, 22 Feb 2010, Andrea Righi wrote:

> > Hmm...do we need spinlock ? You use "unsigned long", then, read-write
> > is always atomic if not read-modify-write.
>
> I think I simply copy&paste the memcg->swappiness case. But I agree,
> read-write should be atomic.
>

We don't need memcg->reclaim_param_lock in get_swappiness() or
mem_cgroup_get_reclaim_priority().

KAMEZAWA Hiroyuki

Feb 22, 2010, 7:20:01 PM
On Mon, 22 Feb 2010 12:58:33 -0500
Vivek Goyal <vgo...@redhat.com> wrote:

> On Mon, Feb 22, 2010 at 11:06:40PM +0530, Balbir Singh wrote:
> > * Vivek Goyal <vgo...@redhat.com> [2010-02-22 09:27:45]:
> >
> >
> > >

> > > May be we can modify writeback_inodes_wbc() to check first dirty page
> > > of the inode. And if it does not belong to same memcg as the task who
> > > is performing balance_dirty_pages(), then skip that inode.
> >
> > Do you expect all pages of an inode to be paged in by the same cgroup?
>
> I guess at least in simple cases. Not sure whether it will cover majority
> of usage or not and up to what extent that matters.
>
> If we start doing background writeout, on per page (like memory reclaim),
> the it probably will be slower and hence flusing out pages sequentially
> from inode makes sense.
>
> At one point I was thinking, like pages, can we have an inode list per
> memory cgroup so that writeback logic can traverse that inode list to
> determine which inodes need to be cleaned. But associating inodes to
> memory cgroup is not very intutive at the same time, we again have the
> issue of shared file pages from two differnent cgroups.
>
> But I guess, a simpler scheme would be to just check first dirty page from
> inode and if it does not belong to memory cgroup of task being throttled,
> skip it.
>
> It will not cover the case of shared file pages across memory cgroups, but
> at least something relatively simple to begin with. Do you have more ideas
> on how it can be handeled better.
>

If pages are "shared", it's hard to find the _current_ owner. So what I'm
thinking of as a memcg update is a memcg-for-page-cache and page cache
migration between memcgs.

The idea is:
- At first, treat page cache as we do now.
- When a process touches page cache, check the process's memcg and the page cache's
memcg. If process-memcg != pagecache-memcg, we migrate it to a special
container, the memcg-for-page-cache.

Then,
- read-once page caches are handled by the local memcg.
- shared page caches are handled in the special memcg for "shared" pages.

But this will add significant overhead in native implementation.
(We may have to use page flags rather than page_cgroup's....)

I'm now wondering about:
- setting a "shared flag" on a page_cgroup when cached pages are accessed.
- sweeping them to the special memcg in another (kernel) daemon when we hit
some threshold or so.

But hmm, I'm not sure that memcg-for-shared-page-cache is acceptable
to anyone.

Thanks,
-Kame

Andrea Righi

Feb 23, 2010, 4:30:01 AM
On Mon, Feb 22, 2010 at 09:44:42PM +0530, Balbir Singh wrote:
[snip]

OK, I'll rebase the patch to -mm. Are those changes already included in
mmotm?

Thanks,
-Andrea

Andrea Righi

Feb 23, 2010, 4:30:01 AM

Right. I like it. We can extend this function or provide separate
functions to account each stat.

Thanks!
-Andrea

Andrea Righi

Feb 23, 2010, 4:50:01 AM

Agreed. Will do in the next version of the patch.

Thanks,
-Andrea

Andrea Righi

Feb 23, 2010, 4:50:02 AM
On Mon, Feb 22, 2010 at 11:52:15AM -0500, Vivek Goyal wrote:
> > unsigned long determine_dirtyable_memory(void)
> > {
> > - unsigned long x;
> > -
> > - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > -
> > + unsigned long memcg_memory, memory;
> > +
> > + memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> > + memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> > + if (memcg_memory > 0) {
>
> it could be just
>
> if (memcg_memory) {

Agreed.

> }
>
> > + memcg_memory +=
> > + mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> > + if (memcg_memory < memory)
> > + return memcg_memory;
> > + }
> > if (!vm_highmem_is_dirtyable)
> > - x -= highmem_dirtyable_memory(x);
> > + memory -= highmem_dirtyable_memory(memory);
> >
>
> If vm_highmem_is_dirtyable=0, we can still return
> "memcg_memory", which can be more than "memory". IOW, highmem is not
> dirtyable system-wide, but we can still potentially return, saying that
> this cgroup can dirty more pages, which can actually
> be more than what is allowed system-wide?
>
> Because you have modified dirtyable_memory() and made it per cgroup, I
> think it automatically takes care of the cases of per cgroup dirty ratio,
> I mentioned in my previous mail. So we will use system wide dirty ratio
> to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> available_memory()) and if this cgroup wrote too many pages start
> writeout?

OK, if I've understood well, you're proposing to use per-cgroup
dirty_ratio interface and do something like:

unsigned long determine_dirtyable_memory(void)
{
	unsigned long memcg_memory, memory;

	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
	if (!vm_highmem_is_dirtyable)
		memory -= highmem_dirtyable_memory(memory);

	memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
	if (!memcg_memory)
		return memory + 1;	/* Ensure that we never return 0 */
	memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
	if (!vm_highmem_is_dirtyable)
		memcg_memory -= highmem_dirtyable_memory(memory) *
				mem_cgroup_dirty_ratio() / 100;
	if (memcg_memory < memory)
		return memcg_memory;
}

No, this is a bug, indeed. The right check is just "if (dirty)".

Thanks!
-Andrea

Andrea Righi

unread,
Feb 23, 2010, 4:50:02 AM2/23/10
to

ok, this is wrong:

> if (memcg_memory < memory)
> return memcg_memory;
> }

return min(memcg_memory, memory);
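
Putting the corrections together, the whole helper would then look roughly like
this (still only a sketch; mem_cgroup_page_state() and mem_cgroup_dirty_ratio()
are the interfaces being discussed in this thread, not mainline ones):

unsigned long determine_dirtyable_memory(void)
{
	unsigned long memcg_memory, memory;

	memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
	if (!vm_highmem_is_dirtyable)
		memory -= highmem_dirtyable_memory(memory);

	memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
	if (!memcg_memory)
		return memory + 1;	/* Ensure that we never return 0 */
	memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
	if (!vm_highmem_is_dirtyable)
		memcg_memory -= highmem_dirtyable_memory(memory) *
				mem_cgroup_dirty_ratio() / 100;

	/* never allow a cgroup to dirty more than the system-wide figure */
	return min(memcg_memory, memory);
}

The final min() also covers the vm_highmem_is_dirtyable=0 case raised above.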

Andrea Righi

unread,
Feb 23, 2010, 5:00:02 AM2/23/10
to
On Mon, Feb 22, 2010 at 01:29:34PM -0500, Vivek Goyal wrote:
> > I wouldn't like to add many different interfaces to do the same thing.
> > I'd prefer to choose just one interface and always use it. We just have
> > to define which is the best one. IMHO dirty_bytes is more generic. If
> > we want to define the limit as a % we can always do that in userspace.
> >
>
> dirty_ratio is easy to configure. One system-wide default value works for
> all the newly created cgroups. For dirty_bytes, you will have to
> configure each individual cgroup with a specific value depending on
> what the upper limit of memory for that cgroup is.

OK.

>
> Secondly, the memory cgroup kind of partitions the global memory resource per
> cgroup. So, as long as we have global dirty ratio knobs, it makes sense
> to have a per-cgroup dirty ratio knob also.
>
> But I guess we can introduce that later and use the global dirty ratio for
> all the memory cgroups (instead of each cgroup having a separate dirty
> ratio). The only thing is that we need to enforce this dirty ratio on the
> cgroup and, if I am reading the code correctly, your modifications to the
> available_memory() calculation per cgroup should take care of that.

At the moment (with dirty_bytes), if the cgroup has dirty_bytes == 0, it
simply uses the system-wide available_memory(), ignoring the memory
upper limit for that cgroup, and falls back to the current behaviour.

With dirty_ratio, should we change the code to *always* apply this
percentage to the cgroup memory upper limit, and automatically set it
equal to the global dirty_ratio by default when the cgroup is created?
mmmh... I vote yes.
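
Just to make the rule concrete, a sketch of the selection (the
memcg->dirty_bytes / memcg->dirty_ratio fields and the
mem_cgroup_usage_limit() helper are hypothetical names used for
illustration only):

static unsigned long memcg_dirty_limit_pages(struct mem_cgroup *memcg)
{
	u64 limit_bytes = mem_cgroup_usage_limit(memcg);

	/* an explicit dirty_bytes always wins */
	if (memcg->dirty_bytes)
		return memcg->dirty_bytes / PAGE_SIZE;
	/*
	 * Otherwise always apply the cgroup's dirty_ratio (initialized
	 * from the global vm_dirty_ratio) to the cgroup's memory upper
	 * limit, instead of falling back to the global available_memory().
	 */
	return memcg->dirty_ratio * (limit_bytes / PAGE_SIZE) / 100;
}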

-Andrea

Andrea Righi

unread,
Feb 23, 2010, 5:00:02 AM2/23/10
to

I'm rewriting the patch to correctly handle hierarchy. The hierarchy
design is completely broken in this patch.

-Andrea

Andrea Righi

unread,
Feb 23, 2010, 7:00:01 AM2/23/10
to
On Mon, Feb 22, 2010 at 01:07:32PM -0500, Vivek Goyal wrote:
> > > > +unsigned long mem_cgroup_dirty_bytes(void)
> > > > +{
> > > > + struct mem_cgroup *memcg;
> > > > + unsigned long dirty_bytes;
> > > > +
> > > > + if (mem_cgroup_disabled())
> > > > + return vm_dirty_bytes;
> > > > +
> > > > + rcu_read_lock();
> > > > + memcg = mem_cgroup_from_task(current);
> > > > + if (memcg == NULL)
> > > > + dirty_bytes = vm_dirty_bytes;
> > > > + else
> > > > + dirty_bytes = get_dirty_bytes(memcg);
> > > > + rcu_read_unlock();
> > >
> > > The rcu_read_lock() isn't protecting anything here.
> >
> > Right!
>
> Are we not protecting "memcg" pointer using rcu here?

Vivek, you are right:

mem_cgroup_from_task() -> task_subsys_state() -> rcu_dereference()

So, this *must* be RCU protected.

Thanks!
-Andrea

Vivek Goyal

unread,
Feb 23, 2010, 10:20:02 AM2/23/10
to

Is it not the case that the task that touched the page first is the owner of
the page, and that task's memcg is charged for the page, while subsequent shared
users of the page get a free ride?

If yes, why is it hard to find the _current_ owner? Will it not be the memory
cgroup which brought the page into existence?



> Then, what I'm
> thinking of as memcg's update is a memcg-for-page-cache and page cache
> migration between memcgs.
>
> The idea is:
> - At first, treat page cache as we do now.
> - When a process touches page cache, check the process's memcg and the page
>   cache's memcg. If process-memcg != pagecache-memcg, we migrate the page to a
>   special container, memcg-for-page-cache.
>
> Then,
> - read-once page caches are handled by the local memcg.
> - shared page caches are handled in a special memcg for "shared" pages.
>
> But this will add significant overhead in a native implementation.
> (We may have to use page flags rather than page_cgroup's....)
>
> I'm now wondering about
> - setting a "shared" flag on a page_cgroup when cached pages are accessed.
> - sweeping them to the special memcg in another (kernel) daemon when we hit
>   a threshold or so.
>
> But hmm, I'm not sure that memcg-for-shared-page-cache is acceptable
> to anyone.

I have not understood the idea well, hence a few queries/thoughts.

- You seem to be suggesting that shared page caches can be accounted
  separately within memcg. But one page still needs to be associated
  with one specific memcg, and one can only do migration across memcgs
  based on some policy about who used how much. But we are probably trying
  to be too accurate there, and it might not be needed.

  Can you elaborate a little more on what you meant by migrating pages
  to the special container memcg-for-page-cache? Is it a shared container
  across the memory cgroups which are sharing a page?

- The current writeback mechanism flushes on a per-inode basis. I think the
  biggest advantage is faster writeout speed, as contiguous pages
  are dispatched to disk (irrespective of the memory cgroup different
  pages belong to), resulting in better merging and fewer seeks.

  Even if we can account shared pages well across memory cgroups, flushing
  these pages to disk will probably become complicated/slow if we start going
  through the pages of a memory cgroup and flushing them out upon
  hitting the dirty_background/dirty_ratio/dirty_bytes limits.

Thanks
Vivek

Vivek Goyal

unread,
Feb 23, 2010, 3:00:03 PM2/23/10
to
On Tue, Feb 23, 2010 at 10:40:40AM +0100, Andrea Righi wrote:

I think we can use system wide dirty_ratio for per cgroup (instead of
providing configurable dirty_ratio for each cgroup where each memory
cgroup can have different dirty ratio. Can't think of a use case
immediately).


>
> unsigned long determine_dirtyable_memory(void)
> {
> unsigned long memcg_memory, memory;
>
> memory = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
> if (!vm_highmem_is_dirtyable)
> memory -= highmem_dirtyable_memory(memory);
>
> memcg_memory = mem_cgroup_page_state(MEMCG_NR_FREE_PAGES);
> if (!memcg_memory)
> return memory + 1; /* Ensure that we never return 0 */
> memcg_memory += mem_cgroup_page_state(MEMCG_NR_RECLAIMABLE_PAGES);
> if (!vm_highmem_is_dirtyable)
> memcg_memory -= highmem_dirtyable_memory(memory) *
> mem_cgroup_dirty_ratio() / 100;
> if (memcg_memory < memory)
> return memcg_memory;
> }
>

This one is tricky and I don't have good answers. I have concerns though.

- While calculating system-wide dirtyable memory, we rely on the memory actually
  available (NR_FREE_PAGES + reclaimable_pages). In the case of per-memory-cgroup
  free pages, we are relying not necessarily on the actually available dirtyable
  memory but on a user-configurable limit (LIMIT - USAGE = cgroup_dirtyable_memory).

  This is good as long as the total sum of the limits of all cgroups is not more
  than the available memory. But if somebody sets the "limit" to a high value,
  we will allow lots of writes from that cgroup without throttling.

  So if the memory cgroups are not configured right, so that the total of the
  limits represents the actual memory in the system, then we might end up having
  a lot more dirty pages in the system.

- Subtracting highmem pages from dirtyable memory is tricky, because how do we
  account for it in the per-cgroup calculation? Maybe we can just do the
  following:

	calculate_memcg_memory;
	memory = memory - highmem_dirtyable_memory();

	if (memcg_memory < memory)
		return memcg_memory;

  Not sure. This is very crude and leaves room for more pages being dirty than
  there otherwise would have been. Ideas?

Vivek

Vivek Goyal

unread,
Feb 23, 2010, 3:10:02 PM2/23/10
to

Yes, inheriting the global dirty ratio as the per-cgroup ratio makes sense. The only
thing different here is that we apply dirty_ratio to the cgroup memory limit
and not to the actual available memory. So if one configures a cgroup with a total
limit higher than the available memory, we could end up with a lot more dirty pages
in the system.

But one can then argue that that is a wrong system configuration and the admin should
take care of it while configuring the system. :-)

So, I would vote yes.

Thanks
Vivek

Vivek Goyal

unread,
Feb 23, 2010, 4:40:01 PM2/23/10
to
On Sun, Feb 21, 2010 at 04:18:45PM +0100, Andrea Righi wrote:

[..]


> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0b19943..c9ff1cd 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -137,10 +137,11 @@ static struct prop_descriptor vm_dirties;
>   */
>  static int calc_period_shift(void)
>  {
> -	unsigned long dirty_total;
> +	unsigned long dirty_total, dirty_bytes;
>
> -	if (vm_dirty_bytes)
> -		dirty_total = vm_dirty_bytes / PAGE_SIZE;
> +	dirty_bytes = mem_cgroup_dirty_bytes();
> +	if (dirty_bytes)
> +		dirty_total = dirty_bytes / PAGE_SIZE;
>  	else
>  		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
>  				100;
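
For readability, here is the modified helper from the hunk above written out in
full (the final return statement is the surrounding mainline code of that era,
unchanged by the patch):

static int calc_period_shift(void)
{
	unsigned long dirty_total, dirty_bytes;

	dirty_bytes = mem_cgroup_dirty_bytes();
	if (dirty_bytes)
		dirty_total = dirty_bytes / PAGE_SIZE;
	else
		dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
				100;
	return 2 + ilog2(dirty_total - 1);
}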

OK, I don't understand this, so I'd better ask. Can you explain a bit how the
memory cgroup dirty ratio is going to play with the per-BDI dirty proportion?

Currently we seem to be calculating the per-BDI proportion (based on recently
completed events) of the system-wide dirty ratio and deciding whether a process
should be throttled or not.

Because the throttling decision is also based on the BDI and its proportion, how
are we going to fit it with the memory cgroup? Is it going to be the BDI proportion
of dirty memory within the memory cgroup (and not system-wide)?

Thanks
Vivek

David Rientjes

unread,
Feb 23, 2010, 5:30:02 PM2/23/10
to
On Tue, 23 Feb 2010, Vivek Goyal wrote:

> > > Because you have modified dirtyable_memory() and made it per cgroup, I
> > > think it automatically takes care of the cases of per cgroup dirty ratio,
> > > I mentioned in my previous mail. So we will use system wide dirty ratio
> > > to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> > > available_memory()) and if this cgroup wrote too many pages start
> > > writeout?
> >
> > OK, if I've understood well, you're proposing to use per-cgroup
> > dirty_ratio interface and do something like:
>
> I think we can use system wide dirty_ratio for per cgroup (instead of
> providing configurable dirty_ratio for each cgroup where each memory
> cgroup can have different dirty ratio. Can't think of a use case
> immediately).

I think each memcg should have both dirty_bytes and dirty_ratio,
dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from
the global vm_dirty_ratio. Changing vm_dirty_ratio would not change
memcgs already using their own dirty_ratio, but new memcgs would get the
new value by default. The ratio would act over the amount of available
memory to the cgroup as though it were its own "virtual system" operating
with a subset of the system's RAM and the same global ratio.
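
A minimal sketch of that defaulting rule, assuming the memcg grows
dirty_bytes/dirty_ratio fields that are initialized at cgroup creation time:

static void memcg_dirty_param_init(struct mem_cgroup *memcg)
{
	memcg->dirty_bytes = 0;			/* disabled by default */
	memcg->dirty_ratio = vm_dirty_ratio;	/* snapshot of the global default */
	/*
	 * Later changes to the global vm_dirty_ratio are not propagated
	 * to existing memcgs; only newly created ones pick up the new
	 * default.
	 */
}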

KAMEZAWA Hiroyuki

unread,
Feb 23, 2010, 7:20:02 PM2/23/10
to

yes.

-Kame

KAMEZAWA Hiroyuki

unread,
Feb 23, 2010, 7:30:01 PM2/23/10
to
On Tue, 23 Feb 2010 10:12:01 -0500
Vivek Goyal <vgo...@redhat.com> wrote:

yes.

>
> If yes, why is it hard to find the _current_ owner? Will it not be the memory
> cgroup which brought the page into existence?
>

Considering an extreme case, a memcg's dirty ratio can be filled by
free riders.

Assume cgroups A, B, and Share:

/A
/B
/Share

- Pages touched by both A and B are moved to Share.

Then libc etc. will be moved to Share.
As far as I remember, Solaris has a similar concept of partitioning.



> - The current writeback mechanism flushes on a per-inode basis. I think the
>   biggest advantage is faster writeout speed, as contiguous pages
>   are dispatched to disk (irrespective of the memory cgroup different
>   pages belong to), resulting in better merging and fewer seeks.
>
>   Even if we can account shared pages well across memory cgroups, flushing
>   these pages to disk will probably become complicated/slow if we start going
>   through the pages of a memory cgroup and flushing them out upon
>   hitting the dirty_background/dirty_ratio/dirty_bytes limits.
>

My bad for bringing this idea up in this thread. I noticed my motivation is
not related to dirty_ratio. Please ignore it.

Thanks,
-Kame

Andrea Righi

unread,
Feb 25, 2010, 9:40:01 AM2/25/10
to
On Tue, Feb 23, 2010 at 02:22:12PM -0800, David Rientjes wrote:
> On Tue, 23 Feb 2010, Vivek Goyal wrote:
>
> > > > Because you have modified dirtyable_memory() and made it per cgroup, I
> > > > think it automatically takes care of the cases of per cgroup dirty ratio,
> > > > I mentioned in my previous mail. So we will use system wide dirty ratio
> > > > to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> > > > available_memory()) and if this cgroup wrote too many pages start
> > > > writeout?
> > >
> > > OK, if I've understood well, you're proposing to use per-cgroup
> > > dirty_ratio interface and do something like:
> >
> > I think we can use system wide dirty_ratio for per cgroup (instead of
> > providing configurable dirty_ratio for each cgroup where each memory
> > cgroup can have different dirty ratio. Can't think of a use case
> > immediately).
>
> I think each memcg should have both dirty_bytes and dirty_ratio,
> dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from
> the global vm_dirty_ratio. Changing vm_dirty_ratio would not change
> memcgs already using their own dirty_ratio, but new memcgs would get the
> new value by default. The ratio would act over the amount of available
> memory to the cgroup as though it were its own "virtual system" operating
> with a subset of the system's RAM and the same global ratio.

Agreed.

-Andrea

Andrea Righi

unread,
Feb 25, 2010, 10:20:01 AM2/25/10
to

IMHO we need to calculate the BDI dirty threshold as a function of the
cgroup's dirty memory, and keep the BDI statistics system-wide.

So, if a task is generating writes, the threshold at which it starts
writeback itself will be calculated as a function of the cgroup's dirty
memory. If the BDI dirty memory is greater than this threshold, the task
must start to write back dirty pages until it reaches the expected dirty
limit.

OK, in this way a cgroup with a small dirty limit may be forced to
writeback a lot of pages dirtied by other cgroups on the same device.
But this is always related to the fact that tasks are forced to
writeback dirty inodes randomly, and not the inodes they've actually
dirtied.
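
Roughly, the throttling check I have in mind would look like this
(memcg_dirty_limit() and memcg_bdi_dirty_limit() are hypothetical helpers; the
latter stands in for the per-BDI share computation currently done in
get_dirty_limits(), and the BDI counters stay system-wide):

static bool memcg_over_bdi_dirty_limit(struct backing_dev_info *bdi)
{
	unsigned long dirty_thresh = memcg_dirty_limit(current);
	unsigned long bdi_thresh = memcg_bdi_dirty_limit(bdi, dirty_thresh);
	unsigned long bdi_dirty = bdi_stat(bdi, BDI_RECLAIMABLE) +
				  bdi_stat(bdi, BDI_WRITEBACK);

	/* system-wide BDI usage checked against a cgroup-derived threshold */
	return bdi_dirty > bdi_thresh;
}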

-Andrea

Minchan Kim

unread,
Feb 25, 2010, 10:40:02 AM2/25/10
to
Hi

On Tue, Feb 23, 2010 at 8:58 PM, Andrea Righi <ari...@develer.com> wrote:
> On Mon, Feb 22, 2010 at 01:07:32PM -0500, Vivek Goyal wrote:
>> > > > +unsigned long mem_cgroup_dirty_bytes(void)
>> > > > +{
>> > > > +       struct mem_cgroup *memcg;
>> > > > +       unsigned long dirty_bytes;
>> > > > +
>> > > > +       if (mem_cgroup_disabled())
>> > > > +               return vm_dirty_bytes;
>> > > > +
>> > > > +       rcu_read_lock();
>> > > > +       memcg = mem_cgroup_from_task(current);
>> > > > +       if (memcg == NULL)
>> > > > +               dirty_bytes = vm_dirty_bytes;
>> > > > +       else
>> > > > +               dirty_bytes = get_dirty_bytes(memcg);
>> > > > +       rcu_read_unlock();
>> > >
>> > > The rcu_read_lock() isn't protecting anything here.
>> >
>> > Right!
>>
>> Are we not protecting "memcg" pointer using rcu here?
>
> Vivek, you are right:
>
>  mem_cgroup_from_task() -> task_subsys_state() -> rcu_dereference()
>
> So, this *must* be RCU protected.

So, doesn't mem_cgroup_from_task() in mem_cgroup_can_attach() need RCU, too?


--
Kind regards,
Minchan Kim

KAMEZAWA Hiroyuki

unread,
Feb 25, 2010, 7:20:02 PM2/25/10
to
On Thu, 25 Feb 2010 15:34:44 +0100
Andrea Righi <ari...@develer.com> wrote:

> On Tue, Feb 23, 2010 at 02:22:12PM -0800, David Rientjes wrote:
> > On Tue, 23 Feb 2010, Vivek Goyal wrote:
> >
> > > > > Because you have modified dirtyable_memory() and made it per cgroup, I
> > > > > think it automatically takes care of the cases of per cgroup dirty ratio,
> > > > > I mentioned in my previous mail. So we will use system wide dirty ratio
> > > > > to calculate the allowed dirty pages in this cgroup (dirty_ratio *
> > > > > available_memory()) and if this cgroup wrote too many pages start
> > > > > writeout?
> > > >
> > > > OK, if I've understood well, you're proposing to use per-cgroup
> > > > dirty_ratio interface and do something like:
> > >
> > > I think we can use system wide dirty_ratio for per cgroup (instead of
> > > providing configurable dirty_ratio for each cgroup where each memory
> > > cgroup can have different dirty ratio. Can't think of a use case
> > > immediately).
> >
> > I think each memcg should have both dirty_bytes and dirty_ratio,
> > dirty_bytes defaults to 0 (disabled) while dirty_ratio is inherited from
> > the global vm_dirty_ratio. Changing vm_dirty_ratio would not change
> > memcgs already using their own dirty_ratio, but new memcgs would get the
> > new value by default. The ratio would act over the amount of available
> > memory to the cgroup as though it were its own "virtual system" operating
> > with a subset of the system's RAM and the same global ratio.
>
> Agreed.
>

BTW, please add background_dirty_ratio in the same series of patches
(or something else to kick background writeback in a proper manner).

If not, we can't kick background writeback until we're caught by dirty_ratio.
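
Something along these lines is what I mean (memcg_dirty_pages() and
memcg_background_thresh() are hypothetical helpers, and the exact flusher-wakeup
call depends on the kernel version):

/*
 * Sketch of a per-memcg background kick in balance_dirty_pages(): wake
 * the flusher as soon as the cgroup crosses its background threshold,
 * instead of waiting until it hits the hard dirty limit.
 */
static void memcg_kick_background_writeback(struct backing_dev_info *bdi)
{
	if (memcg_dirty_pages(current) > memcg_background_thresh(current))
		bdi_start_writeback(bdi, NULL, 0);
}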

Thanks,
-Kame

KAMEZAWA Hiroyuki

unread,
Feb 25, 2010, 7:30:02 PM2/25/10
to

Hm ? I don't read the whole thread but can_attach() is called under
cgroup_mutex(). So, it doesn't need to use RCU.

Thanks,
-Kame

Minchan Kim

unread,
Feb 26, 2010, 12:00:01 AM2/26/10
to
Hi, Kame.

Vivek mentioned that memcg is protected by RCU, if I understand his intention right.
So I commented on that without enough knowledge of memcg.
After your comment, I dove into the code.

Just out of curiosity:

Is memcg really protected by RCU?
I think most of the RCU around memcg is for protecting task_struct and
cgroup_subsys_state.
The memcg itself is protected by cgroup_mutex, as you mentioned.
Am I missing something?

> Thanks,
> -Kame
>
>
>

--
Kind regards,
Minchan Kim

KAMEZAWA Hiroyuki

unread,
Feb 26, 2010, 12:10:02 AM2/26/10
to
Hi,

On Fri, 26 Feb 2010 13:50:04 +0900
Minchan Kim <minch...@gmail.com> wrote:

> > Hm ? I don't read the whole thread but can_attach() is called under
> > cgroup_mutex(). So, it doesn't need to use RCU.
>
> Vivek mentioned that memcg is protected by RCU, if I understand his intention right.
> So I commented on that without enough knowledge of memcg.
> After your comment, I dove into the code.
>
> Just out of curiosity:
>
> Is memcg really protected by RCU?

Yes. All cgroup subsystems are protected by RCU.

> I think most of the RCU around memcg is for protecting task_struct and
> cgroup_subsys_state.
> The memcg itself is protected by cgroup_mutex, as you mentioned.
> Am I missing something?

There are several levels of protection.

A cgroup subsystem's ->destroy() callback is finally called like this:

	synchronize_rcu();

	mutex_lock(&cgroup_mutex);
	/*
	 * Release the subsystem state objects.
	 */
	for_each_subsys(cgrp->root, ss)
		ss->destroy(ss, cgrp);

	cgrp->root->number_of_cgroups--;
	mutex_unlock(&cgroup_mutex);

Before here,
- there are no tasks under this cgroup (cgroup's refcnt is 0)
&& cgroup is marked as REMOVED.

Then, this access
rcu_read_lock();
mem = mem_cgroup_from_task(task);
if (css_tryget(mem->css)) <===============checks cgroup refcnt
....
rcu_read_unlock()
is O.K.

And it's guaranteed that we don't have to do complicated fine-grained checks
if cgroup_mutex() is held.

Because cgroup_mutex() is a system-wide heavy lock, this refcnt+RCU trick is
used and works quite well.

Thanks,
-Kame

Minchan Kim

unread,
Feb 26, 2010, 1:00:02 AM2/26/10
to

If it is, do we always need css_tryget() after mem_cgroup_from_task()
without cgroup_mutex to make sure the css is valid?

But I found several cases that don't call css_tryget

1. mm_match_cgroup
It's used by page_referenced_xxx(), so I think we don't grab
cgroup_mutex at that time.

2. mem_cgroup_oom_called
I think in here we don't grab cgroup_mutex, too.

I guess some design would cover those problems.
Could you tell me if you don't mind?
Sorry for bothering you.

Thanks, Kame.

--
Kind regards,
Minchan Kim

KAMEZAWA Hiroyuki

unread,
Feb 26, 2010, 1:20:02 AM2/26/10
to
On Fri, 26 Feb 2010 14:53:39 +0900
Minchan Kim <minch...@gmail.com> wrote:

On a case-by-case basis.

> But I found several cases that don't call css_tryget
>
> 1. mm_match_cgroup
> It's used by page_referenced_xxx(), so I think we don't grab
> cgroup_mutex at that time.
>

Yes, but all checks are done under RCU, and this function never
accesses the contents of the memory the pointer holds.
And please consider the whole context.

mem_cgroup_try_charge()
    mem_cont = ..... (refcnt +1)
    ....
    try_to_free_mem_cgroup_pages(mem_cont)
        ....
        shrink_xxx_list()
            ....
            page_referenced_anon(page, mem_cont)
                mm_match_cgroup(mm, mem_cont)

Then, this mem_cont (the 2nd argument to mm_match_cgroup()) is always valid:

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	rcu_read_unlock();
	return memcg != mem_cont;

Here:
1. mem_cont is never reused (because of the refcnt +1).
2. we don't access the memcg's contents.

I think even the rcu_read_lock()/unlock() is unnecessary.

> 2. mem_cgroup_oom_called
> I think in here we don't grab cgroup_mutex, too.
>

In the OOM-killer context, the memcg which causes the OOM has refcnt +1.
So it's not necessary there either.


> I guess some design would cover those problems.

Maybe.

> Could you tell me if you don't mind?
> Sorry for bothering you.
>

From my point of view, the most serious problem is the heavy cost of
css_tryget() when you run a heavily multi-threaded program.
So I want to see some innovation, but I haven't found any yet.

I admit this RCU+refcnt scheme tends to be hard to review. But taking a
refcnt is a costly operation, and it's worth handling it in the way that
best fits each piece of logic, on a case-by-case basis for now.

Thanks,
-Kame

Minchan Kim

unread,
Feb 26, 2010, 1:40:01 AM2/26/10
to
On Fri, Feb 26, 2010 at 3:15 PM, KAMEZAWA Hiroyuki

Thanks for always kind explanation, Kame.

> Thanks,
> -Kame
>
>
>

--
Kind regards,
Minchan Kim

Vivek Goyal

unread,
Feb 26, 2010, 5:00:03 PM2/26/10
to

OK, so calculate dirty pages per cgroup and calculate the BDI's proportion from
the cgroup's dirty pages? So will you be keeping track of vm_completion events per
cgroup, or will you rely on the existing system-wide and per-BDI completion events
to calculate the BDI proportion?

The BDI proportion is more an indication of device speed, and a faster device
gets a higher share of dirty pages, so maybe we don't have to keep track of
completion events per cgroup and can rely on system-wide completion events
for calculating the proportion of a BDI.

> OK, in this way a cgroup with a small dirty limit may be forced to
> writeback a lot of pages dirtied by other cgroups on the same device.
> But this is always related to the fact that tasks are forced to
> writeback dirty inodes randomly, and not the inodes they've actually
> dirtied.

So we are left with the following two issues.

- Should we rely on global BDI stats for BDI_RECLAIMABLE and BDI_WRITEBACK,
  or do we need to make these per-cgroup to determine how many pages have
  actually been dirtied by a cgroup and force writeouts accordingly?

- Once we decide to throttle a cgroup, it should write its own inodes and
  should not be serialized behind other cgroups' inodes.

If we don't tackle the above two issues, I am not sure what problem will be solved
by the patch set. The only thing I can see is that we will be doing write-outs
much more aggressively once some memory cgroups have been created. (Smaller
per-cgroup dirty will lead to a smaller per-BDI dirty threshold and, when compared
with the overall BDI stats, it should lead to more writeouts.)

	if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
		break;

Because the bdi_thresh calculation will be based on per-cgroup dirty while
bdi_nr_reclaimable and bdi_nr_writeback will be system-wide, we will be
doing much more aggressive writeouts.

But we will not achieve parallel writeback paths, so it probably will not help the IO
controller a lot.

Kame-san, is it a problem with current memory cgroups that writeback is
not happening very actively, and you run into a situation where there are too
many dirty pages in a cgroup and reclaim can take a long time?

Thanks
Vivek

Andrea Righi

unread,
Feb 26, 2010, 5:30:01 PM2/26/10
to

We could try to record who made the inode dirty
(inode->cgroup_that_made_inode_dirty) so that during active
writeback each cgroup can be forced to write only its own inodes.

-Andrea

Vivek Goyal

unread,
Feb 26, 2010, 5:40:02 PM2/26/10
to

Yes, but that would require storing a reference to the memcg and would become a
little complicated.

I was thinking of just matching the cgroup of the task being throttled against the
memcg of the first dirty page in the inode. So we could possibly implement
something like the following in the memory controller:

bool memcg_task_inode_cgroup_match(inode)

and this function would retrieve the first dirty page and compare its cgroup
with the task's memory cgroup. No hassle of storing a pointer and hence a
reference to the memcg.

Well, we could store the css_id, with no need to keep a reference to the
memcg. But I guess not storing anything in the inode would be simpler.
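
A sketch of what that helper could look like (first_dirty_page() and
page_to_memcg() are stand-ins for a PAGECACHE_TAG_DIRTY lookup and for the
page_cgroup lookup, respectively; both are assumptions):

bool memcg_task_inode_cgroup_match(struct inode *inode)
{
	struct page *page = first_dirty_page(inode->i_mapping);
	struct mem_cgroup *task_memcg;
	bool match;

	if (!page)
		return false;

	rcu_read_lock();
	task_memcg = mem_cgroup_from_task(current);
	match = (page_to_memcg(page) == task_memcg);
	rcu_read_unlock();

	return match;
}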

Thanks
Vivek

KAMEZAWA Hiroyuki

unread,
Feb 28, 2010, 8:00:02 PM2/28/10
to

Hmm, it's not the same situation as in global memory management, but we have a similar one.

In memcg we just count the user's pages, so the "hard to reclaim" situation doesn't happen.
But "reclaim is slower than expected" is a usual problem.

When you try
% dd if=/dev/zero of=./tmpfile .....
under a proper memcg limitation, you'll find dd is very slow.
We know background writeback helps this situation. We need to kick background
writeback.

Thanks,
-Kame
