
[BUGFIX][PATCH] memcg: fix oom kill behavior.


KAMEZAWA Hiroyuki

Mar 1, 2010, 10:10:02 PM
Brief Summary (for Andrew)

- Nishimura reported that my fix from one year ago,
  a636b327f731143ccc544b966cfd8de6cb6d72c6,
  doesn't work well in some extreme situations.

- David Rientjes said mem_cgroup_oom_called() is completely
  ugly and broken, and he is trying to remove it in his patch set.

Then, I wrote this as a bugfix on top of mmotm. This patch implements
- a per-memcg OOM lock, modeled on the per-zone OOM lock
- avoiding a return of -ENOMEM via memcg's page-fault path;
  -ENOMEM causes an unnecessary page_fault_out_of_memory().
  (Even if memcg hangs, there is no change from the current behavior.)
- in addition to MEMDIE threads, KILLED processes bypass the memcg charge.

I'd be glad if this could go in on the 2.6.34 timeline (as a bugfix), but I'm
afraid it may be too big for a bugfix...

My plans for 2.6.35 are
- an oom-notifier for memcg (based on the memcg threshold notifier)
- an oom-freezer (disable oom-kill) for memcg
- better handling of extreme situations.
Andrea Righi is also working on dirty_ratio for memcg. We'll have
something better in the 2.6.35 kernels.

This patch will conflict (hunk) with David's set. If it collides in mmotm,
I'll rework it.

Tested on x86-64. Nishimura-san, could you test?

==
From: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>

In current page-fault code,

handle_mm_fault()
-> ...
-> mem_cgroup_charge()
-> map page or handle error.
-> check return code.

If the page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory()
is called. But if the fault was caused by memcg, the OOM killer has already
been invoked.
Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6

That patch records last_oom_jiffies for memcg's sub-hierarchy and
prevents page_fault_out_of_memory() from being invoked in the near future.

But Nishimura-san reported that the jiffies check is not enough
when the system is under terribly heavy load.

This patch changes memcg's oom logic as follows:
* If memcg causes an OOM-kill, continue to retry.
* Remove the jiffies check which is used now.
* Add a memcg-oom-lock which works like the per-zone oom lock.
* If current has been killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the
fundamental things.
TODO:
- add an oom notifier
- add a per-memcg disable-oom-kill flag and a freezer at oom.
- more chances to wake up oom waiters (when changing the memory limit, etc.)

Changelog;
- fixed per-memcg oom lock.

Signed-off-by: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
---
include/linux/memcontrol.h | 6 --
mm/memcontrol.c | 109 +++++++++++++++++++++++++++++++++------------
mm/oom_kill.c | 8 ---
3 files changed, 82 insertions(+), 41 deletions(-)

Index: mmotm-2.6.33-Feb11/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.33-Feb11.orig/include/linux/memcontrol.h
+++ mmotm-2.6.33-Feb11/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
return false;
}

-extern bool mem_cgroup_oom_called(struct task_struct *task);
void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
return true;
}

-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
- return false;
-}
-
static inline int
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
Index: mmotm-2.6.33-Feb11/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Feb11/mm/memcontrol.c
@@ -200,7 +200,7 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long last_oom_jiffies;
+ atomic_t oom_lock;
atomic_t refcnt;

unsigned int swappiness;
@@ -1234,32 +1234,77 @@ static int mem_cgroup_hierarchical_recla
return total;
}

-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
+ int *val = (int *)data;
+ int x;

- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;
+ x = atomic_inc_return(&mem->oom_lock);
+ if (x > *val)
+ *val = x;
+ return 0;
+}
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+ int check = 0;
+
+ mem_cgroup_walk_tree(mem, &check, mem_cgroup_oom_lock_cb);
+
+ if (check == 1)
+ return true;
+ return false;
}

-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
- mem->last_oom_jiffies = jiffies;
+ atomic_dec(&mem->oom_lock);
return 0;
}

-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+ mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+ DEFINE_WAIT(wait);
+ bool locked;
+
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
+ /* At first, try to OOM lock hierarchy under mem.*/
+ mutex_lock(&memcg_oom_mutex);
+ locked = mem_cgroup_oom_lock(mem);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (locked) {
+ finish_wait(&memcg_oom_waitq, &wait);
+ mem_cgroup_out_of_memory(mem, mask);
+ } else {
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+ }
+ mutex_lock(&memcg_oom_mutex);
+ mem_cgroup_oom_unlock(mem);
+ /* TODO: more fine grained waitq ? */
+ wake_up_all(&memcg_oom_waitq);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;
+ /* Give chance to dying process */
+ schedule_timeout(1);
+ return true;
}

/*
@@ -1432,11 +1477,14 @@ static int __mem_cgroup_try_charge(struc
struct res_counter *fail_res;
int csize = CHARGE_SIZE;

- if (unlikely(test_thread_flag(TIF_MEMDIE))) {
- /* Don't account this! */
- *memcg = NULL;
- return 0;
- }
+ /*
+ * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+ * in system level. So, allow to go ahead dying process in addition to
+ * MEMDIE process.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)
+ || fatal_signal_pending(current)))
+ goto bypass;

/*
* We always charge the cgroup the mm_struct belongs to.
@@ -1549,11 +1597,15 @@ static int __mem_cgroup_try_charge(struc
}

if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
+ if (!oom)
+ goto nomem;
+ if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ continue;
}
- goto nomem;
+ /* When we reach here, current task is dying .*/
+ css_put(&mem->css);
+ goto bypass;
}
}
if (csize > PAGE_SIZE)
@@ -1572,6 +1624,9 @@ done:
nomem:
css_put(&mem->css);
return -ENOMEM;
+bypass:
+ *memcg = NULL;
+ return 0;
}

/*
Index: mmotm-2.6.33-Feb11/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Feb11.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Feb11/mm/oom_kill.c
@@ -599,13 +599,6 @@ void pagefault_out_of_memory(void)
/* Got some memory back in the last second. */
return;

- /*
- * If this is from memcg, oom-killer is already invoked.
- * and not worth to go system-wide-oom.
- */
- if (mem_cgroup_oom_called(current))
- goto rest_and_return;
-
if (sysctl_panic_on_oom)
panic("out of memory from page fault. panic_on_oom is selected.\n");

@@ -617,7 +610,6 @@ void pagefault_out_of_memory(void)
* Give "p" a good chance of killing itself before we
* retry to allocate memory.
*/
-rest_and_return:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}


KAMEZAWA Hiroyuki

Mar 2, 2010, 12:00:01 AM
Very sorry, mutex_lock() was called after prepare_to_wait().
This is a fixed version.

In current page-fault code,

Changelog;
- fixed mutex and prepare_to_wait order.

+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+ int check = 0;
+
+ mem_cgroup_walk_tree(mem, &check, mem_cgroup_oom_lock_cb);

-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+ if (check == 1)
+ return true;
+ return false;
+}
+
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
- mem->last_oom_jiffies = jiffies;
+ atomic_dec(&mem->oom_lock);
return 0;
}

-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+{
+ mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+ DEFINE_WAIT(wait);
+ bool locked;
+
+ /* At first, try to OOM lock hierarchy under mem.*/
+ mutex_lock(&memcg_oom_mutex);
+ locked = mem_cgroup_oom_lock(mem);
+ if (!locked)
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (locked)
Daisuke Nishimura

Mar 2, 2010, 12:50:02 AM
On Tue, 2 Mar 2010 13:55:24 +0900, KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com> wrote:
> Very sorry, mutex_lock is called after prepare_to_wait.
> This is a fixed one.
I'm willing to test your patch, but I have one concern.

Isn't there a race condition like this?

            context A                               context B
  mutex_lock(&memcg_oom_mutex)
  mem_cgroup_oom_lock()
    ->success
  mutex_unlock(&memcg_oom_mutex)
  mem_cgroup_out_of_memory()
                                          mutex_lock(&memcg_oom_mutex)
                                          mem_cgroup_oom_lock()
                                            ->fail
                                          prepare_to_wait()
                                          mutex_unlock(&memcg_oom_mutex)
  mutex_lock(&memcg_oom_mutex)
  mem_cgroup_oom_unlock()
  wake_up_all()
  mutex_unlock(&memcg_oom_mutex)
                                          schedule()
                                          finish_wait()

In this case, context B will never be woken up, right?


Thanks,
Daisuke Nishimura.

KAMEZAWA Hiroyuki

Mar 2, 2010, 1:10:02 AM

No. The

    prepare_to_wait();
    schedule();
    finish_wait();

call sequence is designed for exactly this kind of waiting:

1. Thread B calls prepare_to_wait(); the wait entry is queued and the task's
   state is changed to TASK_INTERRUPTIBLE.
2. Thread A's wake_up_all() checks all waiters in the queue and changes their
   state to TASK_RUNNING.
3. Thread B then calls schedule(), but because its state is already
   TASK_RUNNING it will be scheduled again soon; it does not sleep.

That is why calling mutex_lock() after prepare_to_wait() is bad ;)
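
For reference, here is a minimal stand-alone sketch of that pattern, with
made-up names (example_*); this is not the memcg code, just an illustration of
why a wakeup that happens between prepare_to_wait() and schedule() cannot be
lost:

/*
 * prepare_to_wait() sets the task state to TASK_INTERRUPTIBLE before
 * schedule() runs, so a wake_up_all() in between simply flips the task
 * back to TASK_RUNNING and schedule() returns almost immediately.
 */
#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
static int example_done;                        /* hypothetical condition */

static void example_waiter(void)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&example_waitq, &wait, TASK_INTERRUPTIBLE);
        if (!example_done)              /* re-check after queueing ourselves */
                schedule();             /* no sleep if the waker already ran */
        finish_wait(&example_waitq, &wait);
}

static void example_waker(void)
{
        example_done = 1;
        wake_up_all(&example_waitq);    /* queued waiters become TASK_RUNNING */
}

The waker only has to set the condition and call wake_up_all(); whichever side
runs first, the waiter ends up TASK_RUNNING before it can sleep on a stale check.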

Thanks,
-Kame

Daisuke Nishimura

Mar 2, 2010, 1:30:01 AM
Ah, you're right. I forgot point 2.
Thank you for your clarification.

I'll test this patch all through this night, and check whether it doesn't trigger
global oom after memcg's oom.


Thanks,
Daisuke Nishimura.

Balbir Singh

Mar 2, 2010, 12:20:03 PM
* KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com> [2010-03-02 11:58:34]:

> Brief Summary (for Andrew)
>
> - Nishimura reported my fix (one year ago)
> a636b327f731143ccc544b966cfd8de6cb6d72c6
> doesn't work well in some extreme situation.
>
> - David Rientjes said mem_cgroup_oom_called() is completely
> ugly and broken and.....
> And he tries to remove that in his patch set.
>
> Then, I wrote this as bugfix onto mmotm. This patch implements
> - per-memcg OOM lock as per-zone OOM lock
> - avoid to return -ENOMEM via mamcg's page fault path.
> ENOMEM causes unnecessary page_fault_out_of_memory().
> (Even if memcg hangs, there is no change from current behavior)
> - in addtion to MEMDIE thread, KILLED proceses go bypath memcg.
>
> I'm glad if this goes into 2.6.34 timeline (as bugfix). But I'm
> afraid this seems too big as bugfix...
>
> My plans for 2.6.35 are
> - oom-notifier for memcg (based on memcg threshold notifier)
> - oom-freezer (disable oom-kill) for memcg
> - better handling in extreme situation.
> And now, Andrea Righi works for dirty_ratio for memcg. We'll have
> something better in 2.6.35 kernels.
>
> This patch will HUNK with David's set. Then, if this hunks in mmotm,
> I'll rework.
>

Hi, Kamezawa-San,

Some review comments below.



> Tested on x86-64. Nishimura-san, could you test ?
>
> ==
> From: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
>
> In current page-fault code,
>
> handle_mm_fault()
> -> ...
> -> mem_cgroup_charge()
> -> map page or handle error.
> -> check return code.
>
> If page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory()
> is called. But if it's caused by memcg, OOM should have been already
> invoked.
> Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6
>
> That patch records last_oom_jiffies for memcg's sub-hierarchy and
> prevents page_fault_out_of_memory from being invoked in near future.
>
> But Nishimura-san reported that check by jiffies is not enough
> when the system is terribly heavy.
>
> This patch changes memcg's oom logic as.
> * If memcg causes OOM-kill, continue to retry.
> * remove jiffies check which is used now.

I like this very much!

> + *val = x;

Use the max_t function here?
x = max_t(int, x, *val);

I was wondering if we should really wake up all waiters. Shouldn't this be
per memcg? The waitq, that is; since the check is per memcg, the wakeup
should also be per memcg.


--
Three Cheers,
Balbir

KAMEZAWA Hiroyuki

Mar 2, 2010, 7:10:02 PM

Sure.

The difficulty with a per-memcg waitq comes from the hierarchy.

Assume the following hierarchy, A and its children 01 and 02.

A/ <==== under OOM #2
01 <==== under OOM #1
02

And the OOMs happen in the following sequence:
1. 01 goes to OOM (#1)
2. A goes to OOM (#2)

Because an oom-kill in group 01 can fix both the OOM under A and the one under 01,
oom-kill under A and oom-kill under 01 should be mutually exclusive.

With a per-memcg waitq, when the OOM under 01 finishes there is no way to wake up
the waiters on A. That's the reason I used a system-wide waitq.

I think this is not a big problem, but I hope someone finds a new trick for doing
the logically correct thing; that's why I added a TODO.
I'll add some comments.
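
Just to make the counting concrete, the following is a small stand-alone C
sketch (plain userspace C, names made up; not the kernel code) of the scheme
used by mem_cgroup_oom_lock(): every group in the locked sub-tree gets its
counter incremented, and the lock is considered taken only if every counter
was previously zero, i.e. the maximum value seen is 1. With the hierarchy
above, once 01 is OOM-locked, an attempt to OOM-lock A sees a count of 2 on 01
and has to wait:

#include <stdbool.h>
#include <stdio.h>

struct group {
        int oom_lock;                   /* counter, like memcg->oom_lock */
        struct group *children[2];
};

/* Increment every counter in the sub-tree and return the maximum value. */
static int walk_lock(struct group *g, int max)
{
        if (!g)
                return max;
        if (++g->oom_lock > max)
                max = g->oom_lock;
        max = walk_lock(g->children[0], max);
        return walk_lock(g->children[1], max);
}

/* We own the OOM for this sub-tree only if nobody held any part of it. */
static bool oom_lock(struct group *g)
{
        return walk_lock(g, 0) == 1;
}

int main(void)
{
        struct group c01 = { 0, { NULL, NULL } };
        struct group c02 = { 0, { NULL, NULL } };
        struct group a   = { 0, { &c01, &c02 } };

        printf("lock 01: %s\n", oom_lock(&c01) ? "acquired" : "must wait");
        printf("lock A : %s\n", oom_lock(&a) ? "acquired" : "must wait");
        return 0;
}

Unlocking just decrements the same counters by walking the sub-tree again,
which is why remembering exactly which groups we locked is unnecessary.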

Thanks,
-Kame

Daisuke Nishimura

Mar 2, 2010, 7:40:01 PM
> I'll test this patch all through this night, and check whether it doesn't trigger
> global oom after memcg's oom.
>
O.K. It works well.
Feel free to add my signs.

Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>

Thanks,
Daisuke Nishimura.

KAMEZAWA Hiroyuki

Mar 2, 2010, 7:50:02 PM
On Wed, 3 Mar 2010 09:26:06 +0900
Daisuke Nishimura <nish...@mxp.nes.nec.co.jp> wrote:

> > I'll test this patch all through this night, and check whether it doesn't trigger
> > global oom after memcg's oom.
> >
> O.K. It works well.
> Feel free to add my signs.
>
> Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
> Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
>

Thank you !

I'll apply Balbir's comment and post v3.

Thanks,
-Kame

KAMEZAWA Hiroyuki

Mar 3, 2010, 2:30:02 AM
On Wed, 3 Mar 2010 09:38:44 +0900
KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com> wrote:

> On Wed, 3 Mar 2010 09:26:06 +0900
> Daisuke Nishimura <nish...@mxp.nes.nec.co.jp> wrote:
>
> > > I'll test this patch all through this night, and check whether it doesn't trigger
> > > global oom after memcg's oom.
> > >
> > O.K. It works well.
> > Feel free to add my signs.
> >
> > Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
> > Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
> >
>
> Thank you !
>
> I'll apply Balbir's comment and post v3.
>

rebased onto mmotm-Mar2.
tested on x86-64.

In current page-fault code,

Changelog 20100303
- added comments
Changelog 20100302
- fixed mutex and prepare_to_wait order.
- fixed per-memcg oom lock.

Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>


Signed-off-by: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
---
include/linux/memcontrol.h | 6 --

mm/memcontrol.c | 119 ++++++++++++++++++++++++++++++++++-----------
mm/oom_kill.c | 8 ---
3 files changed, 92 insertions(+), 41 deletions(-)

Index: mmotm-2.6.33-Mar2/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.33-Mar2.orig/include/linux/memcontrol.h
+++ mmotm-2.6.33-Mar2/include/linux/memcontrol.h


@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(v
return false;
}

-extern bool mem_cgroup_oom_called(struct task_struct *task);
void mem_cgroup_update_file_mapped(struct page *page, int val);
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(v
return true;
}

-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
- return false;
-}
-
static inline int
mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{

Index: mmotm-2.6.33-Mar2/mm/memcontrol.c
===================================================================
--- mmotm-2.6.33-Mar2.orig/mm/memcontrol.c
+++ mmotm-2.6.33-Mar2/mm/memcontrol.c
@@ -203,7 +203,7 @@ struct mem_cgroup {


* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
- unsigned long last_oom_jiffies;
+ atomic_t oom_lock;
atomic_t refcnt;

unsigned int swappiness;

@@ -1246,32 +1246,87 @@ static int mem_cgroup_hierarchical_recla


return total;
}

-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
+ int *val = (int *)data;
+ int x;

- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;
+ x = atomic_inc_return(&mem->oom_lock);

+ *val = max(x, *val);


+ return 0;
}
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{

+ int lock_count = 0;
+
+ mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);



-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)

+ if (lock_count == 1)


+ return true;
+ return false;
+}
+
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
- mem->last_oom_jiffies = jiffies;
+ atomic_dec(&mem->oom_lock);
return 0;
}

-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)

{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);


+ mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+

+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)

+{


+ DEFINE_WAIT(wait);
+ bool locked;
+
+ /* At first, try to OOM lock hierarchy under mem.*/
+ mutex_lock(&memcg_oom_mutex);
+ locked = mem_cgroup_oom_lock(mem);
+ if (!locked)
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_INTERRUPTIBLE);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (locked)
+ mem_cgroup_out_of_memory(mem, mask);
+ else {
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+ }
+ mutex_lock(&memcg_oom_mutex);
+ mem_cgroup_oom_unlock(mem);
+ /*

+ * Here, we use global waitq .....more fine grained waitq ?
+ * Assume following hierarchy.
+ * A/
+ * 01
+ * 02
+ * assume OOM happens both in A and 01 at the same time. Tthey are
+ * mutually exclusive by lock. (kill in 01 helps A.)
+ * When we use per memcg waitq, we have to wake up waiters on A and 02
+ * in addtion to waiters on 01. We use global waitq for avoiding mess.
+ * It will not be a big problem.
+ */


+ wake_up_all(&memcg_oom_waitq);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;
+ /* Give chance to dying process */
+ schedule_timeout(1);
+ return true;
}

/*
@@ -1443,11 +1498,14 @@ static int __mem_cgroup_try_charge(struc


struct res_counter *fail_res;
int csize = CHARGE_SIZE;

- if (unlikely(test_thread_flag(TIF_MEMDIE))) {
- /* Don't account this! */
- *memcg = NULL;
- return 0;
- }
+ /*
+ * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+ * in system level. So, allow to go ahead dying process in addition to
+ * MEMDIE process.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)
+ || fatal_signal_pending(current)))
+ goto bypass;

/*
* We always charge the cgroup the mm_struct belongs to.

@@ -1560,11 +1618,15 @@ static int __mem_cgroup_try_charge(struc


}

if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
+ if (!oom)
+ goto nomem;
+ if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ continue;
}
- goto nomem;
+ /* When we reach here, current task is dying .*/
+ css_put(&mem->css);
+ goto bypass;
}
}
if (csize > PAGE_SIZE)

@@ -1574,6 +1636,9 @@ done:


nomem:
css_put(&mem->css);
return -ENOMEM;
+bypass:
+ *memcg = NULL;
+ return 0;
}

/*

Index: mmotm-2.6.33-Mar2/mm/oom_kill.c
===================================================================
--- mmotm-2.6.33-Mar2.orig/mm/oom_kill.c
+++ mmotm-2.6.33-Mar2/mm/oom_kill.c
@@ -603,13 +603,6 @@ void pagefault_out_of_memory(void)


/* Got some memory back in the last second. */
return;

- /*
- * If this is from memcg, oom-killer is already invoked.
- * and not worth to go system-wide-oom.
- */
- if (mem_cgroup_oom_called(current))
- goto rest_and_return;
-
if (sysctl_panic_on_oom)
panic("out of memory from page fault. panic_on_oom is selected.\n");

@@ -621,7 +614,6 @@ void pagefault_out_of_memory(void)


* Give "p" a good chance of killing itself before we
* retry to allocate memory.
*/
-rest_and_return:
if (!test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}

--

Andrew Morton

Mar 3, 2010, 6:20:02 PM

> ...


>
> +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
> +{
> + int lock_count = 0;
> +
> + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
>
> -static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
> + if (lock_count == 1)
> + return true;
> + return false;
> +}

mem_cgroup_walk_tree() will visit all items, but it could have returned as
soon as it found the first "locked" item. A minor inefficiency, I guess.

If the calling process has signal_pending() then the schedule() will
immediately return. A bug, I suspect. Fixable by using
TASK_UNINTERRUPTIBLE.


> + finish_wait(&memcg_oom_waitq, &wait);
> + }
> + mutex_lock(&memcg_oom_mutex);
> + mem_cgroup_oom_unlock(mem);
> + /*
> + * Here, we use global waitq .....more fine grained waitq ?
> + * Assume following hierarchy.
> + * A/
> + * 01
> + * 02
> + * assume OOM happens both in A and 01 at the same time. Tthey are
> + * mutually exclusive by lock. (kill in 01 helps A.)
> + * When we use per memcg waitq, we have to wake up waiters on A and 02
> + * in addtion to waiters on 01. We use global waitq for avoiding mess.
> + * It will not be a big problem.
> + */
> + wake_up_all(&memcg_oom_waitq);
> + mutex_unlock(&memcg_oom_mutex);
> +
> + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
> + return false;
> + /* Give chance to dying process */
> + schedule_timeout(1);
> + return true;
> }

--

KAMEZAWA Hiroyuki

Mar 3, 2010, 11:10:02 PM

Perhaps, but considering unlock, this walk-all approach seems simpler because
we don't have to remember what we locked. Hmm... but creating/removing a
cgroup while we hold the oom-lock can cause a bug. I'll add a check or
re-design this lock.

Hmm... if it doesn't sleep, it continues to reclaim memory. But we have no
return path to the caller in memcg's charge function even if signal_pending()
is true, so letting reclaim continue just wastes CPU.

Sure, I'll update this to be TASK_UNINTERRUPTIBLE.
But I'll revisit this when we implement the oom-notifier and oom-kill-disable.

Thank you for the review. I'll post v4.

Regards,
-Kame

Daisuke Nishimura

Mar 3, 2010, 11:10:02 PM
On Wed, 3 Mar 2010 16:23:04 +0900, KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com> wrote:
> On Wed, 3 Mar 2010 09:38:44 +0900
> KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com> wrote:
>
> > On Wed, 3 Mar 2010 09:26:06 +0900
> > Daisuke Nishimura <nish...@mxp.nes.nec.co.jp> wrote:
> >
> > > > I'll test this patch all through this night, and check whether it doesn't trigger
> > > > global oom after memcg's oom.
> > > >
> > > O.K. It works well.
> > > Feel free to add my signs.
> > >
> > > Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
> > > Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
> > >
> >
> > Thank you !
> >
> > I'll apply Balbir's comment and post v3.
> >
>
> rebased onto mmotm-Mar2.
> tested on x86-64.
>
I found a small race problem. This is the fix for it.

===
From: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>

We must avoid making the oom_lock counter of a newly created child go negative.

Signed-off-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
---
mm/memcontrol.c | 7 ++++++-
1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3ce8c5b..9e25400 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1272,7 +1272,12 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)



static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{

- atomic_dec(&mem->oom_lock);
+ /*
+ * There is a small race window where a new child can be created after
+ * we called mem_cgroup_oom_lock(). Use atomic_add_unless() to avoid
+ * making oom_lock of such a child be negative.
+ */
+ atomic_add_unless(&mem->oom_lock, -1, 0);
return 0;
}

--
1.6.4

KAMEZAWA Hiroyuki

Mar 3, 2010, 11:20:01 PM
On Thu, 4 Mar 2010 13:04:06 +0900
Daisuke Nishimura <nish...@mxp.nes.nec.co.jp> wrote:


Thank you! I'll merge this into v4.

-Kame

KAMEZAWA Hiroyuki

Mar 4, 2010, 12:30:02 AM
On Thu, 4 Mar 2010 13:04:06 +0900
Daisuke Nishimura <nish...@mxp.nes.nec.co.jp> wrote:
> > rebased onto mmotm-Mar2.
> > tested on x86-64.
> >
> I found a small race problem. This is the fix for it.
>

Here it is. Thank you for all your help.
-Kame

In current page-fault code,

Changelog 20100304
- fixed mem_cgroup_oom_unlock()
- added comments
- changed wait status from TASK_INTERRUPTIBLE to TASK_KILLABLE


Changelog 20100303
- added comments
Changelog 20100302
- fixed mutex and prepare_to_wait order.
- fixed per-memcg oom lock.

Reviewed-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>
Signed-off-by: Daisuke Nishimura <nish...@mxp.nes.nec.co.jp>


Signed-off-by: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
---
include/linux/memcontrol.h | 6 --

mm/memcontrol.c | 134 +++++++++++++++++++++++++++++++++++----------
mm/oom_kill.c | 8 --
3 files changed, 107 insertions(+), 41 deletions(-)

@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_recla


return total;
}

-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
{
- bool ret = false;
- struct mem_cgroup *mem;
- struct mm_struct *mm;
+ int *val = (int *)data;
+ int x;

+ /*
+ * Logically, we can stop scanning immediately when we find
+ * a memcg is already locked. But condidering unlock ops and
+ * creation/removal of memcg, scan-all is simple operation.
+ */


+ x = atomic_inc_return(&mem->oom_lock);
+ *val = max(x, *val);
+ return 0;
+}
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+ int lock_count = 0;

- rcu_read_lock();
- mm = task->mm;
- if (!mm)
- mm = &init_mm;
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
- ret = true;
- rcu_read_unlock();
- return ret;

+ mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+


+ if (lock_count == 1)
+ return true;
+ return false;
}

-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)

+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
{
- mem->last_oom_jiffies = jiffies;

+ /*
+ * When a new child is created while the hierarchy is under oom,
+ * mem_cgroup_oom_lock() may not be called. We have to use
+ * atomic_add_unless() here.


+ */
+ atomic_add_unless(&mem->oom_lock, -1, 0);
return 0;
}

-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)

+{


+ mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
{
- mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);

+ DEFINE_WAIT(wait);
+ bool locked;
+
+ /* At first, try to OOM lock hierarchy under mem.*/
+ mutex_lock(&memcg_oom_mutex);
+ locked = mem_cgroup_oom_lock(mem);

+ /*
+ * Even if signal_pending(), we can't quit charge() loop without
+ * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+ * under OOM is always welcomed, use TASK_KILLABLE here.
+ */
+ if (!locked)
+ prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);


+ mutex_unlock(&memcg_oom_mutex);
+
+ if (locked)
+ mem_cgroup_out_of_memory(mem, mask);
+ else {
+ schedule();
+ finish_wait(&memcg_oom_waitq, &wait);
+ }
+ mutex_lock(&memcg_oom_mutex);
+ mem_cgroup_oom_unlock(mem);
+ /*
+ * Here, we use global waitq .....more fine grained waitq ?
+ * Assume following hierarchy.
+ * A/
+ * 01
+ * 02
+ * assume OOM happens both in A and 01 at the same time. Tthey are
+ * mutually exclusive by lock. (kill in 01 helps A.)
+ * When we use per memcg waitq, we have to wake up waiters on A and 02
+ * in addtion to waiters on 01. We use global waitq for avoiding mess.
+ * It will not be a big problem.

+ * (And a task may be moved to other groups while it's waiting for OOM.)


+ */
+ wake_up_all(&memcg_oom_waitq);
+ mutex_unlock(&memcg_oom_mutex);
+
+ if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+ return false;
+ /* Give chance to dying process */
+ schedule_timeout(1);
+ return true;
}

/*

@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struc


struct res_counter *fail_res;
int csize = CHARGE_SIZE;

- if (unlikely(test_thread_flag(TIF_MEMDIE))) {
- /* Don't account this! */
- *memcg = NULL;
- return 0;
- }
+ /*
+ * Unlike gloval-vm's OOM-kill, we're not in memory shortage
+ * in system level. So, allow to go ahead dying process in addition to
+ * MEMDIE process.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)
+ || fatal_signal_pending(current)))
+ goto bypass;

/*
* We always charge the cgroup the mm_struct belongs to.

@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struc


}

if (!nr_retries--) {
- if (oom) {
- mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
- record_last_oom(mem_over_limit);
+ if (!oom)
+ goto nomem;
+ if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+ nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ continue;
}
- goto nomem;
+ /* When we reach here, current task is dying .*/
+ css_put(&mem->css);
+ goto bypass;
}
}
if (csize > PAGE_SIZE)

@@ -1574,6 +1651,9 @@ done:

--
