From: Andrea Arcangeli <and...@suse.de>
Subject: keep balance between different classzones
Signed-off-by: Andrea Arcangeli <and...@suse.de>
--- x/include/linux/mmzone.h.orig 2004-12-04 08:56:32.000000000 +0100
+++ x/include/linux/mmzone.h 2004-12-24 17:59:13.864424040 +0100
@@ -112,18 +112,14 @@ struct zone {
unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high;
/*
- * protection[] is a pre-calculated number of extra pages that must be
- * available in a zone in order for __alloc_pages() to allocate memory
- * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
- * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
- * for us to choose to allocate the page from that zone.
- *
- * It uses both min_free_kbytes and sysctl_lower_zone_protection.
- * The protection values are recalculated if either of these values
- * change. The array elements are in zonelist order:
- * [0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+ * We don't know if the memory that we're going to allocate will be freeable
+ * or/and it will be released eventually, so to avoid totally wasting several
+ * GB of ram we must reserve some of the lower zone memory (otherwise we risk
+ * running OOM on the lower zones even though there are tons of freeable ram
+ * on the higher zones). This array is recalculated at runtime if the
+ * sysctl_lowmem_reserve_ratio sysctl changes.
*/
- unsigned long protection[MAX_NR_ZONES];
+ unsigned long lowmem_reserve[MAX_NR_ZONES];
struct per_cpu_pageset pageset[NR_CPUS];
@@ -366,7 +362,8 @@ struct ctl_table;
struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
-int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
+int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
#include <linux/topology.h>
--- x/include/linux/sysctl.h.orig 2004-12-04 08:56:32.000000000 +0100
+++ x/include/linux/sysctl.h 2004-12-24 17:59:13.865423888 +0100
@@ -159,7 +159,7 @@ enum
VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
VM_SWAPPINESS=19, /* Tendency to steal mapped memory */
- VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+ VM_LOWMEM_RESERVE_RATIO=20,/* reservation ratio for lower memory zones */
VM_MIN_FREE_KBYTES=21, /* Minimum free kilobytes to maintain */
VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */
VM_LAPTOP_MODE=23, /* vm laptop mode */
--- x/kernel/sysctl.c.orig 2004-12-04 08:56:33.000000000 +0100
+++ x/kernel/sysctl.c 2004-12-24 17:59:13.868423432 +0100
@@ -62,7 +62,6 @@ extern int core_uses_pid;
extern char core_pattern[];
extern int cad_pid;
extern int pid_max;
-extern int sysctl_lower_zone_protection;
extern int min_free_kbytes;
extern int printk_ratelimit_jiffies;
extern int printk_ratelimit_burst;
@@ -736,14 +735,13 @@ static ctl_table vm_table[] = {
},
#endif
{
- .ctl_name = VM_LOWER_ZONE_PROTECTION,
- .procname = "lower_zone_protection",
- .data = &sysctl_lower_zone_protection,
- .maxlen = sizeof(sysctl_lower_zone_protection),
+ .ctl_name = VM_LOWMEM_RESERVE_RATIO,
+ .procname = "lowmem_reserve_ratio",
+ .data = &sysctl_lowmem_reserve_ratio,
+ .maxlen = sizeof(sysctl_lowmem_reserve_ratio),
.mode = 0644,
- .proc_handler = &lower_zone_protection_sysctl_handler,
+ .proc_handler = &lowmem_reserve_ratio_sysctl_handler,
.strategy = &sysctl_intvec,
- .extra1 = &zero,
},
{
.ctl_name = VM_MIN_FREE_KBYTES,
--- x/mm/page_alloc.c.orig 2004-12-04 08:56:33.000000000 +0100
+++ x/mm/page_alloc.c 2004-12-24 17:59:36.182031248 +0100
@@ -42,7 +42,15 @@ unsigned long totalram_pages;
unsigned long totalhigh_pages;
long nr_swap_pages;
int numnodes = 1;
-int sysctl_lower_zone_protection = 0;
+/*
+ * results with 256, 32 in the lowmem_reserve sysctl:
+ * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
+ * 1G machine -> (16M dma, 784M normal, 224M high)
+ * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
+ * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
+ */
+int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
EXPORT_SYMBOL(totalram_pages);
EXPORT_SYMBOL(nr_swap_pages);
@@ -583,19 +591,6 @@ buffered_rmqueue(struct zone *zone, int
/*
* This is the 'heart' of the zoned buddy allocator.
- *
- * Herein lies the mysterious "incremental min". That's the
- *
- * local_low = z->pages_low;
- * min += local_low;
- *
- * thing. The intent here is to provide additional protection to low zones for
- * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM
- * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
- * request. This preserves additional space in those lower zones for requests
- * which really do need memory from those zones. It means that on a decent
- * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
- * zone untouched.
*/
struct page * fastcall
__alloc_pages(unsigned int gfp_mask, unsigned int order,
@@ -608,7 +603,7 @@ __alloc_pages(unsigned int gfp_mask, uns
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
- int alloc_type;
+ int classzone_idx;
int do_retry;
int can_try_harder;
@@ -628,11 +623,11 @@ __alloc_pages(unsigned int gfp_mask, uns
return NULL;
}
- alloc_type = zone_idx(zones[0]);
+ classzone_idx = zone_idx(zones[0]);
/* Go through the zonelist once, looking for a zone with enough free */
for (i = 0; (z = zones[i]) != NULL; i++) {
- min = z->pages_low + (1<<order) + z->protection[alloc_type];
+ min = z->pages_low + (1<<order) + z->lowmem_reserve[classzone_idx];
if (z->free_pages < min)
continue;
@@ -655,7 +650,7 @@ __alloc_pages(unsigned int gfp_mask, uns
min /= 2;
if (can_try_harder)
min -= min / 4;
- min += (1<<order) + z->protection[alloc_type];
+ min += (1<<order) + z->lowmem_reserve[classzone_idx];
if (z->free_pages < min)
continue;
@@ -698,7 +693,7 @@ rebalance:
min /= 2;
if (can_try_harder)
min -= min / 4;
- min += (1<<order) + z->protection[alloc_type];
+ min += (1<<order) + z->lowmem_reserve[classzone_idx];
if (z->free_pages < min)
continue;
@@ -1117,9 +1112,9 @@ void show_free_areas(void)
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
);
- printk("protections[]:");
+ printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->protection[i]);
+ printk(" %lu", zone->lowmem_reserve[i]);
printk("\n");
}
@@ -1816,87 +1811,29 @@ void __init page_alloc_init(void)
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
-static unsigned long higherzone_val(struct zone *z, int max_zone,
- int alloc_type)
-{
- int z_idx = zone_idx(z);
- struct zone *higherzone;
- unsigned long pages;
-
- /* there is no higher zone to get a contribution from */
- if (z_idx == MAX_NR_ZONES-1)
- return 0;
-
- higherzone = &z->zone_pgdat->node_zones[z_idx+1];
-
- /* We always start with the higher zone's protection value */
- pages = higherzone->protection[alloc_type];
-
- /*
- * We get a lower-zone-protection contribution only if there are
- * pages in the higher zone and if we're not the highest zone
- * in the current zonelist. e.g., never happens for GFP_DMA. Happens
- * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
- * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
- */
- if (higherzone->present_pages && z_idx < alloc_type)
- pages += higherzone->pages_low * sysctl_lower_zone_protection;
-
- return pages;
-}
-
/*
- * setup_per_zone_protection - called whenver min_free_kbytes or
- * sysctl_lower_zone_protection changes. Ensures that each zone
- * has a correct pages_protected value, so an adequate number of
+ * setup_per_zone_lowmem_reserve - called whenever
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
+ * has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
- *
- * This algorithm is way confusing. I tries to keep the same behavior
- * as we had with the incremental min iterative algorithm.
*/
-static void setup_per_zone_protection(void)
+static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
- struct zone *zones, *zone;
- int max_zone;
- int i, j;
+ int j, idx;
for_each_pgdat(pgdat) {
- zones = pgdat->node_zones;
+ for (j = 0; j < MAX_NR_ZONES; j++) {
+ struct zone * zone = pgdat->node_zones + j;
+ unsigned long present_pages = zone->present_pages;
- for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
- if (zones[i].present_pages)
- max_zone = i;
+ zone->lowmem_reserve[j] = 0;
- /*
- * For each of the different allocation types:
- * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
- */
- for (i = 0; i < GFP_ZONETYPES; i++) {
- /*
- * For each of the zones:
- * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
- */
- for (j = MAX_NR_ZONES-1; j >= 0; j--) {
- zone = &zones[j];
+ for (idx = j-1; idx >= 0; idx--) {
+ struct zone * lower_zone = pgdat->node_zones + idx;
- /*
- * We never protect zones that don't have memory
- * in them (j>max_zone) or zones that aren't in
- * the zonelists for a certain type of
- * allocation (j>=i). We have to assign these
- * to zero because the lower zones take
- * contributions from the higher zones.
- */
- if (j > max_zone || j >= i) {
- zone->protection[i] = 0;
- continue;
- }
- /*
- * The contribution of the next higher zone
- */
- zone->protection[i] = higherzone_val(zone,
- max_zone, i);
+ lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx];
+ present_pages += lower_zone->present_pages;
}
}
}
@@ -1991,7 +1928,7 @@ static int __init init_per_zone_pages_mi
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_pages_min();
- setup_per_zone_protection();
+ setup_per_zone_lowmem_reserve();
return 0;
}
module_init(init_per_zone_pages_min)
@@ -2006,20 +1943,23 @@ int min_free_kbytes_sysctl_handler(ctl_t
{
proc_dointvec(table, write, file, buffer, length, ppos);
setup_per_zone_pages_min();
- setup_per_zone_protection();
return 0;
}
/*
- * lower_zone_protection_sysctl_handler - just a wrapper around
- * proc_dointvec() so that we can call setup_per_zone_protection()
- * whenever sysctl_lower_zone_protection changes.
+ * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
+ * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
+ * whenever sysctl_lowmem_reserve_ratio changes.
+ *
+ * The reserve ratio obviously has absolutely no relation with the
+ * pages_min watermarks. The lowmem reserve ratio can only make sense
+ * as a function of the boot-time zone sizes.
*/
-int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, file, buffer, length, ppos);
- setup_per_zone_protection();
+ setup_per_zone_lowmem_reserve();
return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
This looks OK to me. It really simplifies the code there a lot too.
The only questions I have are: should it be on by default? I don't think
we ever reached an agreement. I'd say yes, after a run in -mm because it
does potentially fix corner cases where lower zones get filled with un-
freeable memory which could have been satisfied with higher zones.
And second, any chance you could port it to the mm patches already in
-mm? Won't be a big job, just some clashes in __alloc_pages...
mm-keep-count-of-free-areas.patch
mm-higher-order-watermarks.patch
mm-higher-order-watermarks-fix.patch
mm-teach-kswapd-about-higher-order-areas.patch
Thanks,
Nick
Great, thanks for the review! I definitely agree it should be on by
default, I already had a hang report that was solved by more recent
kernels and that probably can only be explained by lowmem_reserve since
there aren't other mm changes in 2.6.5 based trees.
> And second, any chance you could you port it to the mm patches already in
> -mm? Won't be a big job, just some clashes in __alloc_pages...
I already had to port to 2.6.5 too, and that's enough for now unless I
first get a positive ack that it will be merged (if I hadn't more
interesting things to develop, I would be happily porting it).
I believe it can be accepted easily if you change the variable names
from protection to lowmem_reserve.
Is there a need for that or is it just your taste? :)
The naming is in sync with 2.4, I called that feature lowmem_reserve
when I wrote it. Protection doesn't actually mean anything. Memory
protection, mprotect, what?
The object of the feature is to reserve lower memory in function of the
classzone allocation, and in function of the zone we're allocating from.
So lowmem_reserve sounds a much better name. And it wasn't me to change
it, it was the 2.6 kernel calling it differently in the first place.
Note that at first 2.6 was doing stuff very differently from 2.4 too
(and it wasn't working right, in fact). Now it's in perfect sync with the 2.4
algorithm I wrote originally and so I thought it would be much cleaner
to call it the same way as 2.4, which is more self explanatory too.