prepare to enable 32bit intel and amd bus
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/bus_numa.c | 16 ++++++++--------
arch/x86/pci/bus_numa.h | 4 ++--
2 files changed, 10 insertions(+), 10 deletions(-)
Index: linux-2.6/arch/x86/pci/bus_numa.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.c
+++ linux-2.6/arch/x86/pci/bus_numa.c
@@ -51,8 +51,8 @@ void x86_pci_root_bus_res_quirks(struct
}
}
-void __init update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge)
+void __init update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge)
{
int i;
struct resource *res;
@@ -65,20 +65,20 @@ void __init update_res(struct pci_root_i
/* try to merge it with old one */
for (i = 0; i < info->res_num; i++) {
- size_t final_start, final_end;
- size_t common_start, common_end;
+ resource_size_t final_start, final_end;
+ resource_size_t common_start, common_end;
res = &info->res[i];
if (res->flags != flags)
continue;
- common_start = max((size_t)res->start, start);
- common_end = min((size_t)res->end, end);
+ common_start = max(res->start, start);
+ common_end = min(res->end, end);
if (common_start > common_end + 1)
continue;
- final_start = min((size_t)res->start, start);
- final_end = max((size_t)res->end, end);
+ final_start = min(res->start, start);
+ final_end = max(res->end, end);
res->start = final_start;
res->end = final_end;
Index: linux-2.6/arch/x86/pci/bus_numa.h
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.h
+++ linux-2.6/arch/x86/pci/bus_numa.h
@@ -22,6 +22,6 @@ extern int pci_root_num;
extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
extern int found_all_numa_early;
-extern void update_res(struct pci_root_info *info, size_t start,
- size_t end, unsigned long flags, int merge);
+extern void update_res(struct pci_root_info *info, resource_size_t start,
+ resource_size_t end, unsigned long flags, int merge);
#endif
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
32-bit NUMA runs out of early reservation slots because of the recent wakeup and mptable changes.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/e820.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -724,7 +724,7 @@ core_initcall(e820_mark_nvs_memory);
/*
* Early reserved memory areas.
*/
-#define MAX_EARLY_RES 20
+#define MAX_EARLY_RES 32
struct early_res {
u64 start, end;
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
kernel/range.c | 9 +++++++++
1 file changed, 9 insertions(+)
Index: linux-2.6/kernel/range.c
===================================================================
--- linux-2.6.orig/kernel/range.c
+++ linux-2.6/kernel/range.c
@@ -13,6 +13,9 @@
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
+ if (start > end)
+ return nr_range;
+
/* Out of slots: */
if (nr_range >= az)
return nr_range;
@@ -30,6 +33,9 @@ int add_range_with_merge(struct range *r
{
int i;
+ if (start > end)
+ return nr_range;
+
/* Try to merge it with old one: */
for (i = 0; i < nr_range; i++) {
u64 final_start, final_end;
@@ -59,6 +65,9 @@ void subtract_range(struct range *range,
{
int i, j;
+ if (start > end)
+ return;
+
for (j = 0; j < az; j++) {
if (!range[j].end)
continue;
Apply the AMD root bus resource handling in all cases, even when there is only one chain.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/amd_bus.c | 7 ++++---
arch/x86/pci/bus_numa.c | 5 -----
arch/x86/pci/bus_numa.h | 1 -
3 files changed, 4 insertions(+), 9 deletions(-)
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -87,11 +87,12 @@ static int __init early_fill_mp_bus_info
struct range range[RANGE_NUM];
u64 val;
u32 address;
+ int found;
if (!early_pci_allowed())
return -1;
- found_all_numa_early = 0;
+ found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
u32 id;
u16 device;
@@ -105,12 +106,12 @@ static int __init early_fill_mp_bus_info
device = (id>>16) & 0xffff;
if (pci_probes[i].vendor == vendor &&
pci_probes[i].device == device) {
- found_all_numa_early = 1;
+ found = 1;
break;
}
}
- if (!found_all_numa_early)
+ if (!found)
return 0;
pci_root_num = 0;
Index: linux-2.6/arch/x86/pci/bus_numa.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.c
+++ linux-2.6/arch/x86/pci/bus_numa.c
@@ -5,7 +5,6 @@
int pci_root_num;
struct pci_root_info pci_root_info[PCI_ROOT_NR];
-int found_all_numa_early;
void x86_pci_root_bus_res_quirks(struct pci_bus *b)
{
@@ -21,10 +20,6 @@ void x86_pci_root_bus_res_quirks(struct
if (!pci_root_num)
return;
- /* for amd, if only one root bus, don't need to do anything */
- if (pci_root_num < 2 && found_all_numa_early)
- return;
-
for (i = 0; i < pci_root_num; i++) {
if (pci_root_info[i].bus_min == b->number)
break;
Index: linux-2.6/arch/x86/pci/bus_numa.h
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.h
+++ linux-2.6/arch/x86/pci/bus_numa.h
@@ -20,7 +20,6 @@ struct pci_root_info {
#define PCI_ROOT_NR 4
extern int pci_root_num;
extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
-extern int found_all_numa_early;
extern void update_res(struct pci_root_info *info, resource_size_t start,
resource_size_t end, unsigned long flags, int merge);
prepare to enable it for 32bit
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/amd_bus.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -82,8 +82,8 @@ static int __init early_fill_mp_bus_info
struct pci_root_info *info;
u32 reg;
struct resource *res;
- size_t start;
- size_t end;
+ u64 start;
+ u64 end;
struct range range[RANGE_NUM];
u64 val;
u32 address;
--
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/Makefile | 3 +--
arch/x86/pci/amd_bus.c | 17 +++--------------
arch/x86/pci/bus_numa.h | 4 ++--
arch/x86/pci/i386.c | 4 ----
4 files changed, 6 insertions(+), 22 deletions(-)
Index: linux-2.6/arch/x86/pci/Makefile
===================================================================
--- linux-2.6.orig/arch/x86/pci/Makefile
+++ linux-2.6/arch/x86/pci/Makefile
@@ -14,8 +14,7 @@ obj-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-y += common.o early.o
-obj-y += amd_bus.o
-obj-$(CONFIG_X86_64) += bus_numa.o intel_bus.o
+obj-y += amd_bus.o bus_numa.o intel_bus.o
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -6,9 +6,7 @@
#include <asm/pci_x86.h>
-#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
-#endif
#include "bus_numa.h"
@@ -17,8 +15,6 @@
* also get peer root bus resource for io,mmio
*/
-#ifdef CONFIG_X86_64
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -207,7 +203,7 @@ static int __init early_fill_mp_bus_info
address = MSR_K8_TOP_MEM1;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", (u64)end, (u64)end>>20);
if (end < (1ULL<<32))
subtract_range(range, RANGE_NUM, 0, end - 1);
@@ -301,7 +297,8 @@ static int __init early_fill_mp_bus_info
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", (u64)end,
+ (u64)end>>20);
subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
}
@@ -347,14 +344,6 @@ static int __init early_fill_mp_bus_info
return 0;
}
-#else /* !CONFIG_X86_64 */
-
-static int __init early_fill_mp_bus_info(void) { return 0; }
-
-#endif /* !CONFIG_X86_64 */
-
-/* common 32/64 bit code */
-
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
static void enable_pci_io_ecs(void *unused)
Index: linux-2.6/arch/x86/pci/i386.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/i386.c
+++ linux-2.6/arch/x86/pci/i386.c
@@ -257,10 +257,6 @@ void __init pcibios_resource_survey(void
*/
fs_initcall(pcibios_assign_resources);
-void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
-{
-}
-
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
Index: linux-2.6/arch/x86/pci/bus_numa.h
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.h
+++ linux-2.6/arch/x86/pci/bus_numa.h
@@ -1,5 +1,5 @@
-#ifdef CONFIG_X86_64
-
+#ifndef __BUS_NUMA_H
+#define __BUS_NUMA_H
/*
* sub bus (transparent) will use entres from 3 to store extra from
* root, so need to make sure we have enough slot there, Should we
Shouldn't this be resource_size_t?
-hpa
It seems we should use u64, check whether sizeof(resource_size_t) != sizeof(u64),
and cap the value if so.
YH
v2: split out cap_4g to 61
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/Makefile | 3 +--
arch/x86/pci/amd_bus.c | 12 ------------
arch/x86/pci/bus_numa.h | 4 ++--
arch/x86/pci/i386.c | 4 ----
4 files changed, 3 insertions(+), 20 deletions(-)
@@ -348,14 +344,6 @@ static int __init early_fill_mp_bus_info
-v2: remove not needed cast
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/amd_bus.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -82,8 +82,8 @@ static int __init early_fill_mp_bus_info
struct pci_root_info *info;
u32 reg;
struct resource *res;
- size_t start;
- size_t end;
+ u64 start;
+ u64 end;
struct range range[RANGE_NUM];
u64 val;
u32 address;
@@ -173,7 +173,7 @@ static int __init early_fill_mp_bus_info
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/* kernel only handle 16 bit only */
if (end > 0xffff)
@@ -207,7 +207,7 @@ static int __init early_fill_mp_bus_info
address = MSR_K8_TOP_MEM1;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
if (end < (1ULL<<32))
subtract_range(range, RANGE_NUM, 0, end - 1);
@@ -246,7 +246,7 @@ static int __init early_fill_mp_bus_info
info = &pci_root_info[j];
printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
- node, link, (u64)start, (u64)end);
+ node, link, start, end);
/*
* some sick allocation would have range overlap with fam10h
* mmconf range, so need to update start and end.
@@ -272,13 +272,13 @@ static int __init early_fill_mp_bus_info
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
subtract_range(range, RANGE_NUM, start, endx);
- printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
+ printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
}
if (changed) {
if (start <= end) {
- printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end);
+ printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", start, end);
} else {
printk(KERN_CONT "%s\n", endx?"":" ==> none");
continue;
@@ -301,7 +301,7 @@ static int __init early_fill_mp_bus_info
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
- printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
}
How about:
/* Clamp a 64-bit value to the largest value resource_size_t can hold. */
static inline resource_size_t cap_resource(u64 val)
{
	const resource_size_t limit = ~(resource_size_t)0;

	return val > limit ? limit : val;
}
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/amd_bus.c | 7 ++++---
arch/x86/pci/bus_numa.c | 4 ++++
arch/x86/pci/intel_bus.c | 5 ++++-
include/linux/range.h | 10 ++++++++++
4 files changed, 22 insertions(+), 4 deletions(-)
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -201,7 +201,7 @@ static int __init early_fill_mp_bus_info
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = (0xfdULL<<32) - 1;
+ range[0].end = cap_4g((0xfdULL<<32) - 1);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
@@ -286,7 +286,7 @@ static int __init early_fill_mp_bus_info
}
}
- update_res(info, start, end, IORESOURCE_MEM, 1);
+ update_res(info, cap_4g(start), cap_4g(end), IORESOURCE_MEM, 1);
subtract_range(range, RANGE_NUM, start, end);
printk(KERN_CONT "\n");
}
@@ -321,7 +321,8 @@ static int __init early_fill_mp_bus_info
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, cap_4g(range[i].start),
+ cap_4g(range[i].end),
IORESOURCE_MEM, 1);
}
}
Index: linux-2.6/arch/x86/pci/bus_numa.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.c
+++ linux-2.6/arch/x86/pci/bus_numa.c
@@ -55,6 +55,10 @@ void __init update_res(struct pci_root_i
if (start > end)
return;
+ if (sizeof(resource_size_t) < sizeof(u64) &&
+ start == 0xffffffff)
+ return;
+
if (!merge)
goto addit;
Index: linux-2.6/arch/x86/pci/intel_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/intel_bus.c
+++ linux-2.6/arch/x86/pci/intel_bus.c
@@ -6,6 +6,8 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/init.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
#include "bus_numa.h"
@@ -81,7 +83,8 @@ static void __devinit pci_root_bus_res(s
mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
- update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
+ update_res(info, cap_4g(mmioh_base), cap_4g(mmioh_end),
+ IORESOURCE_MEM, 0);
print_ioh_resources(info);
}
Index: linux-2.6/include/linux/range.h
===================================================================
--- linux-2.6.orig/include/linux/range.h
+++ linux-2.6/include/linux/range.h
@@ -19,4 +19,14 @@ int clean_sort_range(struct range *range
void sort_range(struct range *range, int nr_range);
+static inline u64 cap_4g(u64 val)	/* clamp val to the 32-bit resource range */
+{
+	if (sizeof(resource_size_t) >= sizeof(u64))
+		return val;	/* resource_size_t is as wide as u64: nothing to cap */
+
+	if (val < (1ULL << 32))	/* was "1ULL", which capped every non-zero value */
+		return val;	/* already below 4G: keep unchanged */
+
+	return 0xffffffff;	/* at or above 4G: saturate at the last byte below 4G */
+}
#endif
ok
Stylistically I guess the idiom:
(resource_size_t)~0
... is better, not that it matters in this case, but the (type)~0 idiom
gets the correct answer even when sizeof(type) < sizeof(int).
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
--
should be good for 32bit too.
-v3: cast res->start
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/Makefile | 3 +--
arch/x86/pci/amd_bus.c | 14 +-------------
arch/x86/pci/bus_numa.h | 4 ++--
arch/x86/pci/i386.c | 4 ----
arch/x86/pci/intel_bus.c | 2 +-
5 files changed, 5 insertions(+), 22 deletions(-)
@@ -342,21 +338,13 @@ static int __init early_fill_mp_bus_info
printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n",
busnum, j,
(res->flags & IORESOURCE_IO)?"io port":"mmio",
- res->start, res->end);
+ (u64)res->start, (u64)res->end);
Index: linux-2.6/arch/x86/pci/intel_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/intel_bus.c
+++ linux-2.6/arch/x86/pci/intel_bus.c
@@ -30,7 +30,7 @@ static inline void print_ioh_resources(s
busnum, i,
(res->flags & IORESOURCE_IO) ? "io port" :
"mmio",
- res->start, res->end);
+ (u64)res->start, (u64)res->end);
-v2: hpa said we should compare with (resource_size_t)~0
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/pci/amd_bus.c | 8 +++++---
arch/x86/pci/bus_numa.c | 3 +++
arch/x86/pci/intel_bus.c | 5 ++++-
include/linux/range.h | 8 ++++++++
4 files changed, 20 insertions(+), 4 deletions(-)
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -201,7 +201,7 @@ static int __init early_fill_mp_bus_info
memset(range, 0, sizeof(range));
/* 0xfd00000000-0xffffffffff for HT */
- range[0].end = (0xfdULL<<32) - 1;
+ range[0].end = cap_resource((0xfdULL<<32) - 1);
/* need to take out [0, TOM) for RAM*/
address = MSR_K8_TOP_MEM1;
@@ -286,7 +286,8 @@ static int __init early_fill_mp_bus_info
}
}
- update_res(info, start, end, IORESOURCE_MEM, 1);
+ update_res(info, cap_resource(start), cap_resource(end),
+ IORESOURCE_MEM, 1);
subtract_range(range, RANGE_NUM, start, end);
printk(KERN_CONT "\n");
}
@@ -321,7 +322,8 @@ static int __init early_fill_mp_bus_info
if (!range[i].end)
continue;
- update_res(info, range[i].start, range[i].end,
+ update_res(info, cap_resource(range[i].start),
+ cap_resource(range[i].end),
IORESOURCE_MEM, 1);
}
}
Index: linux-2.6/arch/x86/pci/bus_numa.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/bus_numa.c
+++ linux-2.6/arch/x86/pci/bus_numa.c
@@ -55,6 +55,9 @@ void __init update_res(struct pci_root_i
if (start > end)
return;
+ if (start == (resource_size_t)~0)
+ return;
+
if (!merge)
goto addit;
Index: linux-2.6/arch/x86/pci/intel_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/intel_bus.c
+++ linux-2.6/arch/x86/pci/intel_bus.c
@@ -6,6 +6,8 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/init.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
#include "bus_numa.h"
@@ -81,7 +83,8 @@ static void __devinit pci_root_bus_res(s
mmioh_base |= ((u64)(dword & 0x7ffff)) << 32;
pci_read_config_dword(dev, IOH_LMMIOH_LIMITU, &dword);
mmioh_end |= ((u64)(dword & 0x7ffff)) << 32;
- update_res(info, mmioh_base, mmioh_end, IORESOURCE_MEM, 0);
+ update_res(info, cap_resource(mmioh_base), cap_resource(mmioh_end),
+ IORESOURCE_MEM, 0);
print_ioh_resources(info);
}
Index: linux-2.6/include/linux/range.h
===================================================================
--- linux-2.6.orig/include/linux/range.h
+++ linux-2.6/include/linux/range.h
@@ -19,4 +19,12 @@ int clean_sort_range(struct range *range
void sort_range(struct range *range, int nr_range);
+
+static inline resource_size_t cap_resource(u64 val)
+{
+	/* Saturate at the largest value a resource_size_t can hold. */
+	const resource_size_t limit = (resource_size_t)~0;
+
+	return val > limit ? limit : val;
+}
#endif
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 15 deletions(-)
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -724,14 +724,18 @@ core_initcall(e820_mark_nvs_memory);
/*
* Early reserved memory areas.
*/
-#define MAX_EARLY_RES 32
+/*
+ * need to make sure this one is bigger enough before
+ * find_e820_area could be used
+ */
+#define MAX_EARLY_RES_X 32
struct early_res {
u64 start, end;
- char name[16];
+ char name[15];
char overlap_ok;
};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
#ifdef CONFIG_X86_32
/*
@@ -745,12 +749,22 @@ static struct early_res early_res[MAX_EA
{}
};
+static int max_early_res __initdata = MAX_EARLY_RES_X;
+static struct early_res *early_res __initdata = &early_res_x[0];
+static int early_res_count __initdata =
+#ifdef CONFIG_X86_32
+ 2
+#else
+ 1
+#endif
+ ;
+
static int __init find_overlapped_early(u64 start, u64 end)
{
int i;
struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
r = &early_res[i];
if (end > r->start && start < r->end)
break;
@@ -768,13 +782,14 @@ static void __init drop_range(int i)
{
int j;
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ for (j = i + 1; j < max_early_res && early_res[j].end; j++)
;
memmove(&early_res[i], &early_res[i + 1],
(j - 1 - i) * sizeof(struct early_res));
early_res[j - 1].end = 0;
+ early_res_count--;
}
/*
@@ -793,9 +808,9 @@ static void __init drop_overlaps_that_ar
struct early_res *r;
u64 lower_start, lower_end;
u64 upper_start, upper_end;
- char name[16];
+ char name[15];
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
r = &early_res[i];
/* Continue past non-overlapping ranges */
@@ -851,7 +866,7 @@ static void __init __reserve_early(u64 s
struct early_res *r;
i = find_overlapped_early(start, end);
- if (i >= MAX_EARLY_RES)
+ if (i >= max_early_res)
panic("Too many early reservations");
r = &early_res[i];
if (r->end)
@@ -864,6 +879,7 @@ static void __init __reserve_early(u64 s
r->overlap_ok = overlap_ok;
if (name)
strncpy(r->name, name, sizeof(r->name) - 1);
+ early_res_count++;
}
/*
@@ -916,7 +932,7 @@ void __init free_early(u64 start, u64 en
i = find_overlapped_early(start, end);
r = &early_res[i];
- if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ if (i >= max_early_res || r->end != end || r->start != start)
panic("free_early on not reserved area: %llx-%llx!",
start, end - 1);
@@ -927,14 +943,15 @@ void __init early_res_to_bootmem(u64 sta
{
int i, count;
u64 final_start, final_end;
+ int idx = 0;
count = 0;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
+ printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count - idx, max_early_res, start, end);
+ for (i = idx; i < count; i++) {
struct early_res *r = &early_res[i];
printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
r->start, r->end, r->name);
@@ -961,7 +978,7 @@ static inline int __init bad_addr(u64 *a
again:
i = find_overlapped_early(addr, addr + size);
r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
+ if (i < max_early_res && r->end) {
*addrp = addr = round_up(r->end, align);
changed = 1;
goto again;
@@ -978,7 +995,7 @@ static inline int __init bad_addr_size(u
int changed = 0;
again:
last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
if (last > r->start && addr < r->start) {
size = r->start - addr;
so we can keep the boundary between early_res and bootmem clearer,
and call the conversion only one time instead of once for every node.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/setup.c | 1
arch/x86/mm/init_32.c | 1
arch/x86/mm/init_64.c | 3 --
arch/x86/mm/numa_64.c | 62 +++++++++++++++---------------------------------
4 files changed, 22 insertions(+), 45 deletions(-)
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -942,6 +942,7 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
#ifdef CONFIG_X86_64
/*
Index: linux-2.6/arch/x86/mm/init_32.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_32.c
+++ linux-2.6/arch/x86/mm/init_32.c
@@ -764,7 +764,6 @@ static unsigned long __init setup_node_b
printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
nodeid, bootmap, bootmap + bootmap_size);
free_bootmem_with_active_regions(nodeid, end_pfn);
- early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
return bootmap + bootmap_size;
}
Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c
+++ linux-2.6/arch/x86/mm/init_64.c
@@ -578,13 +578,12 @@ void __init initmem_init(unsigned long s
PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
- early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
- reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -164,18 +164,21 @@ static void * __init early_node_mem(int
unsigned long align)
{
unsigned long mem = find_e820_area(start, end, size, align);
- void *ptr;
if (mem != -1L)
return __va(mem);
- ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
- if (ptr == NULL) {
- printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+
+ start = __pa(MAX_DMA_ADDRESS);
+ end = max_low_pfn_mapped << PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
+ if (mem != -1L)
+ return __va(mem);
+
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
size, nodeid);
- return NULL;
- }
- return ptr;
+
+ return NULL;
}
/* Initialize bootmem allocator for a node */
@@ -211,8 +214,12 @@ setup_node_bootmem(int nodeid, unsigned
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
+ reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
nodedata_phys + pgdat_size - 1);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
@@ -227,11 +234,7 @@ setup_node_bootmem(int nodeid, unsigned
* of alloc_bootmem, that could clash with reserved range
*/
bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
- nid = phys_to_nid(nodedata_phys);
- if (nid == nodeid)
- bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
- else
- bootmap_start = roundup(start, PAGE_SIZE);
+ bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
/*
* SMP_CACHE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -239,18 +242,13 @@ setup_node_bootmem(int nodeid, unsigned
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
- if (nodedata_phys < start || nodedata_phys >= end) {
- /*
- * only need to free it if it is from other node
- * bootmem
- */
- if (nid != nodeid)
- free_bootmem(nodedata_phys, pgdat_size);
- }
+ free_early(nodedata_phys, nodedata_phys + pgdat_size);
node_data[nodeid] = NULL;
return;
}
bootmap_start = __pa(bootmap);
+ reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+ "BOOTMAP");
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
@@ -259,31 +257,11 @@ setup_node_bootmem(int nodeid, unsigned
printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
bootmap_start, bootmap_start + bootmap_size - 1,
bootmap_pages);
-
- free_bootmem_with_active_regions(nodeid, end);
-
- /*
- * convert early reserve to bootmem reserve earlier
- * otherwise early_node_mem could use early reserved mem
- * on previous node
- */
- early_res_to_bootmem(start, end);
-
- /*
- * in some case early_node_mem could use alloc_bootmem
- * to get range on other node, don't reserve that again
- */
- if (nid != nodeid)
- printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
- pgdat_size, BOOTMEM_DEFAULT);
nid = phys_to_nid(bootmap_start);
if (nid != nodeid)
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
- else
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+ free_bootmem_with_active_regions(nodeid, end);
node_set_online(nodeid);
Use early_res_count to track the number of entries, and use find_e820_area() to get a new buffer,
then copy the old array into the new one.
Also clear early_res afterwards to prevent invalid use later.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/e820.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 54 insertions(+)
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -908,6 +908,49 @@ void __init reserve_early_overlap_ok(u64
__reserve_early(start, end, name, 1);
}
+static void __init __check_and_double_early_res(void)
+{
+ u64 size;
+ u64 mem;
+ struct early_res *new;
+
+ /* grow the early_res array when it is nearly full; nothing to do while more than max(max_early_res/8, 2) slots are free */
+ if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
+ return;
+
+ /* ask find_e820_area() for memory to hold twice as many entries */
+ size = sizeof(struct early_res) * max_early_res * 2;
+ mem = find_e820_area(0, max_pfn_mapped << PAGE_SHIFT, size,
+ sizeof(struct early_res));
+
+ if (mem == -1ULL)
+ panic("can not find more space for early_res array");
+
+ new = __va(mem);
+ /* slot 0 of the new array describes the array's own memory, [mem, mem + size) */
+ new[0].start = mem;
+ new[0].end = mem + size;
+ new[0].overlap_ok = 0;
+ /* copy the old entries in after slot 0 and zero the rest; an old array other than early_res_x has its own slot 0, which is not carried over */
+ if (early_res == early_res_x) {
+ memcpy(&new[1], &early_res[0],
+ sizeof(struct early_res) * max_early_res);
+ memset(&new[max_early_res+1], 0,
+ sizeof(struct early_res) * (max_early_res - 1));
+ early_res_count++;
+ } else {
+ memcpy(&new[1], &early_res[1],
+ sizeof(struct early_res) * (max_early_res - 1));
+ memset(&new[max_early_res], 0,
+ sizeof(struct early_res) * max_early_res);
+ }
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = new;
+ max_early_res *= 2;
+ printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n",
+ max_early_res, mem, mem + size - 1);
+}
+
/*
* Most early reservations come here.
*
@@ -921,6 +964,8 @@ void __init reserve_early(u64 start, u64
if (start >= end)
return;
+ __check_and_double_early_res();
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -949,6 +994,10 @@ void __init early_res_to_bootmem(u64 sta
for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
count - idx, max_early_res, start, end);
for (i = idx; i < count; i++) {
@@ -966,6 +1015,11 @@ void __init early_res_to_bootmem(u64 sta
reserve_bootmem_generic(final_start, final_end - final_start,
BOOTMEM_DEFAULT);
}
+ /* clear them */
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ early_res_count = 0;
}
/* Check for already reserved areas */
With this patch, the sparse vmemmap will not use RAM below 4G.
Before this patch, this is what we get right before swiotlb tries to get bootmem:
[ 0.000000] nid=1 start=0 end=2080000 aligned=1
[ 0.000000] free [10 - 96]
[ 0.000000] free [b12 - 1000]
[ 0.000000] free [359f - 38a3]
[ 0.000000] free [38b5 - 3a00]
[ 0.000000] free [41e01 - 42000]
[ 0.000000] free [73dde - 73e00]
[ 0.000000] free [73fdd - 74000]
[ 0.000000] free [741dd - 74200]
[ 0.000000] free [743dd - 74400]
[ 0.000000] free [745dd - 74600]
[ 0.000000] free [747dd - 74800]
[ 0.000000] free [749dd - 74a00]
[ 0.000000] free [74bdd - 74c00]
[ 0.000000] free [74ddd - 74e00]
[ 0.000000] free [74fdd - 75000]
[ 0.000000] free [751dd - 75200]
[ 0.000000] free [753dd - 75400]
[ 0.000000] free [755dd - 75600]
[ 0.000000] free [757dd - 75800]
[ 0.000000] free [759dd - 75a00]
[ 0.000000] free [75bdd - 7bf5f]
[ 0.000000] free [7f730 - 7f750]
[ 0.000000] free [100000 - 2080000]
[ 0.000000] total free 1f87170
[ 93.301474] Placing 64MB software IO TLB between ffff880075bdd000 - ffff880079bdd000
[ 93.311814] software IO TLB at phys 0x75bdd000 - 0x79bdd000
With this patch, this is what we get right before swiotlb tries to get bootmem:
[ 0.000000] nid=1 start=0 end=2080000 aligned=1
[ 0.000000] free [a - 96]
[ 0.000000] free [702 - 1000]
[ 0.000000] free [359f - 3600]
[ 0.000000] free [37de - 3800]
[ 0.000000] free [39dd - 3a00]
[ 0.000000] free [3bdd - 3c00]
[ 0.000000] free [3ddd - 3e00]
[ 0.000000] free [3fdd - 4000]
[ 0.000000] free [41dd - 4200]
[ 0.000000] free [43dd - 4400]
[ 0.000000] free [45dd - 4600]
[ 0.000000] free [47dd - 4800]
[ 0.000000] free [49dd - 4a00]
[ 0.000000] free [4bdd - 4c00]
[ 0.000000] free [4ddd - 4e00]
[ 0.000000] free [4fdd - 5000]
[ 0.000000] free [51dd - 5200]
[ 0.000000] free [53dd - 5400]
[ 0.000000] free [55dd - 7bf5f]
[ 0.000000] free [7f730 - 7f750]
[ 0.000000] free [100428 - 100600]
[ 0.000000] free [13ea01 - 13ec00]
[ 0.000000] free [170800 - 2080000]
[ 0.000000] total free 1f87170
[ 92.689485] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
[ 92.699799] Placing 64MB software IO TLB between ffff8800055dd000 - ffff8800095dd000
[ 92.710916] software IO TLB at phys 0x55dd000 - 0x95dd000
So we get enough space below 4G, i.e. below pfn 0x100000.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/mm/numa_64.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -163,14 +163,27 @@ static void * __init early_node_mem(int
unsigned long end, unsigned long size,
unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size, align);
+ unsigned long mem;
+ /*
+ * put it on high as possible
+ * something will go with NODE_DATA
+ */
+ if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+ end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
- start = __pa(MAX_DMA_ADDRESS);
- end = max_low_pfn_mapped << PAGE_SHIFT;
+ end = max_pfn_mapped << PAGE_SHIFT;
+ if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ else
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
x86: Increase MAX_EARLY_RES; insufficient on 32-bit NUMA
Due to recent changes to wakeup and mptable, we run out of early
reservations on 32-bit NUMA. Thus, increase the number available.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
LKML-Reference: <4B22D754...@kernel.org>
Signed-off-by: H. Peter Anvin <h...@zytor.com>
---
arch/x86/kernel/e820.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f50447d..05ed7ab 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
Prepare to allocate the early_res array from find_e820_area().
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 15 deletions(-)
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -724,14 +724,18 @@ core_initcall(e820_mark_nvs_memory);
/*
* Early reserved memory areas.
*/
-#define MAX_EARLY_RES 32
+/*
+ * need to make sure this one is big enough before
+ * find_e820_area can be used
+ */
+#define MAX_EARLY_RES_X 32
struct early_res {
u64 start, end;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
- printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
- count, start, end);
- for (i = 0; i < count; i++) {
+ printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+ count - idx, max_early_res, start, end);
+ for (i = idx; i < count; i++) {
struct early_res *r = &early_res[i];
printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
r->start, r->end, r->name);
@@ -961,7 +978,7 @@ static inline int __init bad_addr(u64 *a
again:
i = find_overlapped_early(addr, addr + size);
r = &early_res[i];
- if (i < MAX_EARLY_RES && r->end) {
+ if (i < max_early_res && r->end) {
*addrp = addr = round_up(r->end, align);
changed = 1;
goto again;
@@ -978,7 +995,7 @@ static inline int __init bad_addr_size(u
int changed = 0;
again:
last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ for (i = 0; i < max_early_res && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
if (last > r->start && addr < r->start) {
size = r->start - addr;
--
so we can keep the boundary between early_res and bootmem clearer,
and only call the conversion one time instead of once for every node.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -164,18 +164,21 @@ static void * __init early_node_mem(int
unsigned long align)
{
unsigned long mem = find_e820_area(start, end, size, align);
- void *ptr;
if (mem != -1L)
return __va(mem);
--
Also clear early_res to prevent invalid use later.
-v2: __check_and_double_early_res() should take the new start
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 53 insertions(+)
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -908,6 +908,48 @@ void __init reserve_early_overlap_ok(u64
__reserve_early(start, end, name, 1);
}
+static void __init __check_and_double_early_res(u64 start)
+{
+ u64 end, size, mem;
+ struct early_res *new;
+
+ /* do we have enough slots left ? */
+ if ((max_early_res - early_res_count) > max(max_early_res/8, 2))
+ return;
+
+ /* double it */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ size = sizeof(struct early_res) * max_early_res * 2;
+ mem = find_e820_area(start, end, size, sizeof(struct early_res));
@@ -921,6 +963,8 @@ void __init reserve_early(u64 start, u64
if (start >= end)
return;
+ __check_and_double_early_res(end);
+
drop_overlaps_that_are_ok(start, end);
__reserve_early(start, end, name, 0);
}
@@ -949,6 +993,10 @@ void __init early_res_to_bootmem(u64 sta
for (i = 0; i < max_early_res && early_res[i].end; i++)
count++;
+ /* need to skip first one ?*/
+ if (early_res != early_res_x)
+ idx = 1;
+
printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n",
count - idx, max_early_res, start, end);
for (i = idx; i < count; i++) {
@@ -966,6 +1014,11 @@ void __init early_res_to_bootmem(u64 sta
reserve_bootmem_generic(final_start, final_end - final_start,
BOOTMEM_DEFAULT);
}
+ /* clear them */
+ memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ early_res_count = 0;
}
/* Check for already reserved areas */
--
---
arch/x86/kernel/cpu/mtrr/cleanup.c | 180 +++----------------------------------
arch/x86/kernel/mmconf-fam10h_64.c | 7 -
arch/x86/pci/amd_bus.c | 70 ++------------
include/linux/range.h | 22 ++++
kernel/Makefile | 2
kernel/range.c | 154 +++++++++++++++++++++++++++++++
6 files changed, 205 insertions(+), 230 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/mtrr/cleanup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ linux-2.6/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
#include <linux/pci.h>
#include <linux/smp.h>
#include <linux/cpu.h>
-#include <linux/sort.h>
#include <linux/mutex.h>
#include <linux/uaccess.h>
#include <linux/kvm_para.h>
+#include <linux/range.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -34,11 +34,6 @@
#include "mtrr.h"
-struct res_range {
- unsigned long start;
- unsigned long end;
-};
-
struct var_mtrr_range_state {
unsigned long base_pfn;
unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
/* Should be related to MTRR_VAR_RANGES nums */
#define RANGE_NUM 256
-static struct res_range __initdata range[RANGE_NUM];
+static struct range __initdata range[RANGE_NUM];
static int __initdata nr_range;
static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __ini
static int __initdata debug_print;
#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
-
-static int __init
-add_range(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- /* Out of slots: */
- if (nr_range >= RANGE_NUM)
- return nr_range;
-
- range[nr_range].start = start;
- range[nr_range].end = end;
-
- nr_range++;
-
- return nr_range;
-}
-
-static int __init
-add_range_with_merge(struct res_range *range, int nr_range,
- unsigned long start, unsigned long end)
-{
- int i;
-
- /* Try to merge it with old one: */
- for (i = 0; i < nr_range; i++) {
- unsigned long final_start, final_end;
- unsigned long common_start, common_end;
-
- if (!range[i].end)
- continue;
-
- common_start = max(range[i].start, start);
- common_end = min(range[i].end, end);
- if (common_start > common_end + 1)
- continue;
-
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
-
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
- }
-
- /* Need to add it: */
- return add_range(range, nr_range, start, end);
-}
-
-static void __init
-subtract_range(struct res_range *range, unsigned long start, unsigned long end)
-{
- int i, j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end &&
- range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end &&
- range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* Find the new spare: */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
-static int __init cmp_range(const void *x1, const void *x2)
-{
- const struct res_range *r1 = x1;
- const struct res_range *r2 = x2;
- long start1, start2;
-
- start1 = r1->start;
- start2 = r2->start;
-
- return start1 - start2;
-}
-
-static int __init clean_sort_range(struct res_range *range, int az)
-{
- int i, j, k = az - 1, nr_range = 0;
-
- for (i = 0; i < k; i++) {
- if (range[i].end)
- continue;
- for (j = k; j > i; j--) {
- if (range[j].end) {
- k = j;
- break;
- }
- }
- if (j == i)
- break;
- range[i].start = range[k].start;
- range[i].end = range[k].end;
- range[k].start = 0;
- range[k].end = 0;
- k--;
- }
- /* count it */
- for (i = 0; i < az; i++) {
- if (!range[i].end) {
- nr_range = i;
- break;
- }
- }
-
- /* sort them */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
-
- return nr_range;
-}
-
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
static int __init
-x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+x86_get_mtrr_mem_range(struct range *range, int nr_range,
unsigned long extra_remove_base,
unsigned long extra_remove_size)
{
@@ -223,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range
continue;
base = range_state[i].base_pfn;
size = range_state[i].size_pfn;
- nr_range = add_range_with_merge(range, nr_range, base,
- base + size - 1);
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
+ base, base + size - 1);
}
if (debug_print) {
printk(KERN_DEBUG "After WB checking\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
@@ -252,10 +106,10 @@ x86_get_mtrr_mem_range(struct res_range
size -= (1<<(20-PAGE_SHIFT)) - base;
base = 1<<(20-PAGE_SHIFT);
}
- subtract_range(range, base, base + size - 1);
+ subtract_range(range, RANGE_NUM, base, base + size - 1);
}
if (extra_remove_size)
- subtract_range(range, extra_remove_base,
+ subtract_range(range, RANGE_NUM, extra_remove_base,
extra_remove_base + extra_remove_size - 1);
if (debug_print) {
@@ -263,7 +117,7 @@ x86_get_mtrr_mem_range(struct res_range
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
}
@@ -273,20 +127,16 @@ x86_get_mtrr_mem_range(struct res_range
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
- printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
range[i].start, range[i].end + 1);
}
- /* clear those is not used */
- for (i = nr_range; i < RANGE_NUM; i++)
- memset(&range[i], 0, sizeof(range[i]));
-
return nr_range;
}
#ifdef CONFIG_MTRR_SANITIZER
-static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+static unsigned long __init sum_ranges(struct range *range, int nr_range)
{
unsigned long sum = 0;
int i;
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(c
early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
static int __init
-x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+x86_setup_var_mtrrs(struct range *range, int nr_range,
u64 chunk_size, u64 gran_size)
{
struct var_mtrr_state var_state;
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u6
unsigned long x_remove_base,
unsigned long x_remove_size, int i)
{
- static struct res_range range_new[RANGE_NUM];
+ static struct range range_new[RANGE_NUM];
unsigned long range_sums_new;
static int nr_range_new;
int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address
* [0, 1M) should always be covered by var mtrr with WB
* and fixed mtrrs should take effect before var mtrr for it:
*/
- nr_range = add_range_with_merge(range, nr_range, 0,
+ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
(1ULL<<(20 - PAGE_SHIFT)) - 1);
/* Sort the ranges: */
- sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ sort_range(range, nr_range);
range_sums = sum_ranges(range, nr_range);
printk(KERN_INFO "total RAM covered: %ldM\n",
Index: linux-2.6/include/linux/range.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/range.h
@@ -0,0 +1,22 @@
+#ifndef _LINUX_RANGE_H
+#define _LINUX_RANGE_H
+
+struct range {
+ u64 start;
+ u64 end;
+};
+
+int add_range(struct range *range, int az, int nr_range,
+ u64 start, u64 end);
+
+
+int add_range_with_merge(struct range *range, int az, int nr_range,
+ u64 start, u64 end);
+
+void subtract_range(struct range *range, int az, u64 start, u64 end);
+
+int clean_sort_range(struct range *range, int az);
+
+void sort_range(struct range *range, int nr_range);
+
+#endif
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile
+++ linux-2.6/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
- async.o
+ async.o range.o
obj-y += groups.o
ifdef CONFIG_FUNCTION_TRACER
Index: linux-2.6/kernel/range.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/range.c
@@ -0,0 +1,154 @@
+/*
+ * Range add and subtract
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sort.h>
+
+#include <linux/range.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
+{
+ /* Out of slots: */
+ if (nr_range >= az)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+int add_range_with_merge(struct range *range, int az, int nr_range,
+ u64 start, u64 end)
+{
+ int i;
+
+ /* Try to merge it with old one: */
+ for (i = 0; i < nr_range; i++) {
+ u64 final_start, final_end;
+ u64 common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* Need to add it: */
+ return add_range(range, az, nr_range, start, end);
+}
+
+void subtract_range(struct range *range, int az, u64 start, u64 end)
+{
+ int i, j;
+
+ for (j = 0; j < az; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* Find the new spare: */
+ for (i = 0; i < az; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < az) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int cmp_range(const void *x1, const void *x2)
+{
+ const struct range *r1 = x1;
+ const struct range *r2 = x2;
+
+ /* Order by start. Compare rather than subtract: a 64-bit difference
+ * truncated to the int return value can become 0 or change sign. */
+ if (r1->start < r2->start)
+ return -1;
+ return r1->start > r2->start;
+}
+
+int clean_sort_range(struct range *range, int az)
+{
+ int i, j, k = az - 1, nr_range = 0;
+
+ for (i = 0; i < k; i++) {
+ if (range[i].end)
+ continue;
+ for (j = k; j > i; j--) {
+ if (range[j].end) {
+ k = j;
+ break;
+ }
+ }
+ if (j == i)
+ break;
+ range[i].start = range[k].start;
+ range[i].end = range[k].end;
+ range[k].start = 0;
+ range[k].end = 0;
+ k--;
+ }
+ /* count it */
+ for (i = 0; i < az; i++) {
+ if (!range[i].end) {
+ nr_range = i;
+ break;
+ }
+ }
+
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+
+ return nr_range;
+}
+
+void sort_range(struct range *range, int nr_range)
+{
+ /* sort them */
+ sort(range, nr_range, sizeof(struct range), cmp_range, NULL);
+}
Index: linux-2.6/arch/x86/kernel/mmconf-fam10h_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/mmconf-fam10h_64.c
+++ linux-2.6/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/dmi.h>
+#include <linux/range.h>
+
#include <asm/pci-direct.h>
#include <linux/sort.h>
#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_p
{ 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
};
-struct range {
- u64 start;
- u64 end;
-};
-
static int __cpuinit cmp_range(const void *x1, const void *x2)
{
const struct range *r1 = x1;
Index: linux-2.6/arch/x86/pci/amd_bus.c
===================================================================
--- linux-2.6.orig/arch/x86/pci/amd_bus.c
+++ linux-2.6/arch/x86/pci/amd_bus.c
@@ -2,6 +2,8 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
+#include <linux/range.h>
+
#include <asm/pci_x86.h>
#ifdef CONFIG_X86_64
@@ -17,58 +19,6 @@
#ifdef CONFIG_X86_64
-#define RANGE_NUM 16
-
-struct res_range {
- size_t start;
- size_t end;
-};
-
-static void __init update_range(struct res_range *range, size_t start,
- size_t end)
-{
- int i;
- int j;
-
- for (j = 0; j < RANGE_NUM; j++) {
- if (!range[j].end)
- continue;
-
- if (start <= range[j].start && end >= range[j].end) {
- range[j].start = 0;
- range[j].end = 0;
- continue;
- }
-
- if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
- range[j].start = end + 1;
- continue;
- }
-
-
- if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
- range[j].end = start - 1;
- continue;
- }
-
- if (start > range[j].start && end < range[j].end) {
- /* find the new spare */
- for (i = 0; i < RANGE_NUM; i++) {
- if (range[i].end == 0)
- break;
- }
- if (i < RANGE_NUM) {
- range[i].end = range[j].end;
- range[i].start = end + 1;
- } else {
- printk(KERN_ERR "run of slot in ranges\n");
- }
- range[j].end = start - 1;
- continue;
- }
- }
-}
-
struct pci_hostbridge_probe {
u32 bus;
u32 slot;
@@ -111,6 +61,8 @@ static void __init get_pci_mmcfg_amd_fam
fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
}
+#define RANGE_NUM 16
+
/**
* early_fill_mp_bus_to_node()
* called before pcibios_scan_root and pci_scan_bus
@@ -132,7 +84,7 @@ static int __init early_fill_mp_bus_info
struct resource *res;
size_t start;
size_t end;
- struct res_range range[RANGE_NUM];
+ struct range range[RANGE_NUM];
u64 val;
u32 address;
@@ -226,7 +178,7 @@ static int __init early_fill_mp_bus_info
if (end > 0xffff)
end = 0xffff;
update_res(info, start, end, IORESOURCE_IO, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end);
}
/* add left over io port range to def node/link, [0, 0xffff] */
/* find the position */
@@ -256,14 +208,14 @@ static int __init early_fill_mp_bus_info
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20);
if (end < (1ULL<<32))
- update_range(range, 0, end - 1);
+ subtract_range(range, RANGE_NUM, 0, end - 1);
/* get mmconfig */
get_pci_mmcfg_amd_fam10h_range();
/* need to take out mmconf range */
if (fam10h_mmconf_end) {
printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
- update_range(range, fam10h_mmconf_start, fam10h_mmconf_end);
+ subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end);
}
/* mmio resource */
@@ -318,7 +270,7 @@ static int __init early_fill_mp_bus_info
/* we got a hole */
endx = fam10h_mmconf_start - 1;
update_res(info, start, endx, IORESOURCE_MEM, 0);
- update_range(range, start, endx);
+ subtract_range(range, RANGE_NUM, start, endx);
printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx);
start = fam10h_mmconf_end + 1;
changed = 1;
@@ -334,7 +286,7 @@ static int __init early_fill_mp_bus_info
}
update_res(info, start, end, IORESOURCE_MEM, 1);
- update_range(range, start, end);
+ subtract_range(range, RANGE_NUM, start, end);
printk(KERN_CONT "\n");
}
@@ -349,7 +301,7 @@ static int __init early_fill_mp_bus_info
rdmsrl(address, val);
end = (val & 0xffffff800000ULL);
printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20);
- update_range(range, 1ULL<<32, end - 1);
+ subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1);
}
/*
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/include/asm/pci.h | 2 ++
arch/x86/include/asm/pci_64.h | 2 --
arch/x86/kernel/pci-dma.c | 13 ++++++++++---
arch/x86/kernel/setup.c | 7 -------
4 files changed, 12 insertions(+), 12 deletions(-)
Index: linux-2.6/arch/x86/kernel/pci-dma.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/pci-dma.c
+++ linux-2.6/arch/x86/kernel/pci-dma.c
@@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64
}
EXPORT_SYMBOL(dma_set_mask);
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
static __initdata void *dma32_bootmem_ptr;
static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
@@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(vo
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
}
+#else
+void __init dma32_reserve_bootmem(void)
+{
+}
+static void __init dma32_free_bootmem(void)
+{
+}
+
#endif
void __init pci_iommu_alloc(void)
{
-#ifdef CONFIG_X86_64
/* free the range so iommu could get some range less than 4G */
dma32_free_bootmem();
-#endif
+
if (pci_swiotlb_detect())
goto out;
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -944,14 +944,7 @@ void __init setup_arch(char **cmdline_p)
initmem_init(0, max_pfn, acpi, k8);
early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-#ifdef CONFIG_X86_64
- /*
- * dma32_reserve_bootmem() allocates bootmem which may conflict
- * with the crashkernel command line, so do that after
- * reserve_crashkernel()
- */
dma32_reserve_bootmem();
-#endif
reserve_ibft_region();
Index: linux-2.6/arch/x86/include/asm/pci.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/pci.h
+++ linux-2.6/arch/x86/include/asm/pci.h
@@ -124,6 +124,8 @@ extern void pci_iommu_alloc(void);
#include "pci_64.h"
#endif
+void dma32_reserve_bootmem(void);
+
/* implement the pci_ DMA API in terms of the generic device dma_ one */
#include <asm-generic/pci-dma-compat.h>
Index: linux-2.6/arch/x86/include/asm/pci_64.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/pci_64.h
+++ linux-2.6/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, i
extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
int reg, int len, u32 value);
-extern void dma32_reserve_bootmem(void);
-
#endif /* __KERNEL__ */
#endif /* _ASM_X86_PCI_64_H */
So we can double-check later whether we have enough low pages.
-v2: fix errors checkpatch.pl reported
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/mm/init_64.c | 2 +
include/linux/bootmem.h | 2 +
mm/bootmem.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 96 insertions(+)
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -267,6 +267,98 @@ static void __init __free(bootmem_data_t
BUG();
}
+static void __init print_all_bootmem_free_core(bootmem_data_t *bdata)
+{
+ int aligned;
+ unsigned long *map;
+ unsigned long start, end, count = 0;
+ unsigned long free_start = -1UL, free_end = 0;
+
+ if (!bdata->node_bootmem_map)
+ return;
+
+ start = bdata->node_min_pfn;
+ end = bdata->node_low_pfn;
+
+ /*
+ * If the start is aligned to the machines wordsize, we might
+ * be able to count it in bulks of that order.
+ */
+ aligned = !(start & (BITS_PER_LONG - 1));
+
+ printk(KERN_DEBUG "nid=%td start=0x%010lx end=0x%010lx aligned=%d\n",
+ bdata - bootmem_node_data, start, end, aligned);
+ map = bdata->node_bootmem_map;
+
+ while (start < end) {
+ unsigned long idx, vec;
+
+ idx = start - bdata->node_min_pfn;
+ vec = ~map[idx / BITS_PER_LONG];
+
+ if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
+ if (free_start == -1UL) {
+ free_start = idx;
+ free_end = free_start + BITS_PER_LONG;
+ } else {
+ if (free_end == idx) {
+ free_end += BITS_PER_LONG;
+ } else {
+ /* there is gap, print old */
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ free_start = idx;
+ free_end = idx + BITS_PER_LONG;
+ }
+ }
+ count += BITS_PER_LONG;
+ } else {
+ unsigned long off = 0;
+
+ while (vec && off < BITS_PER_LONG) {
+ if (vec & 1) {
+ if (free_start == -1UL) {
+ free_start = idx + off;
+ free_end = free_start + 1;
+ } else {
+ if (free_end == (idx + off)) {
+ free_end++;
+ } else {
+ /* there is gap, print old */
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ free_start = idx + off;
+ free_end = free_start + 1;
+ }
+ }
+ count++;
+ }
+ vec >>= 1;
+ off++;
+ }
+ }
+ start += BITS_PER_LONG;
+ }
+
+ /* last one */
+ if (free_start != -1UL)
+ printk(KERN_DEBUG " free [0x%010lx - 0x%010lx]\n",
+ free_start + bdata->node_min_pfn,
+ free_end + bdata->node_min_pfn);
+ printk(KERN_DEBUG " total free 0x%010lx\n", count);
+}
+
+void __init print_bootmem_free(void)
+{
+ bootmem_data_t *bdata;
+
+ list_for_each_entry(bdata, &bdata_list, list) {
+ print_all_bootmem_free_core(bdata);
+ }
+}
+
static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
unsigned long eidx, int flags)
{
Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c
+++ linux-2.6/arch/x86/mm/init_64.c
@@ -654,6 +654,8 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
+ print_bootmem_free();
+
pci_iommu_alloc();
/* clear_bss() already clear the empty_zero_page */
Index: linux-2.6/include/linux/bootmem.h
===================================================================
--- linux-2.6.orig/include/linux/bootmem.h
+++ linux-2.6/include/linux/bootmem.h
@@ -55,6 +55,8 @@ extern void free_bootmem_node(pg_data_t
extern void free_bootmem(unsigned long addr, unsigned long size);
extern void free_bootmem_late(unsigned long addr, unsigned long size);
+void print_bootmem_free(void);
+
/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
* the architecture-specific code should honor this).
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/mm/numa_64.c | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -163,14 +163,27 @@ static void * __init early_node_mem(int
unsigned long end, unsigned long size,
unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size, align);
+ unsigned long mem;
+ /*
+ * put it on high as possible
+ * something will go with NODE_DATA
+ */
+ if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
+ if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+ end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
-
- start = __pa(MAX_DMA_ADDRESS);
- end = max_low_pfn_mapped << PAGE_SHIFT;
+ /* extend the search scope */
+ end = max_pfn_mapped << PAGE_SHIFT;
+ if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+ start = MAX_DMA32_PFN<<PAGE_SHIFT;
+ else
+ start = MAX_DMA_PFN<<PAGE_SHIFT;
mem = find_e820_area(start, end, size, align);
if (mem != -1L)
return __va(mem);
--
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/Kconfig | 7 +
arch/x86/include/asm/e820.h | 6 +
arch/x86/kernel/e820.c | 144 +++++++++++++++++++++++++++++++---
arch/x86/kernel/setup.c | 2
arch/x86/mm/init_64.c | 7 +
arch/x86/mm/numa_64.c | 20 +++-
include/linux/bootmem.h | 7 +
include/linux/mm.h | 5 +
include/linux/mmzone.h | 2
mm/bootmem.c | 182 +++++++++++++++++++++++++++++++++++++++++++-
mm/page_alloc.c | 53 ++++++++++++
mm/percpu.c | 3
mm/sparse-vmemmap.c | 2
13 files changed, 416 insertions(+), 24 deletions(-)
Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -565,6 +565,13 @@ config PARAVIRT_DEBUG
Enable to debug paravirt_ops internals. Specifically, BUG if
a paravirt_op is missing when it is called.
+config NO_BOOTMEM
+ default y
+ bool "Disable Bootmem code"
+ depends on X86_64
+ ---help---
+ use early_res directly instead of bootmem before slab is ready.
+
config MEMTEST
bool "Memtest"
---help---
Index: linux-2.6/arch/x86/include/asm/e820.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/e820.h
+++ linux-2.6/arch/x86/include/asm/e820.h
@@ -117,6 +117,12 @@ extern void free_early(u64 start, u64 en
extern void early_res_to_bootmem(u64 start, u64 end);
extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+void reserve_early_without_check(u64 start, u64 end, char *name);
+u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+ u64 size, u64 align);
+#include <linux/range.h>
+int get_free_all_memory_range(struct range **rangep, int nodeid);
+
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
extern int e820_find_active_region(const struct e820entry *ei,
Index: linux-2.6/arch/x86/kernel/e820.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820.c
+++ linux-2.6/arch/x86/kernel/e820.c
@@ -969,6 +969,25 @@ void __init reserve_early(u64 start, u64
__reserve_early(start, end, name, 0);
}
+/*
+ * Append [start, end) to the early_res table without looking for overlap
+ * with the entries already recorded.  @name may be NULL, in which case
+ * the name field of the slot is left untouched.
+ */
+void __init reserve_early_without_check(u64 start, u64 end, char *name)
+{
+	struct early_res *slot;
+
+	/* an empty or inverted range has nothing to record */
+	if (end <= start)
+		return;
+
+	/* must run before the slot is picked: see its definition for what it may change */
+	__check_and_double_early_res(end);
+
+	slot = early_res + early_res_count++;
+	slot->start = start;
+	slot->end = end;
+	slot->overlap_ok = 0;
+	/* copy at most sizeof(name) - 1 bytes so the last byte is never written */
+	if (name != NULL)
+		strncpy(slot->name, name, sizeof(slot->name) - 1);
+}
+
void __init free_early(u64 start, u64 end)
{
struct early_res *r;
@@ -983,6 +1002,79 @@ void __init free_early(u64 start, u64 en
drop_range(i);
}
+#ifdef CONFIG_NO_BOOTMEM
+/*
+ * Remove the page frames covered by the early_res reservations from the
+ * @az-slot @range array.  Each reservation is widened to whole pages
+ * (start rounded down, end rounded up) before it is subtracted.
+ */
+static void __init subtract_early_res(struct range *range, int az)
+{
+	int used = 0;
+	int i;
+
+	/* entries in use are those before the first one with a zero end */
+	while (used < max_early_res && early_res[used].end)
+		used++;
+
+	/* need to skip first one ?*/
+	i = (early_res != early_res_x) ? 1 : 0;
+
+	for (; i < used; i++) {
+		u64 first_pfn = PFN_DOWN(early_res[i].start);
+		u64 end_pfn = PFN_UP(early_res[i].end);
+
+		/* subtract_range() is handed an inclusive last pfn */
+		if (first_pfn < end_pfn)
+			subtract_range(range, az, first_pfn, end_pfn - 1);
+	}
+}
+
+/*
+ * Build an array of ranges for @nodeid: the node's early_node_map ranges
+ * with the early_res reservations subtracted.  Range values are page
+ * frame numbers.  The array is handed back through @rangep; the return
+ * value is whatever clean_sort_range() reports for it (presumably the
+ * number of valid entries -- confirm against its definition).
+ * Panics when no room for the array can be found.
+ */
+int __init get_free_all_memory_range(struct range **rangep, int nodeid)
+{
+ int i, count;
+ u64 start = 0, end;
+ u64 size;
+ u64 mem;
+ struct range *range;
+ int nr_range;
+
+ /* count the early_res entries in use (those before the first zero end) */
+ count = 0;
+ for (i = 0; i < max_early_res && early_res[i].end; i++)
+ count++;
+
+ /*
+ * two slots per reservation -- presumably because subtracting from
+ * the middle of a range can split it in two; TODO confirm
+ */
+ count *= 2;
+
+ size = sizeof(struct range) * count;
+ /* search below max_pfn_mapped, and above MAX_DMA32_PFN when mapped that far */
+ if (max_pfn_mapped > MAX_DMA32_PFN)
+ start = MAX_DMA32_PFN << PAGE_SHIFT;
+ end = max_pfn_mapped << PAGE_SHIFT;
+ /* last argument is the alignment */
+ mem = find_e820_area(start, end, size, sizeof(struct range));
+ if (mem == -1ULL)
+ panic("can not find more space for range free");
+
+ /*
+ * NOTE(review): the area found above is not recorded with
+ * reserve_early() in this function -- confirm that is intended.
+ */
+ range = __va(mem);
+ /* use early_node_map[] and early_res to get range array at first */
+ memset(range, 0, size);
+ nr_range = 0;
+
+ /* need to go over early_node_map to find out good range for node */
+ nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
+ subtract_early_res(range, count);
+ nr_range = clean_sort_range(range, count);
+
+ /*
+ * need to clear it ?
+ * For nodeid == MAX_NUMNODES the early_res table is wiped and detached.
+ * NOTE(review): early_res_count is not reset here, unlike in
+ * early_res_to_bootmem() -- confirm nothing reads it afterwards.
+ */
+ if (nodeid == MAX_NUMNODES) {
+ memset(&early_res[0], 0,
+ sizeof(struct early_res) * max_early_res);
+ early_res = NULL;
+ max_early_res = 0;
+ }
+
+ *rangep = range;
+ return nr_range;
+}
+#else
void __init early_res_to_bootmem(u64 start, u64 end)
{
int i, count;
@@ -1020,6 +1112,7 @@ void __init early_res_to_bootmem(u64 sta
max_early_res = 0;
early_res_count = 0;
}
+#endif
/* Check for already reserved areas */
static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
@@ -1075,6 +1168,35 @@ again:
/*
* Find a free area with specified alignment in a specific range.
+ * only with the area.between start to end is active range from early_node_map
+ * so they are good as RAM
+ */
+u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end,
+			 u64 size, u64 align)
+{
+	u64 candidate;
+
+	/* first aligned address inside the region, but not below @start */
+	candidate = round_up(ei_start, align);
+	if (candidate < start)
+		candidate = round_up(start, align);
+	if (candidate >= ei_last)
+		return -1ULL;
+
+	/*
+	 * bad_addr() is handed &candidate and may move it; keep retrying
+	 * while it reports a conflict and the area still fits the region.
+	 */
+	while (bad_addr(&candidate, size, align) &&
+	       candidate + size <= ei_last)
+		;
+
+	/* the area must end inside both the region and the caller's window */
+	if (candidate + size > ei_last || candidate + size > end)
+		return -1ULL;
+
+	return candidate;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
*/
u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
{
@@ -1082,24 +1204,20 @@ u64 __init find_e820_area(u64 start, u64
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- u64 addr, last;
- u64 ei_last;
+ u64 addr;
+ u64 ei_start, ei_last;
if (ei->type != E820_RAM)
continue;
- addr = round_up(ei->addr, align);
+
ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
+ ei_start = ei->addr;
+ addr = find_early_area(ei_start, ei_last, start, end,
+ size, align);
+
+ if (addr == -1ULL)
continue;
+
return addr;
}
return -1ULL;
Index: linux-2.6/arch/x86/kernel/setup.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup.c
+++ linux-2.6/arch/x86/kernel/setup.c
@@ -942,7 +942,9 @@ void __init setup_arch(char **cmdline_p)
#endif
initmem_init(0, max_pfn, acpi, k8);
+#ifndef CONFIG_NO_BOOTMEM
early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
+#endif
dma32_reserve_bootmem();
Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c
+++ linux-2.6/arch/x86/mm/init_64.c
@@ -571,6 +571,7 @@ kernel_physical_mapping_init(unsigned lo
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
+#ifndef CONFIG_NO_BOOTMEM
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -582,8 +583,10 @@ void __init initmem_init(unsigned long s
/* don't touch min_low_pfn */
bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
0, end_pfn);
- e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
+#else
+ e820_register_active_regions(0, start_pfn, end_pfn);
+#endif
}
#endif
@@ -654,7 +657,9 @@ void __init mem_init(void)
long codesize, reservedpages, datasize, initsize;
unsigned long absent_pages;
+#ifndef CONFIG_NO_BOOTMEM
print_bootmem_free();
+#endif
pci_iommu_alloc();
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -198,11 +198,13 @@ static void * __init early_node_mem(int
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
- unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+ unsigned long start_pfn, last_pfn, nodedata_phys;
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- unsigned long bootmap_start, nodedata_phys;
- void *bootmap;
int nid;
+#ifndef CONFIG_NO_BOOTMEM
+ unsigned long bootmap_start, bootmap_pages, bootmap_size;
+ void *bootmap;
+#endif
if (!end)
return;
@@ -216,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned
start = roundup(start, ZONE_ALIGN);
- printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+ printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
start, end);
start_pfn = start >> PAGE_SHIFT;
@@ -235,10 +237,13 @@ setup_node_bootmem(int nodeid, unsigned
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
- NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+ NODE_DATA(nodeid)->node_id = nodeid;
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
+#ifndef CONFIG_NO_BOOTMEM
+ NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
/*
* Find a place for the bootmem map
* nodedata_phys could be on other nodes by alloc_bootmem,
@@ -275,6 +280,7 @@ setup_node_bootmem(int nodeid, unsigned
printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
free_bootmem_with_active_regions(nodeid, end);
+#endif
node_set_online(nodeid);
}
@@ -733,6 +739,10 @@ unsigned long __init numa_free_all_bootm
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
+#ifdef CONFIG_NO_BOOTMEM
+ pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
return pages;
}
Index: linux-2.6/include/linux/bootmem.h
===================================================================
--- linux-2.6.orig/include/linux/bootmem.h
+++ linux-2.6/include/linux/bootmem.h
@@ -23,6 +23,7 @@ extern unsigned long max_pfn;
extern unsigned long saved_max_pfn;
#endif
+#ifndef CONFIG_NO_BOOTMEM
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
@@ -37,6 +38,7 @@ typedef struct bootmem_data {
} bootmem_data_t;
extern bootmem_data_t bootmem_node_data[];
+#endif
extern unsigned long bootmem_bootmap_pages(unsigned long);
@@ -46,6 +48,7 @@ extern unsigned long init_bootmem_node(p
unsigned long endpfn);
extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
+unsigned long free_all_memory_core_early(int nodeid);
extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
extern unsigned long free_all_bootmem(void);
@@ -86,6 +89,10 @@ extern void *__alloc_bootmem_node(pg_dat
unsigned long size,
unsigned long align,
unsigned long goal);
+void *__alloc_bootmem_node_high(pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal);
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -12,6 +12,7 @@
#include <linux/prio_tree.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
+#include <linux/range.h>
struct mempolicy;
struct anon_vma;
@@ -1047,6 +1048,10 @@ extern void get_pfn_range_for_nid(unsign
extern unsigned long find_min_pfn_with_active_regions(void);
extern void free_bootmem_with_active_regions(int nid,
unsigned long max_low_pfn);
+int add_from_early_node_map(struct range *range, int az,
+ int nr_range, int nid);
+void *__alloc_memory_core_early(int nodeid, u64 size, u64 align,
+ u64 goal, u64 limit);
typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
extern void sparse_memory_present_with_active_regions(int nid);
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -620,7 +620,9 @@ typedef struct pglist_data {
struct page_cgroup *node_page_cgroup;
#endif
#endif
+#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
+#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Must be held any time you expect node_start_pfn, node_present_pages
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -13,6 +13,7 @@
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/kmemleak.h>
+#include <linux/range.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -32,6 +33,7 @@ unsigned long max_pfn;
unsigned long saved_max_pfn;
#endif
+#ifndef CONFIG_NO_BOOTMEM
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsign
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-
+#endif
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -167,6 +169,53 @@ void __init free_bootmem_late(unsigned l
}
}
+#ifdef CONFIG_NO_BOOTMEM
+/*
+ * Hand the page frames [start, end) to __free_pages_bootmem().
+ *
+ * The BITS_PER_LONG-aligned middle of the range is released in blocks of
+ * order ilog2(BITS_PER_LONG); the unaligned head and tail -- or the whole
+ * range when it holds no aligned block -- are released page by page.
+ */
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+	/*
+	 * The cursor has to be as wide as the pfn arguments: an int would
+	 * truncate or sign-extend once a pfn no longer fits in 31 bits.
+	 */
+	unsigned long i;
+	unsigned long start_aligned, end_aligned;
+	int order = ilog2(BITS_PER_LONG);
+
+	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+	end_aligned = end & ~(BITS_PER_LONG - 1);
+
+	/* no aligned block inside the range: free every page on its own */
+	if (end_aligned <= start_aligned) {
+		for (i = start; i < end; i++)
+			__free_pages_bootmem(pfn_to_page(i), 0);
+
+		return;
+	}
+
+	/* unaligned head */
+	for (i = start; i < start_aligned; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+
+	/* aligned middle, one BITS_PER_LONG-page block at a time */
+	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+		__free_pages_bootmem(pfn_to_page(i), order);
+
+	/* unaligned tail */
+	for (i = end_aligned; i < end; i++)
+		__free_pages_bootmem(pfn_to_page(i), 0);
+}
+
+/*
+ * Release every range reported by get_free_all_memory_range() for
+ * @nodeid and return the number of page frames released.
+ */
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+	struct range *range = NULL;
+	unsigned long count = 0;
+	int nr_range, idx;
+
+	nr_range = get_free_all_memory_range(&range, nodeid);
+
+	for (idx = 0; idx < nr_range; idx++) {
+		u64 first_pfn = range[idx].start;
+		/* stored end is inclusive; make it exclusive for the helper */
+		u64 end_pfn = range[idx].end + 1;
+
+		count += end_pfn - first_pfn;
+		__free_pages_memory(first_pfn, end_pfn);
+	}
+
+	return count;
+}
+#else
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -227,6 +276,7 @@ static unsigned long __init free_all_boo
return count;
}
+#endif
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -237,7 +287,12 @@ static unsigned long __init free_all_boo
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+#ifdef CONFIG_NO_BOOTMEM
+ /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
+#else
return free_all_bootmem_core(pgdat->bdata);
+#endif
}
/**
@@ -247,9 +302,14 @@ unsigned long __init free_all_bootmem_no
*/
unsigned long __init free_all_bootmem(void)
{
+#ifdef CONFIG_NO_BOOTMEM
+ return free_all_memory_core_early(NODE_DATA(0)->node_id);
+#else
return free_all_bootmem_core(NODE_DATA(0)->bdata);
+#endif
}
+#ifndef CONFIG_NO_BOOTMEM
static void __init __free(bootmem_data_t *bdata,
unsigned long sidx, unsigned long eidx)
{
@@ -436,6 +496,7 @@ static int __init mark_bootmem(unsigned
}
BUG();
}
+#endif
/**
* free_bootmem_node - mark a page range as usable
@@ -450,6 +511,9 @@ static int __init mark_bootmem(unsigned
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(physaddr, physaddr + size);
+#else
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
@@ -458,6 +522,7 @@ void __init free_bootmem_node(pg_data_t
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
+#endif
}
/**
@@ -471,6 +536,9 @@ void __init free_bootmem_node(pg_data_t
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(addr, addr + size);
+#else
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
@@ -479,6 +547,7 @@ void __init free_bootmem(unsigned long a
end = PFN_DOWN(addr + size);
mark_bootmem(start, end, 0, 0);
+#endif
}
/**
@@ -495,12 +564,17 @@ void __init free_bootmem(unsigned long a
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(physaddr);
end = PFN_UP(physaddr + size);
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
+#endif
}
/**
@@ -516,14 +590,20 @@ int __init reserve_bootmem_node(pg_data_
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
+#ifdef CONFIG_NO_BOOTMEM
+ panic("no bootmem");
+ return 0;
+#else
unsigned long start, end;
start = PFN_DOWN(addr);
end = PFN_UP(addr + size);
return mark_bootmem(start, end, 1, flags);
+#endif
}
+#ifndef CONFIG_NO_BOOTMEM
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
@@ -674,12 +754,33 @@ static void * __init alloc_arch_preferre
#endif
return NULL;
}
+#endif
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
+#ifdef CONFIG_NO_BOOTMEM
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+ if (ptr)
+ return ptr;
+
+ if (goal != 0) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+#else
bootmem_data_t *bdata;
void *region;
@@ -705,6 +806,7 @@ restart:
}
return NULL;
+#endif
}
/**
@@ -723,7 +825,13 @@ restart:
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem_nopanic(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
}
static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
@@ -757,9 +865,16 @@ static void * __init ___alloc_bootmem(un
void * __init __alloc_bootmem(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem(size, align, goal, 0);
+ unsigned long limit = 0;
+
+#ifdef CONFIG_NO_BOOTMEM
+ limit = -1UL;
+#endif
+
+ return ___alloc_bootmem(size, align, goal, limit);
}
+#ifndef CONFIG_NO_BOOTMEM
static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -776,6 +891,7 @@ static void * __init ___alloc_bootmem_no
return ___alloc_bootmem(size, align, goal, limit);
}
+#endif
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
@@ -798,7 +914,46 @@ void * __init __alloc_bootmem_node(pg_da
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ return __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+#else
return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+#endif
+}
+
+/*
+ * Variant of __alloc_bootmem_node() that prefers memory above
+ * MAX_DMA32_PFN.  When MAX_DMA32_PFN is defined, the node reaches far
+ * enough past it and @goal lies below it, an allocation with the goal
+ * raised to MAX_DMA32_PFN is tried first.  In every other case, or when
+ * that attempt fails, the request is passed to __alloc_bootmem_node()
+ * with the caller's @goal.
+ */
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+				   unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+	unsigned long end_pfn;
+
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+	/* update goal according ...MAX_DMA32_PFN */
+	end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+	/*
+	 * Require the node to extend 128MB past MAX_DMA32_PFN.  Converting
+	 * MB to pages is a left shift by (20 - PAGE_SHIFT); a right shift
+	 * yields 0 pages with PAGE_SHIFT == 12 and the margin disappears.
+	 */
+	if (end_pfn > MAX_DMA32_PFN + (128UL << (20 - PAGE_SHIFT)) &&
+	    (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+		void *ptr;
+		unsigned long new_goal;
+
+		new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+#ifdef CONFIG_NO_BOOTMEM
+		ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+						 new_goal, -1ULL);
+#else
+		ptr = alloc_bootmem_core(pgdat->bdata, size, align,
+						 new_goal, 0);
+#endif
+		if (ptr)
+			return ptr;
+	}
+#endif
+
+	/* fall back to the caller's goal */
+	return __alloc_bootmem_node(pgdat, size, align, goal);
+}
#ifdef CONFIG_SPARSEMEM
@@ -812,6 +967,16 @@ void * __init __alloc_bootmem_node(pg_da
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
+#ifdef CONFIG_NO_BOOTMEM
+ unsigned long pfn, goal, limit;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+ return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+ SMP_CACHE_BYTES, goal, limit);
+#else
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
@@ -821,6 +986,7 @@ void * __init alloc_bootmem_section(unsi
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
+#endif
}
#endif
@@ -832,11 +998,16 @@ void * __init __alloc_bootmem_node_nopan
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+#else
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
+#endif
if (ptr)
return ptr;
@@ -887,6 +1058,11 @@ void * __init __alloc_bootmem_low_node(p
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+#ifdef CONFIG_NO_BOOTMEM
+ return __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+#else
return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
+#endif
}
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -3430,6 +3430,53 @@ void __init free_bootmem_with_active_reg
}
}
+/*
+ * Add every active early_node_map range of node @nid to the @az-slot
+ * @range array, starting from @nr_range entries.  Returns the value
+ * handed back by the last add_range() call, or @nr_range unchanged when
+ * the node has no active range.
+ */
+int __init add_from_early_node_map(struct range *range, int az,
+				   int nr_range, int nid)
+{
+	int idx;
+
+	/* need to go over early_node_map to find out good range for node */
+	for_each_active_range_index_in_nid(idx, nid) {
+		/* widen to u64 first so that "last - 1" is 64-bit arithmetic */
+		u64 first_pfn = early_node_map[idx].start_pfn;
+		u64 end_pfn = early_node_map[idx].end_pfn;
+
+		/* add_range() is handed an inclusive last pfn */
+		nr_range = add_range(range, az, nr_range,
+				     first_pfn, end_pfn - 1);
+	}
+
+	return nr_range;
+}
+
+/*
+ * Take @size bytes from the first active early_node_map range of node
+ * @nid in which find_early_area() finds room for the given @align, @goal
+ * and @limit.  The memory is zeroed and recorded as a "BOOTMEM" entry via
+ * reserve_early_without_check().  Returns NULL when no range fits.
+ */
+void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+					u64 goal, u64 limit)
+{
+	int idx;
+
+	/* need to go over early_node_map to find out good range for node */
+	for_each_active_range_index_in_nid(idx, nid) {
+		/* widen to u64 before turning the pfns into byte addresses */
+		u64 region_start = early_node_map[idx].start_pfn;
+		u64 region_end = early_node_map[idx].end_pfn;
+		u64 phys;
+
+		region_start <<= PAGE_SHIFT;
+		region_end <<= PAGE_SHIFT;
+
+		phys = find_early_area(region_start, region_end,
+				       goal, limit, size, align);
+		if (phys != -1ULL) {
+			void *virt = phys_to_virt(phys);
+
+			memset(virt, 0, size);
+			reserve_early_without_check(phys, phys + size, "BOOTMEM");
+
+			return virt;
+		}
+	}
+
+	return NULL;
+}
+
+
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -4462,7 +4509,11 @@ void __init set_dma_reserve(unsigned lon
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
+struct pglist_data __refdata contig_page_data = {
+#ifndef CONFIG_NO_BOOTMEM
+ .bdata = &bootmem_node_data[0]
+#endif
+ };
EXPORT_SYMBOL(contig_page_data);
#endif
Index: linux-2.6/mm/percpu.c
===================================================================
--- linux-2.6.orig/mm/percpu.c
+++ linux-2.6/mm/percpu.c
@@ -1927,7 +1927,10 @@ int __init pcpu_embed_first_chunk(size_t
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
+#ifndef CONFIG_NO_BOOTMEM
+ /* fix partial free ! */
free_fn(ptr + size_sum, ai->unit_size - size_sum);
+#endif
}
}
Index: linux-2.6/mm/sparse-vmemmap.c
===================================================================
--- linux-2.6.orig/mm/sparse-vmemmap.c
+++ linux-2.6/mm/sparse-vmemmap.c
@@ -40,7 +40,7 @@ static void * __init_refok __earlyonly_b
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node(NODE_DATA(node), size, align, goal);
+ return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
>
> fend off wrong range
>
> Signed-off-by: Yinghai Lu <yin...@kernel.org>
>
This could be merged with the previous patch.
--
Jesse Barnes, Intel Open Source Technology Center
Yes, but the first one only moves code and changes names, etc.
YH
Okay, this does bring up two things that I have long griped about.
Firstly, I don't think this is the proper data structure. Even worse,
the range operations take *inclusive* ranges (e.g. 0x0000 to 0xffff is
64K, not 0x0000 to 0x10000). It would be one thing if it only affected
the internal representation, but as written, this is exposed through the
interfaces, too.
As far as the choice of data structures, I have used in other places,
with very good success, a data structure which looks like:
struct {
u64 start;
u32 attr;
};
Note that there is no end: the end is always given by an end token. The
"attr" here was an e820 attribute (or 0 for no attribute), but the
payload can be almost anything -- for a simple include/exclude it can
just be boolean.
This data structure doesn't permit things like out-of-order ranges,
overlapping ranges, and so on, and that's a good thing; it means the
data structure itself can never be ambiguous, and the interfaces clean
out most errors inherently.
http://git.kernel.org/?p=boot/syslinux/syslinux.git;a=blob;f=com32/lib/syslinux/zonelist.c;hb=HEAD
... contains an implementation of this data structure using linked lists
for internal storage, and
http://git.kernel.org/?p=boot/syslinux/syslinux.git;a=blob;f=memdisk/e820func.c;hb=HEAD
... contains one based on arrays. I'm not saying these should be
applied directly, but I think the equivalent concept might be
worthwhile, not just for this but also for the e820 memrange code.
-hpa
Yes, here we have [start, end] instead of [start, end).
Those range operations are used for one purpose:
add several ranges, then subtract some other ranges; that takes out the overlap between the two sets
and leaves the ranges that can be used.
And it is array based... for the early stage.
YH
Yes. We should be able to use the *exact same code* for the e820
ranges, using the e820 attribute as the attribute parameters, and a
simple boolean for the case where all you want is inclusion/exclusion.
And yes, I agree array based is the right thing to do for the early code.
Note that my array-based version only had an insert operation, no delete
operation -- that is because the delete operation is simply an insert
operation of attribute 0.
-hpa
I could try to merge the range handling into the early_res handling code.
The first step is to change the end in range to be consistent with the one in early_res.
Later we could move them into kernel/range.c or kernel/early_res.c, after we convert all bootmem users to early_res.
YH
can you put current 9 patches into tip? i will submit following patch that merge range and early_res
Thanks
Yinghai
Seems reasonable, I guess.
-hpa
I still have to review them (I had to spend the day deal with non-kernel
job duties that had been on hold for the merge window.) I will look at
it probably Monday or Tuesday.
Sorry,
-hpa
Allocate the usemaps of a node in one buffer instead of requesting them one by one.
This helps systems that are going to use early_res instead of bootmem:
fewer entries in early_res make searches faster on systems with more memory.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
mm/sparse.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 66 insertions(+), 18 deletions(-)
Index: linux-2.6/mm/sparse.c
===================================================================
--- linux-2.6.orig/mm/sparse.c
+++ linux-2.6/mm/sparse.c
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_
#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
unsigned long section_nr;
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(
* this problem.
*/
section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
- return alloc_bootmem_section(usemap_size(), section_nr);
+ return alloc_bootmem_section(usemap_size() * count, section_nr);
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -329,7 +330,8 @@ static void __init check_usemap_section_
}
#else
static unsigned long * __init
-sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
+sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
+ unsigned long count)
{
return NULL;
}
@@ -339,27 +341,40 @@ static void __init check_usemap_section_
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
-static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
+static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long usemap_count, int nodeid)
{
- unsigned long *usemap;
- struct mem_section *ms = __nr_to_section(pnum);
- int nid = sparse_early_nid(ms);
-
- usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
- if (usemap)
- return usemap;
+ void *usemap;
+ unsigned long pnum;
+ int size = usemap_size();
- usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
+ usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
+ usemap_count);
if (usemap) {
- check_usemap_section_nr(nid, usemap);
- return usemap;
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ }
+ return;
}
- /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
- nid = 0;
+ usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
+ if (usemap) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ usemap_map[pnum] = usemap;
+ usemap += size;
+ check_usemap_section_nr(nodeid, usemap_map[pnum]);
+ }
+ return;
+ }
printk(KERN_WARNING "%s: allocation failed\n", __func__);
- return NULL;
}
#ifndef CONFIG_SPARSEMEM_VMEMMAP
@@ -396,6 +411,7 @@ static struct page __init *sparse_early_
void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
}
+
/*
* Allocate the accumulated non-linear sections, allocate a mem_map
* for each and record the physical to section mapping.
@@ -407,6 +423,9 @@ void __init sparse_init(void)
unsigned long *usemap;
unsigned long **usemap_map;
int size;
+ int nodeid_begin = 0;
+ unsigned long pnum_begin = 0;
+ unsigned long usemap_count;
/*
* map is using big page (aka 2M in x86 64 bit)
@@ -425,10 +444,39 @@ void __init sparse_init(void)
panic("can not allocate usemap_map\n");
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ usemap_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
if (!present_section_nr(pnum))
continue;
- usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ usemap_count++;
+ continue;
+ }
+ /* ok, we need to take care of pnum_begin to pnum - 1 */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
+ usemap_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ usemap_count = 1;
}
+ /* ok, last chunk */
+ sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
+ usemap_count, nodeid_begin);
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
It will fall back to the old way if it cannot get a block that big.
It will help systems with more memory that use early_res (instead of bootmem),
which cannot handle too many entries.
Signed-off-by: Yinghai Lu <yin...@kernel.org>
---
arch/x86/mm/init_64.c | 2
include/linux/mm.h | 7 +++
mm/sparse-vmemmap.c | 70 +++++++++++++++++++++++++++++++
mm/sparse.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++-
4 files changed, 187 insertions(+), 3 deletions(-)
Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c
+++ linux-2.6/arch/x86/mm/init_64.c
@@ -961,7 +961,7 @@ vmemmap_populate(struct page *start_page
if (pmd_none(*pmd)) {
pte_t entry;
- p = vmemmap_alloc_block(PMD_SIZE, node);
+ p = vmemmap_alloc_block_buf(PMD_SIZE, node);
if (!p)
return -ENOMEM;
Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -1323,12 +1323,19 @@ extern int randomize_va_space;
const char * arch_vma_name(struct vm_area_struct *vma);
void print_vma_addr(char *prefix, unsigned long rip);
+void sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count,
+ int nodeid);
+
struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
void *vmemmap_alloc_block(unsigned long size, int node);
+void *vmemmap_alloc_block_buf(unsigned long size, int node);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
int vmemmap_populate_basepages(struct page *start_page,
unsigned long pages, int node);
Index: linux-2.6/mm/sparse-vmemmap.c
===================================================================
--- linux-2.6.orig/mm/sparse-vmemmap.c
+++ linux-2.6/mm/sparse-vmemmap.c
@@ -43,6 +43,8 @@ static void * __init_refok __earlyonly_b
return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
}
+static void *buf;
+static void *buf_end;
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(uns
__pa(MAX_DMA_ADDRESS));
}
+/* need to make sure size is all the same during early stage */
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+{
+ void *ptr;
+
+ if (!buf)
+ return vmemmap_alloc_block(size, node);
+
+ /* take it from buf */
+ ptr = (void *)ALIGN((unsigned long)buf, size);
+ if (ptr + size > buf_end)
+ return vmemmap_alloc_block(size, node);
+
+ buf = ptr + size;
+
+ return ptr;
+}
+
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(p
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -163,3 +183,51 @@ struct page * __meminit sparse_mem_map_p
return map;
}
+
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+ void *buf_start;
+
+ size = ALIGN(size, PMD_SIZE);
+ buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count,
+ PMD_SIZE, __pa(MAX_DMA_ADDRESS));
+
+ if (buf_start) {
+ buf = buf_start;
+ buf_end = buf_start + size * map_count;
+ }
+
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+
+ if (buf_start) {
+ /* need to free left buf */
+#ifdef CONFIG_NO_BOOTMEM
+ free_early(__pa(buf_start), __pa(buf_end));
+ if (buf_start < buf)
+ reserve_early_without_check(__pa(buf_start), __pa(buf),
+ "BOOTMAP");
+#else
+ free_bootmem(__pa(buf), buf_end - buf);
+#endif
+ buf = NULL;
+ buf_end = NULL;
+ }
+}
Index: linux-2.6/mm/sparse.c
===================================================================
--- linux-2.6.orig/mm/sparse.c
+++ linux-2.6/mm/sparse.c
@@ -390,8 +390,65 @@ struct page __init *sparse_mem_map_popul
PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
return map;
}
+void __init sparse_mem_maps_populate_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ void *map;
+ unsigned long pnum;
+ unsigned long size = sizeof(struct page) * PAGES_PER_SECTION;
+
+ map = alloc_remap(nodeid, size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ size = PAGE_ALIGN(size);
+ map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
+ if (map) {
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = map;
+ map += size;
+ }
+ return;
+ }
+
+ /* fallback */
+ for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ map_map[pnum] = sparse_mem_map_populate(pnum, nodeid);
+ if (map_map[pnum])
+ continue;
+ ms = __nr_to_section(pnum);
+ printk(KERN_ERR "%s: sparsemem memory map backing failed "
+ "some memory will not be available.\n", __func__);
+ ms->section_mem_map = 0;
+ }
+}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+static void __init sparse_early_mem_maps_alloc_node(struct page **map_map,
+ unsigned long pnum_begin,
+ unsigned long pnum_end,
+ unsigned long map_count, int nodeid)
+{
+ sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
+ map_count, nodeid);
+}
+
+#ifndef CONFIG_X86_64
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -407,6 +464,7 @@ static struct page __init *sparse_early_
ms->section_mem_map = 0;
return NULL;
}
+#endif
void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
{
@@ -420,12 +478,14 @@ void __init sparse_init(void)
{
unsigned long pnum;
struct page *map;
+ struct page **map_map;
unsigned long *usemap;
unsigned long **usemap_map;
- int size;
+ int size, size2;
int nodeid_begin = 0;
unsigned long pnum_begin = 0;
unsigned long usemap_count;
+ unsigned long map_count;
/*
* map is using big page (aka 2M in x86 64 bit)
@@ -478,6 +538,48 @@ void __init sparse_init(void)
sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
usemap_count, nodeid_begin);
+#ifdef CONFIG_X86_64
+ size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
+ map_map = alloc_bootmem(size2);
+ if (!map_map)
+ panic("can not allocate map_map\n");
+
+ for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid_begin = sparse_early_nid(ms);
+ pnum_begin = pnum;
+ break;
+ }
+ map_count = 1;
+ for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
+ struct mem_section *ms;
+ int nodeid;
+
+ if (!present_section_nr(pnum))
+ continue;
+ ms = __nr_to_section(pnum);
+ nodeid = sparse_early_nid(ms);
+ if (nodeid == nodeid_begin) {
+ map_count++;
+ continue;
+ }
+ /* ok, we need to take care of pnum_begin to pnum - 1 */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
+ map_count, nodeid_begin);
+ /* new start, update count etc*/
+ nodeid_begin = nodeid;
+ pnum_begin = pnum;
+ map_count = 1;
+ }
+ /* ok, last chunk */
+ sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
+ map_count, nodeid_begin);
+#endif
+
for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
if (!present_section_nr(pnum))
continue;
@@ -486,7 +588,11 @@ void __init sparse_init(void)
if (!usemap)
continue;
+#ifdef CONFIG_X86_64
+ map = map_map[pnum];
+#else
map = sparse_early_mem_map_alloc(pnum);
+#endif
if (!map)
continue;
@@ -496,6 +602,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();
+#ifdef CONFIG_X86_64
+ free_bootmem(__pa(map_map), size2);
+#endif
free_bootmem(__pa(usemap_map), size);
> add vmemmap_alloc_block_buf for mem map only.
>
> it will fallback old wayif can not get that big.
>
> it will help system with more memory that use early_res instead of bootmem
> that can not handle too many entries
>
> Signed-off-by: Yinghai Lu <yin...@kernel.org>
>
> ---
> arch/x86/mm/init_64.c | 2
> include/linux/mm.h | 7 +++
> mm/sparse-vmemmap.c | 70 +++++++++++++++++++++++++++++++
> mm/sparse.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 187 insertions(+), 3 deletions(-)
> +++ linux-2.6/mm/sparse-vmemmap.c
> @@ -43,6 +43,8 @@ static void * __init_refok __earlyonly_b
> return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
> }
>
> +static void *buf;
> +static void *buf_end;
There are so many buf's in the kernel - this naming isn't very intuitive. Also,
they should perhaps be __initdata-ish?
> void * __meminit vmemmap_alloc_block(unsigned long size, int node)
> {
> @@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(uns
> __pa(MAX_DMA_ADDRESS));
> }
>
> +/* need to make sure size is all the same during early stage */
> +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
> +{
> + void *ptr;
> +
> + if (!buf)
> + return vmemmap_alloc_block(size, node);
> +
> + /* take the from buf */
> + ptr = (void *)ALIGN((unsigned long)buf, size);
Hm, two type casts on the same line.
These kinds of x86-64-isms:
> +++ linux-2.6/mm/sparse.c
>
> +#ifndef CONFIG_X86_64
> +#ifdef CONFIG_X86_64
> +#ifdef CONFIG_X86_64
> +#else
> +#endif
> +#ifdef CONFIG_X86_64
> +#endif
are not particularly welcome constructs in core MM files. Appropriately
structured Kconfig helper bools, selected by the architectures, are cleaner.
These patches need more work.
Ingo