[PATCH v2 02/23] mm/memblock: debug: don't free reserved array if !ARCH_DISCARD_MEMBLOCK

Santosh Shilimkar, Dec 2, 2013, 9:30:01 PM

From: Grygorii Strashko <grygorii...@ti.com>

Now the Nobootmem allocator will always try to free the memory allocated
for the reserved memory regions array (free_low_memory_core_early())
without taking into account the current memblock debugging configuration
(the CONFIG_ARCH_DISCARD_MEMBLOCK and CONFIG_DEBUG_FS state).
As a result, if:
- CONFIG_DEBUG_FS is defined;
- CONFIG_ARCH_DISCARD_MEMBLOCK is not defined;
- the reserved memory regions array has been resized during boot

then:
- the memory allocated for the reserved memory regions array will be
freed to the buddy allocator;
- the debugfs entry "/sys/kernel/debug/memblock/reserved" will show
garbage instead of the state of memory reservations, for example:
0: 0x98393bc0..0x9a393bbf
1: 0xff120000..0xff11ffff
2: 0x00000000..0xffffffff

Hence, do not free the memory allocated for the reserved memory regions
array if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK).
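
For illustration, a simplified sketch of the debugfs read side that goes
stale; this follows the shape of memblock_debug_show() in mm/memblock.c
of this era, but it is illustrative rather than verbatim:

#include <linux/seq_file.h>
#include <linux/memblock.h>

static int memblock_debug_show(struct seq_file *m, void *private)
{
	struct memblock_type *type = m->private;  /* e.g. &memblock.reserved */
	int i;

	for (i = 0; i < type->cnt; i++) {
		struct memblock_region *reg = &type->regions[i];

		/*
		 * If the resized 'regions' array has already been handed
		 * back to the buddy allocator, this walks freed memory and
		 * prints exactly the garbage ranges shown above.
		 */
		seq_printf(m, "%4d: 0x%016llx..0x%016llx\n", i,
			   (unsigned long long)reg->base,
			   (unsigned long long)(reg->base + reg->size - 1));
	}
	return 0;
}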

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/memblock.c | 13 +++++++++++++
1 file changed, 13 insertions(+)

diff --git a/mm/memblock.c b/mm/memblock.c
index aab5669..53da534 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -265,6 +265,19 @@ phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
if (memblock.reserved.regions == memblock_reserved_init_regions)
return 0;

+ /*
+ * Don't allow Nobootmem allocator to free reserved memory regions
+ * array if
+ * - CONFIG_DEBUG_FS is enabled;
+ * - CONFIG_ARCH_DISCARD_MEMBLOCK is not enabled;
+ * - the reserved memory regions array has been resized during boot.
+ * Otherwise the debugfs entry "/sys/kernel/debug/memblock/reserved"
+ * will show garbage instead of the state of memory reservations.
+ */
+ if (IS_ENABLED(CONFIG_DEBUG_FS) &&
+ !IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ return 0;
+
*addr = __pa(memblock.reserved.regions);

return PAGE_ALIGN(sizeof(struct memblock_region) *
--
1.7.9.5

Santosh Shilimkar, Dec 2, 2013, 9:30:01 PM

From: Grygorii Strashko <grygorii...@ti.com>

Clean-up to remove the dependency on bootmem headers.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/memory_hotplug.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235..cf1736d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -9,7 +9,6 @@
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
-#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>

Santosh Shilimkar, Dec 2, 2013, 9:30:02 PM

From: Grygorii Strashko <grygorii...@ti.com>

__free_pages_bootmem() is used internally by the MM core and is already
declared in mm/internal.h, so remove the duplicate declaration.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
include/linux/bootmem.h | 1 -
1 file changed, 1 deletion(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index f1f07d3..55d52fb 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -52,7 +52,6 @@ extern void free_bootmem_node(pg_data_t *pgdat,
unsigned long size);
extern void free_bootmem(unsigned long physaddr, unsigned long size);
extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
-extern void __free_pages_bootmem(struct page *page, unsigned int order);

/*
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,

Santosh Shilimkar, Dec 2, 2013, 9:30:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.
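
As a hedged illustration, the conversion pattern applied throughout this
series looks like this (alloc_bootmem() is the existing bootmem call;
memblock_virt_alloc() is the wrapper introduced by this series, with
matching panic-on-failure semantics):

	/* before: bootmem-based early allocation, panics on failure */
	ptr = alloc_bootmem(size);

	/* after: memblock-backed wrapper with equivalent behavior */
	ptr = memblock_virt_alloc(size);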

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
init/main.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/init/main.c b/init/main.c
index febc511..934430d 100644
--- a/init/main.c
+++ b/init/main.c
@@ -355,9 +355,9 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { }
*/
static void __init setup_command_line(char *command_line)
{
- saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
- initcall_command_line = alloc_bootmem(strlen (boot_command_line)+1);
- static_command_line = alloc_bootmem(strlen (command_line)+1);
+ saved_command_line = memblock_virt_alloc(strlen(boot_command_line)+1);
+ initcall_command_line = memblock_virt_alloc(strlen(boot_command_line)+1);
+ static_command_line = memblock_virt_alloc(strlen(command_line)+1);
strcpy (saved_command_line, boot_command_line);
strcpy (static_command_line, command_line);

Santosh Shilimkar, Dec 2, 2013, 9:30:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
arch/arm/kernel/devtree.c | 2 +-
arch/arm/kernel/setup.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c
index 739c3df..85b9b3b 100644
--- a/arch/arm/kernel/devtree.c
+++ b/arch/arm/kernel/devtree.c
@@ -33,7 +33,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)

void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
{
- return alloc_bootmem_align(size, align);
+ return memblock_virt_alloc_align(size, align);
}

void __init arm_dt_memblock_reserve(void)
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 6a1b8a8..0d3c6aa 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -717,7 +717,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc)
kernel_data.end = virt_to_phys(_end - 1);

for_each_memblock(memory, region) {
- res = alloc_bootmem_low(sizeof(*res));
+ res = memblock_virt_alloc(sizeof(*res));
res->name = "System RAM";
res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
kernel/printk/printk.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index be7c86b..d8147f91 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -757,14 +757,10 @@ void __init setup_log_buf(int early)
return;

if (early) {
- unsigned long mem;
-
- mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
- if (!mem)
- return;
- new_log_buf = __va(mem);
+ new_log_buf =
+ memblock_virt_alloc_align(new_log_buf_len, PAGE_SIZE);
} else {
- new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len);
}

if (unlikely(!new_log_buf)) {

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

From: Grygorii Strashko <grygorii...@ti.com>

Drop the WARN and use SMP_CACHE_BYTES as the default alignment in
memblock_alloc_base_nid(), as recommended by Tejun Heo in
https://lkml.org/lkml/2013/10/13/117.
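
A minimal sketch of the resulting behavior, assuming a caller that
reaches memblock_alloc_base_nid() through memblock_alloc() (the sizes
here are hypothetical):

	phys_addr_t addr;

	addr = memblock_alloc(256, 0);         /* 0 now means SMP_CACHE_BYTES */
	addr = memblock_alloc(256, PAGE_SIZE); /* explicit align is unchanged */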

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
---
mm/memblock.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 53da534..1d15e07 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -883,8 +883,8 @@ static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
{
phys_addr_t found;

- if (WARN_ON(!align))
- align = __alignof__(long long);
+ if (!align)
+ align = SMP_CACHE_BYTES;

/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

From: Grygorii Strashko <grygorii...@ti.com>

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
Cc: cgr...@vger.kernel.org
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/page_cgroup.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a..d8bd2c5 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)

table_size = sizeof(struct page_cgroup) * nr_pages;

- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Christoph Lameter <c...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/percpu.c | 41 +++++++++++++++++++++++++----------------
1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 0d10def..f74902c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1063,7 +1063,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

- ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size));
if (!ptr)
return NULL;
ai = ptr;
@@ -1088,7 +1088,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- free_bootmem(__pa(ai), ai->__ai_size);
+ memblock_free_early(__pa(ai), ai->__ai_size);
}

/**
@@ -1246,10 +1246,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

/* process group information and build config tables accordingly */
- group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
- group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
- unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
- unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+ group_offsets = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_offsets[0]));
+ group_sizes = memblock_virt_alloc(ai->nr_groups *
+ sizeof(group_sizes[0]));
+ unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]));
+ unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]));

for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1311,7 +1313,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+ pcpu_slot = memblock_virt_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);

@@ -1322,7 +1324,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
- schunk = alloc_bootmem(pcpu_chunk_struct_size);
+ schunk = memblock_virt_alloc(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->base_addr = base_addr;
schunk->map = smap;
@@ -1346,7 +1348,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,

/* init dynamic chunk if necessary */
if (dyn_size) {
- dchunk = alloc_bootmem(pcpu_chunk_struct_size);
+ dchunk = memblock_virt_alloc(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
@@ -1626,7 +1628,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

- areas = alloc_bootmem_nopanic(areas_size);
+ areas = memblock_virt_alloc_nopanic(areas_size);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -1712,7 +1714,7 @@ out_free_areas:
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- free_bootmem(__pa(areas), areas_size);
+ memblock_free_early(__pa(areas), areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -1760,7 +1762,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = alloc_bootmem(pages_size);
+ pages = memblock_virt_alloc(pages_size);

/* allocate pages */
j = 0;
@@ -1823,7 +1825,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pages), pages_size);
+ memblock_free_early(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1848,12 +1850,15 @@ EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_virt_alloc_try_nid_nopanic(size, align,
+ __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ MAX_NUMNODES);
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- free_bootmem(__pa(ptr), size);
+ memblock_free_early(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
@@ -1896,7 +1901,11 @@ void __init setup_per_cpu_areas(void)
void *fc;

ai = pcpu_alloc_alloc_info(1, 1);
- fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ fc = memblock_virt_alloc_try_nid_nopanic(unit_size,
+ PAGE_SIZE,
+ __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE,
+ MAX_NUMNODES);
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Correct the ensure_zone_is_initialized() function description to match
the newly introduced memblock APIs for early memory allocation.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/memory_hotplug.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cf1736d..4f158ec 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -268,7 +268,7 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
}

/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
- * alloc_bootmem_node_nopanic() */
+ * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */
static int __ref ensure_zone_is_initialized(struct zone *zone,
unsigned long start_pfn, unsigned long num_pages)
{

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/sparse-vmemmap.c | 6 ++++--
mm/sparse.c | 27 +++++++++++++++------------
2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 27eeab3..4cba9c2 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -40,7 +40,8 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal);
+ return memblock_virt_alloc_try_nid(size, align, goal,
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
}

static void *vmemmap_buf;
@@ -226,7 +227,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,

if (vmemmap_buf_start) {
/* need to free left buf */
- free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
+ memblock_free_early(__pa(vmemmap_buf),
+ vmemmap_buf_end - vmemmap_buf);
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0..02f57cc 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -69,7 +69,7 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
else
section = kzalloc(array_size, GFP_KERNEL);
} else {
- section = alloc_bootmem_node(NODE_DATA(nid), array_size);
+ section = memblock_virt_alloc_node(array_size, nid);
}

return section;
@@ -279,8 +279,9 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ p = memblock_virt_alloc_try_nid_nopanic(size,
+ SMP_CACHE_BYTES, goal, limit,
+ nid);
if (!p && limit) {
limit = 0;
goto again;
@@ -331,7 +332,7 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return alloc_bootmem_node_nopanic(pgdat, size);
+ return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -376,8 +377,9 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
return map;

size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
- map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
return map;
}
void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -401,8 +403,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}

size = PAGE_ALIGN(size);
- map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ map = memblock_virt_alloc_try_nid(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
@@ -545,7 +548,7 @@ void __init sparse_init(void)
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
*/
size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
- usemap_map = alloc_bootmem(size);
+ usemap_map = memblock_virt_alloc(size);
if (!usemap_map)
panic("can not allocate usemap_map\n");
alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
@@ -553,7 +556,7 @@ void __init sparse_init(void)

#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
- map_map = alloc_bootmem(size2);
+ map_map = memblock_virt_alloc(size2);
if (!map_map)
panic("can not allocate map_map\n");
alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
@@ -583,9 +586,9 @@ void __init sparse_init(void)
vmemmap_populate_print_last();

#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
- free_bootmem(__pa(map_map), size2);
+ memblock_free_early(__pa(map_map), size2);
#endif
- free_bootmem(__pa(usemap_map), size);
+ memblock_free_early(__pa(usemap_map), size);
}

#ifdef CONFIG_MEMORY_HOTPLUG

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
arch/arm/mm/init.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 3e8f106..bee6d2c 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -461,7 +461,7 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn)
* free the section of the memmap array.
*/
if (pg < pgend)
- free_bootmem(pg, pgend - pg);
+ memblock_free_early(pg, pgend - pg);
}

/*

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

From: Grygorii Strashko <grygorii...@ti.com>

Clean-up to remove the dependency on bootmem headers.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: William Hubbs <w.d....@gmail.com>
Cc: Chris Brannon <ch...@the-brannons.com>
Cc: Kirk Reiser <ki...@reisers.ca>
Cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
drivers/staging/speakup/main.c | 2 --
1 file changed, 2 deletions(-)

diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c
index 47502fa..ef5933b 100644
--- a/drivers/staging/speakup/main.c
+++ b/drivers/staging/speakup/main.c
@@ -37,8 +37,6 @@
#include <linux/input.h>
#include <linux/kmod.h>

-#include <linux/bootmem.h> /* for alloc_bootmem */
-
/* speakup_*_selection */
#include <linux/module.h>
#include <linux/sched.h>

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

From: Grygorii Strashko <grygorii...@ti.com>

Clean-up to remove the dependency on bootmem headers.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Arnd Bergmann <ar...@arndb.de>
Cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
drivers/char/mem.c | 1 -
1 file changed, 1 deletion(-)

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index f895a8c..92c5937 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,7 +22,6 @@
#include <linux/device.h>
#include <linux/highmem.h>
#include <linux/backing-dev.h>
-#include <linux/bootmem.h>
#include <linux/splice.h>
#include <linux/pfn.h>
#include <linux/export.h>

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
drivers/firmware/memmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index e2e04b0..15550b2 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
{
struct firmware_map_entry *entry;

- entry = alloc_bootmem(sizeof(struct firmware_map_entry));
+ entry = memblock_virt_alloc(sizeof(struct firmware_map_entry));
if (WARN_ON(!entry))
return -ENOMEM;

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

From: Grygorii Strashko <grygorii...@ti.com>

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/hugetlb.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4..e16c56e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1280,9 +1280,9 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;

- addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- huge_page_size(h), huge_page_size(h), 0);
-
+ addr = memblock_virt_alloc_try_nid_nopanic(
+ huge_page_size(h), huge_page_size(h),
+ 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -1322,8 +1322,8 @@ static void __init gather_bootmem_prealloc(void)

#ifdef CONFIG_HIGHMEM
page = pfn_to_page(m->phys >> PAGE_SHIFT);
- free_bootmem_late((unsigned long)m,
- sizeof(struct huge_bootmem_page));
+ memblock_free_late(__pa(m),
+ sizeof(struct huge_bootmem_page));
#else
page = virt_to_page(m);
#endif

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Konrad Rzeszutek Wilk <konra...@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
lib/swiotlb.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index e4399fa..6c8712e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -172,8 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
/*
* Get the overflow emergency buffer
*/
- v_overflow_buffer = alloc_bootmem_low_pages_nopanic(
- PAGE_ALIGN(io_tlb_overflow));
+ v_overflow_buffer = memblock_virt_alloc_align_nopanic(
+ PAGE_ALIGN(io_tlb_overflow),
+ PAGE_SIZE);
if (!v_overflow_buffer)
return -ENOMEM;

@@ -184,11 +185,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
*/
- io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+ io_tlb_list = memblock_virt_alloc_align(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)),
+ PAGE_SIZE);
for (i = 0; i < io_tlb_nslabs; i++)
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
io_tlb_index = 0;
- io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ io_tlb_orig_addr = memblock_virt_alloc_align(
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)),
+ PAGE_SIZE);

if (verbose)
swiotlb_print_info();
@@ -215,13 +220,14 @@ swiotlb_init(int verbose)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;

/* Get IO TLB memory from the low pages */
- vstart = alloc_bootmem_low_pages_nopanic(PAGE_ALIGN(bytes));
+ vstart = memblock_virt_alloc_align_nopanic(PAGE_ALIGN(bytes),
+ PAGE_SIZE);
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;

if (io_tlb_start)
- free_bootmem(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ memblock_free_early(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
pr_warn("Cannot allocate SWIOTLB buffer");
no_iotlb_memory = true;
}
@@ -357,14 +363,14 @@ void __init swiotlb_free(void)
free_pages((unsigned long)phys_to_virt(io_tlb_start),
get_order(io_tlb_nslabs << IO_TLB_SHIFT));
} else {
- free_bootmem_late(io_tlb_overflow_buffer,
- PAGE_ALIGN(io_tlb_overflow));
- free_bootmem_late(__pa(io_tlb_orig_addr),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
- free_bootmem_late(__pa(io_tlb_list),
- PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
- free_bootmem_late(io_tlb_start,
- PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ memblock_free_late(io_tlb_overflow_buffer,
+ PAGE_ALIGN(io_tlb_overflow));
+ memblock_free_late(__pa(io_tlb_orig_addr),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
+ memblock_free_late(__pa(io_tlb_list),
+ PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
+ memblock_free_late(io_tlb_start,
+ PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
io_tlb_nslabs = 0;

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Pavel Machek <pa...@ucw.cz>
Acked-by: "Rafael J. Wysocki" <r...@sisk.pl>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
kernel/power/snapshot.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b38109e..917cbd4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -637,7 +637,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
BUG_ON(!region);
} else
/* This allocation cannot fail */
- region = alloc_bootmem(sizeof(struct nosave_region));
+ region = memblock_virt_alloc(sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);

Santosh Shilimkar, Dec 2, 2013, 9:40:03 PM

Introduce memblock memory allocation APIs which make it possible to
support the PAE or LPAE extensions on 32-bit archs where the physical
memory start address can be beyond 4GB. In such cases, the existing
bootmem APIs, which operate on 32-bit addresses, won't work, and the
memblock layer, which operates on 64-bit addresses, is needed.
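
To make the addressing limitation concrete, a hedged sketch (the RAM
base is hypothetical, for an LPAE system whose memory starts above the
32-bit boundary):

	/* hypothetical LPAE layout: RAM begins at a 36-bit address */
	phys_addr_t ram_base = 0x800000000ULL;

	/*
	 * bootmem goal/limit parameters are 'unsigned long', i.e. 32 bits
	 * on such an arch, so the address is silently truncated to 0.
	 */
	unsigned long goal = ram_base;

	/*
	 * memblock works in phys_addr_t, which is 64-bit when the arch
	 * selects ARCH_PHYS_ADDR_T_64BIT, so the full address survives.
	 */
	phys_addr_t min_addr = ram_base;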

So we add equivalent APIs so that we can replace usage of bootmem
with memblock interfaces. Architectures already converted to NO_BOOTMEM
use these new interfaces and other which still uses bootmem, these new
APIs just fallback to exiting bootmem APIs. So no functional change as
such.

In long run, once all the architectures move to NO_BOOTMEM, we can get rid of
bootmem layer completely. This is one step to remove the core code dependency
with bootmem and also gives a path for architectures to move away from bootmem.

The proposed interface becomes active if both CONFIG_HAVE_MEMBLOCK and
CONFIG_NO_BOOTMEM are specified by the arch. In the !CONFIG_NO_BOOTMEM
case, the memblock wrappers fall back to the existing bootmem APIs so
that archs not converted to NO_BOOTMEM continue to work as is.

The meaning of MEMBLOCK_ALLOC_ACCESSIBLE and MEMBLOCK_ALLOC_ANYWHERE is
kept the same.
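
A hedged usage sketch of the new family (the init helper, sizes and nid
below are made up; the calls match the wrappers added by this patch):

#include <linux/bootmem.h>

static int __init example_early_setup(void)
{
	void *buf, *node_buf;

	/* default alignment (SMP_CACHE_BYTES); panics on failure */
	buf = memblock_virt_alloc(4096);

	/* node-local allocation that returns NULL instead of panicking */
	node_buf = memblock_virt_alloc_node_nopanic(4096, 0 /* nid */);
	if (!node_buf)
		return -ENOMEM;

	/* hand an early allocation back before the buddy allocator is up */
	memblock_free_early(__pa(buf), 4096);
	return 0;
}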

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
include/linux/bootmem.h | 88 +++++++++++++++++++++
mm/memblock.c | 195 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 283 insertions(+)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 55d52fb..d333ac4 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -141,6 +141,94 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)

+
+#if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM)
+
+/* FIXME: use MEMBLOCK_ALLOC_* variants here */
+#define BOOTMEM_ALLOC_ACCESSIBLE 0
+#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)
+
+/*
+ * FIXME: use NUMA_NO_NODE instead of MAX_NUMNODES when bootmem/nobootmem code
+ * will be removed.
+ * It can't be done now, because when MEMBLOCK or NO_BOOTMEM are not enabled
+ * all calls of the new API will be redirected to bootmem/nobootmem where
+ * MAX_NUMNODES is widely used.
+ * Also, memblock core APIs __next_free_mem_range_rev() and
+ * __next_free_mem_range() would need to be updated, and as result we will
+ * need to re-check/update all direct calls of memblock_alloc_xxx()
+ * APIs (including nobootmem).
+ */
+
+/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
+void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
+ phys_addr_t align, phys_addr_t from,
+ phys_addr_t max_addr, int nid);
+void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align,
+ phys_addr_t from, phys_addr_t max_addr, int nid);
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
+#define memblock_virt_alloc(x) \
+ memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+#define memblock_virt_alloc_align(x, align) \
+ memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+#define memblock_virt_alloc_nopanic(x) \
+ memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
+ BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, \
+ MAX_NUMNODES)
+#define memblock_virt_alloc_align_nopanic(x, align) \
+ memblock_virt_alloc_try_nid_nopanic(x, align, \
+ BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, \
+ MAX_NUMNODES)
+#define memblock_virt_alloc_node(x, nid) \
+ memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, nid)
+#define memblock_virt_alloc_node_nopanic(x, nid) \
+ memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
+ BOOTMEM_LOW_LIMIT, \
+ BOOTMEM_ALLOC_ACCESSIBLE, nid)
+
+#define memblock_free_early(x, s) __memblock_free_early(x, s)
+#define memblock_free_early_nid(x, s, nid) __memblock_free_early(x, s)
+#define memblock_free_late(x, s) __memblock_free_late(x, s)
+
+#else
+
+#define BOOTMEM_ALLOC_ACCESSIBLE 0
+
+
+/* Fall back to all the existing bootmem APIs */
+#define memblock_virt_alloc(x) \
+ __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_align(x, align) \
+ __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_nopanic(x) \
+ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_align_nopanic(x, align) \
+ __alloc_bootmem_nopanic(x, align, BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_node(x, nid) \
+ __alloc_bootmem_node(NODE_DATA(nid), x, SMP_CACHE_BYTES, \
+ BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_node_nopanic(x, nid) \
+ __alloc_bootmem_node_nopanic(NODE_DATA(nid), x, SMP_CACHE_BYTES, \
+ BOOTMEM_LOW_LIMIT)
+#define memblock_virt_alloc_try_nid(size, align, from, max_addr, nid) \
+ __alloc_bootmem_node_high(NODE_DATA(nid), size, align, from)
+#define memblock_virt_alloc_try_nid_nopanic(size, align, from, max_addr, nid) \
+ ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, \
+ from, max_addr)
+#define memblock_free_early(x, s) free_bootmem(x, s)
+#define memblock_free_early_nid(x, s, nid) \
+ free_bootmem_node(NODE_DATA(nid), x, s)
+#define memblock_free_late(x, s) free_bootmem_late(x, s)
+
+#endif /* defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) */
+
#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
extern void *alloc_remap(int nid, unsigned long size);
#else
diff --git a/mm/memblock.c b/mm/memblock.c
index 1d15e07..3311fbb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -21,6 +21,9 @@
#include <linux/memblock.h>

#include <asm-generic/sections.h>
+#include <asm/io.h>
+
+#include "internal.h"

static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -933,6 +936,198 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}

+/**
+ * _memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @from: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * The @from limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @from.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The phys address of allocated boot memory block is converted to virtual and
+ * allocated memory is reset to 0.
+ *
+ * In addition, the function sets the min_count for the allocated boot
+ * memory block to 0 so that it is never reported as a leak.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init _memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t from, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t alloc;
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available())) {
+ if (nid == MAX_NUMNODES)
+ return kzalloc(size, GFP_NOWAIT);
+ else
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+ }
+
+ if (!align)
+ align = SMP_CACHE_BYTES;
+
+ /* align @size to avoid excessive fragmentation on reserved array */
+ size = round_up(size, align);
+
+again:
+ alloc = memblock_find_in_range_node(from, max_addr, size, align, nid);
+ if (alloc)
+ goto done;
+
+ if (nid != MAX_NUMNODES) {
+ alloc = memblock_find_in_range_node(from, max_addr, size,
+ align, MAX_NUMNODES);
+ if (alloc)
+ goto done;
+ }
+
+ if (from) {
+ from = 0;
+ goto again;
+ } else {
+ goto error;
+ }
+
+done:
+ memblock_reserve(alloc, size);
+ ptr = phys_to_virt(alloc);
+ memset(ptr, 0, size);
+
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+
+ return ptr;
+
+error:
+ return NULL;
+}
+
+/**
+ * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @from: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
+ * additional debug information (including caller info), if enabled.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_nopanic(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t from, phys_addr_t max_addr,
+ int nid)
+{
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)from,
+ (u64)max_addr, (void *)_RET_IP_);
+ return _memblock_virt_alloc_try_nid_nopanic(size,
+ align, from, max_addr, nid);
+}
+
+/**
+ * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @from: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ *
+ * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
+ * which provides debug information (including caller info), if enabled,
+ * and panics if the request can not be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t from, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)from,
+ (u64)max_addr, (void *)_RET_IP_);
+ ptr = _memblock_virt_alloc_try_nid_nopanic(size,
+ align, from, max_addr, nid);
+ if (ptr)
+ return ptr;
+
+ panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
+ __func__, (u64)size, (u64)align, nid, (u64)from, (u64)max_addr);
+ return NULL;
+}
+
+/**
+ * __memblock_free_early - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
+void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
+{
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ __memblock_remove(&memblock.reserved, base, size);
+}
+
+/*
+ * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * @addr: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are released directly
+ * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+{
+ u64 cursor, end;
+
+ memblock_dbg("%s: [%#016llx-%#016llx] %pF\n",
+ __func__, (u64)base, (u64)base + size - 1,
+ (void *)_RET_IP_);
+ kmemleak_free_part(__va(base), size);
+ cursor = PFN_UP(base);
+ end = PFN_DOWN(base + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}

/*
* Remaining API functions

Santosh Shilimkar, Dec 2, 2013, 9:40:04 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/page_alloc.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f0..68a30f6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4210,7 +4210,6 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;

/*
@@ -4226,7 +4225,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)

if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node_nopanic(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4346,13 +4346,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif

/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4364,9 +4365,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);

if (start_pfn < end_pfn)
- free_bootmem_node(NODE_DATA(this_nid),
- PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT);
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}

@@ -4637,8 +4638,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
- usemapsize);
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4832,7 +4834,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node_nopanic(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5858,7 +5861,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem_nopanic(size);
+ table = memblock_virt_alloc_nopanic(size);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {

Santosh Shilimkar, Dec 2, 2013, 9:40:04 PM

From: Grygorii Strashko <grygorii...@ti.com>

When debugging is enabled (the cmdline has "memblock=debug"), memblock
displays the upper memory boundary of each allocated/freed memory range
incorrectly. For example:
memblock_reserve: [0x0000009e7e8000-0x0000009e7ed000] _memblock_early_alloc_try_nid_nopanic+0xfc/0x12c

That is, 0x0000009e7ed000 is displayed instead of 0x0000009e7ecfff.

Hence, correct this by changing the formula used to calculate the upper
memory boundary to (u64)base + size - 1 instead of (u64)base + size
everywhere in the debug messages.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Acked-by: Tejun Heo <t...@kernel.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/memblock.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477b..aab5669 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -643,7 +643,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
+ (unsigned long long)base + size - 1,
(void *)_RET_IP_);

return __memblock_remove(&memblock.reserved, base, size);
@@ -655,7 +655,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)

memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n",
(unsigned long long)base,
- (unsigned long long)base + size,
+ (unsigned long long)base + size - 1,
(void *)_RET_IP_);

return memblock_add_region(_rgn, base, size, MAX_NUMNODES);

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
lib/cpumask.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index d327b87..44e492e 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -140,7 +140,7 @@ EXPORT_SYMBOL(zalloc_cpumask_var);
*/
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
- *mask = alloc_bootmem(cpumask_size());
+ *mask = memblock_virt_alloc(cpumask_size());
}

/**
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(free_cpumask_var);
*/
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
- free_bootmem(__pa(mask), cpumask_size());
+ memblock_free_early(__pa(mask), cpumask_size());
}
#endif

Santosh Shilimkar, Dec 2, 2013, 9:40:02 PM

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior from the bootmem
users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. For the
archs which still use bootmem, these new APIs simply fall back to the
existing bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Paul Walmsley <pa...@pwsan.com>
Cc: Tony Lindgren <to...@atomide.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
arch/arm/mach-omap2/omap_hwmod.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c
index e3f0eca..92d11e2 100644
--- a/arch/arm/mach-omap2/omap_hwmod.c
+++ b/arch/arm/mach-omap2/omap_hwmod.c
@@ -2695,9 +2695,7 @@ static int __init _alloc_links(struct omap_hwmod_link **ml,
sz = sizeof(struct omap_hwmod_link) * LINKS_PER_OCP_IF;

*sl = NULL;
- *ml = alloc_bootmem(sz);
-
- memset(*ml, 0, sz);
+ *ml = memblock_virt_alloc(sz);

*sl = (void *)(*ml) + sizeof(struct omap_hwmod_link);

@@ -2816,9 +2814,7 @@ static int __init _alloc_linkspace(struct omap_hwmod_ocp_if **ois)
pr_debug("omap_hwmod: %s: allocating %d byte linkspace (%d links)\n",
__func__, sz, max_ls);

- linkspace = alloc_bootmem(sz);
-
- memset(linkspace, 0, sz);
+ linkspace = memblock_virt_alloc(sz);

return 0;

Tejun Heo, Dec 3, 2013, 6:00:02 PM

On Mon, Dec 02, 2013 at 09:27:21PM -0500, Santosh Shilimkar wrote:
> From: Grygorii Strashko <grygorii...@ti.com>
>
> Clean-up to remove the dependency on bootmem headers.
>
> Cc: Yinghai Lu <yin...@kernel.org>
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Andrew Morton <ak...@linux-foundation.org>
> Cc: Arnd Bergmann <ar...@arndb.de>
> Cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
> Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
> Signed-off-by: Santosh Shilimkar <santosh....@ti.com>

Please merge 4-6 into a single patch.

Thanks.

--
tejun

Tejun Heo, Dec 3, 2013, 6:00:02 PM

On Mon, Dec 02, 2013 at 09:27:17PM -0500, Santosh Shilimkar wrote:
...
> Cc: Yinghai Lu <yin...@kernel.org>
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Andrew Morton <ak...@linux-foundation.org>
> Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
> Signed-off-by: Santosh Shilimkar <santosh....@ti.com>

Reviewed-by: Tejun Heo <t...@kernel.org>

> + /*
> + * Don't allow Nobootmem allocator to free reserved memory regions

Extreme nitpick: why the capitalization of "Nobootmem"?

Thanks.

--
tejun

Tejun Heo, Dec 3, 2013, 6:00:02 PM

On Mon, Dec 02, 2013 at 09:27:22PM -0500, Santosh Shilimkar wrote:
> From: Grygorii Strashko <grygorii...@ti.com>
>
> Drop the WARN and use SMP_CACHE_BYTES as the default alignment in
> memblock_alloc_base_nid(), as recommended by Tejun Heo in
> https://lkml.org/lkml/2013/10/13/117.

Can you please add description on why this change is being made? This
is in preparation of common alloc interface, right? The patch
description is kinda out-of-blue.

Thanks.

--
tejun

Tejun Heo, Dec 3, 2013, 6:00:02 PM

On Mon, Dec 02, 2013 at 09:27:18PM -0500, Santosh Shilimkar wrote:
> From: Grygorii Strashko <grygorii...@ti.com>
>
> __free_pages_bootmem() is used internally by the MM core and is already
> declared in mm/internal.h, so remove the duplicate declaration.
>
> Cc: Yinghai Lu <yin...@kernel.org>
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Andrew Morton <ak...@linux-foundation.org>
> Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
> Signed-off-by: Santosh Shilimkar <santosh....@ti.com>

Reviewed-by: Tejun Heo <t...@kernel.org>

--
tejun

Tejun Heo

unread,
Dec 3, 2013, 6:00:03 PM12/3/13
to
On Mon, Dec 02, 2013 at 09:27:19PM -0500, Santosh Shilimkar wrote:
> From: Grygorii Strashko <grygorii...@ti.com>
>
> Clean-up to remove depedency with bootmem headers.
>

Tejun Heo

unread,
Dec 3, 2013, 6:30:02 PM12/3/13
to
Hello,

On Mon, Dec 02, 2013 at 09:27:23PM -0500, Santosh Shilimkar wrote:
> So we add equivalent APIs so that we can replace usage of bootmem
> with memblock interfaces. Architectures already converted to NO_BOOTMEM
> use these new interfaces and other which still uses bootmem, these new
> APIs just fallback to exiting bootmem APIs. So no functional change as
> such.

The last part of the second last sentence doesn't parse too well. I
think it'd be worthwhile to improve and preferably expand on it as
this is a bit tricky to understand given the twisted state of early
memory allocation.

> In long run, once all the achitectures moves to NO_BOOTMEM, we can get rid of
> bootmem layer completely. This is one step to remove the core code dependency
> with bootmem and also gives path for architectures to move away from bootmem.

Lines too long?

> +/*
> + * FIXME: use NUMA_NO_NODE instead of MAX_NUMNODES when bootmem/nobootmem code
> + * will be removed.
> + * It can't be done now, because when MEMBLOCK or NO_BOOTMEM are not enabled
> + * all calls of the new API will be redirected to bottmem/nobootmem where
> + * MAX_NUMNODES is widely used.

I don't know. We're introducing a new API which will be used across
the kernel. I don't think it makes a lot of sense to use the wrong
constant now to convert all the users later. Wouldn't it be better to
make the new interface take NUMA_NO_NODE and do whatever it needs to
do to interface with bootmem?

> + * Also, memblock core APIs __next_free_mem_range_rev() and
> + * __next_free_mem_range() would need to be updated, and as result we will
> + * need to re-check/update all direct calls of memblock_alloc_xxx()
> + * APIs (including nobootmem).
> + */

Hmmm....

> +/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
> +void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
> + phys_addr_t align, phys_addr_t from,
> + phys_addr_t max_addr, int nid);

Wouldn't @min_addr instead of @from make more sense? Ditto for other
occurrences.

> +void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align,
> + phys_addr_t from, phys_addr_t max_addr, int nid);
> +void __memblock_free_early(phys_addr_t base, phys_addr_t size);
> +void __memblock_free_late(phys_addr_t base, phys_addr_t size);
> +
> +#define memblock_virt_alloc(x) \
> + memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)

The underlying function interprets 0 as the default align, so it
probably is a better idea to just use 0 here.

> +#define memblock_virt_alloc_align(x, align) \
> + memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)

Also, do we really need this align variant separate when the caller
can simply specify 0 for the default?

> +#define memblock_virt_alloc_nopanic(x) \
> + memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
> + BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, \
> + MAX_NUMNODES)
> +#define memblock_virt_alloc_align_nopanic(x, align) \
> + memblock_virt_alloc_try_nid_nopanic(x, align, \
> + BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, \
> + MAX_NUMNODES)
> +#define memblock_virt_alloc_node(x, nid) \
> + memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, nid)
> +#define memblock_virt_alloc_node_nopanic(x, nid) \
> + memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
> + BOOTMEM_LOW_LIMIT, \
> + BOOTMEM_ALLOC_ACCESSIBLE, nid)
> +
> +#define memblock_free_early(x, s) __memblock_free_early(x, s)
> +#define memblock_free_early_nid(x, s, nid) __memblock_free_early(x, s)
> +#define memblock_free_late(x, s) __memblock_free_late(x, s)

Please make the wrappers inline functions.
Ditto.
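
For example, a minimal sketch of what the inline form of the first
wrapper could look like (same semantics as the macro quoted above, names
as currently proposed in this series, not final code):

	static inline void * __init memblock_virt_alloc(phys_addr_t size)
	{
		return memblock_virt_alloc_try_nid(size, SMP_CACHE_BYTES,
						   BOOTMEM_LOW_LIMIT,
						   BOOTMEM_ALLOC_ACCESSIBLE,
						   MAX_NUMNODES);
	}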

> diff --git a/mm/memblock.c b/mm/memblock.c
> index 1d15e07..3311fbb 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -21,6 +21,9 @@
> #include <linux/memblock.h>
>
> #include <asm-generic/sections.h>
> +#include <asm/io.h>
> +
> +#include "internal.h"
>
> static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
> static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
> @@ -933,6 +936,198 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
> return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
> }
>
> +/**
> + * _memblock_virt_alloc_try_nid_nopanic - allocate boot memory block

Please don't use both "__" and "_" prefixes. It gets confusing like
hell. Just give it another name.

> + * @size: size of memory block to be allocated in bytes
> + * @align: alignment of the region and block's size
> + * @from: the lower bound of the memory region from where the allocation
> + * is preferred (phys address)
> + * @max_addr: the upper bound of the memory region from where the allocation
> + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
> + * allocate only from memory limited by memblock.current_limit value

It probably would be better style to make the above shorter and fit
each on a single line. If they need further explanation, they can be
done in the body of the comment.

> + * @nid: nid of the free area to find, %MAX_NUMNODES for any node
> + *
> + * The @from limit is dropped if it can not be satisfied and the allocation
> + * will fall back to memory below @from.
> + *
> + * Allocation may fall back to any node in the system if the specified node
> + * can not hold the requested memory.

Maybe combine the above two paragraphs?

> + * The phys address of allocated boot memory block is converted to virtual and
> + * allocated memory is reset to 0.
> + *
> + * In addition, function sets sets the min_count for allocated boot memory block

^^^^^^^^^
No mention of kmemleak at all is a bit confusing. min_count of what?
Not your fault but we probably wanna update these functions so that
their param orders are consistent.
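
(If this is the kmemleak min_count, the allocator presumably ends with
something along the lines of

	kmemleak_alloc(ptr, size, 0, 0);	/* min_count == 0: never reported as a leak */

and the comment should spell that out.)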

Thanks.

--
tejun

Santosh Shilimkar

unread,
Dec 4, 2013, 10:00:02 AM12/4/13
to
On Tuesday 03 December 2013 05:55 PM, Tejun Heo wrote:
> On Mon, Dec 02, 2013 at 09:27:21PM -0500, Santosh Shilimkar wrote:
>> From: Grygorii Strashko <grygorii...@ti.com>
>>
>> Clean-up to remove depedency with bootmem headers.
>>
>> Cc: Yinghai Lu <yin...@kernel.org>
>> Cc: Tejun Heo <t...@kernel.org>
>> Cc: Andrew Morton <ak...@linux-foundation.org>
>> Cc: Arnd Bergmann <ar...@arndb.de>
>> Cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
>> Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
>> Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
>
> Please merge 4-6 into a single patch.
>
Will do

Santosh Shilimkar

unread,
Dec 4, 2013, 10:00:02 AM12/4/13
to
On Tuesday 03 December 2013 05:52 PM, Tejun Heo wrote:
> On Mon, Dec 02, 2013 at 09:27:17PM -0500, Santosh Shilimkar wrote:
> ...
>> Cc: Yinghai Lu <yin...@kernel.org>
>> Cc: Tejun Heo <t...@kernel.org>
>> Cc: Andrew Morton <ak...@linux-foundation.org>
>> Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
>> Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
>
> Reviewed-by: Tejun Heo <t...@kernel.org>
>
>> + /*
>> + * Don't allow Nobootmem allocator to free reserved memory regions
>
> Extreme nitpick: why the capitalization of "Nobootmem"?
>
Will fix that

Santosh Shilimkar

unread,
Dec 4, 2013, 11:00:02 AM12/4/13
to
On Tuesday 03 December 2013 06:24 PM, Tejun Heo wrote:
> Hello,
>
> On Mon, Dec 02, 2013 at 09:27:23PM -0500, Santosh Shilimkar wrote:
>> So we add equivalent APIs so that we can replace usage of bootmem
>> with memblock interfaces. Architectures already converted to NO_BOOTMEM
>> use these new interfaces and other which still uses bootmem, these new
>> APIs just fallback to exiting bootmem APIs. So no functional change as
>> such.
>
> The last part of the second last sentence doesn't parse too well. I
> think it'd be worthwhile to improve and preferably expand on it as
> this is a bit tricky to understand given the twisted state of early
> memory allocation.
>
OK. Will expand a bit more. Also agree with the rest of the comments and
will fix accordingly, except one ;-)

>
>> +/*
>> + * FIXME: use NUMA_NO_NODE instead of MAX_NUMNODES when bootmem/nobootmem code
>> + * will be removed.
>> + * It can't be done now, because when MEMBLOCK or NO_BOOTMEM are not enabled
>> + * all calls of the new API will be redirected to bottmem/nobootmem where
>> + * MAX_NUMNODES is widely used.
>
> I don't know. We're introducing a new API which will be used across
> the kernel. I don't think it makes a lot of sense to use the wrong
> constant now to convert all the users later. Wouldn't it be better to
> make the new interface take NUMA_NO_NODE and do whatever it needs to
> do to interface with bootmem?
>
Well, as you know, there are architectures still using bootmem even after
this series. Changing MAX_NUMNODES to NUMA_NO_NODE is too invasive and
actually should be done in a separate series. As commented, the best
time to do that would be when all remaining architectures move to
memblock.

Just to give you perspective, look at the patch at the end of this email, which
Grygorii cooked up. It doesn't cover all the users of MAX_NUMNODES,
and we are not even sure whether the change is correct, or about its
impact on the code, which we can't even test. I would really want to
avoid touching all the architectures and keep the scope of the series
to core code, as we aligned initially.

Maybe you have a better idea to handle this change, so do
let us know how to proceed with it. With such an invasive change the
$subject series can easily get into circles again :-(

Regards,
Santosh

---
arch/ia64/mm/discontig.c | 2 +-
arch/powerpc/mm/numa.c | 2 +-
arch/s390/mm/init.c | 2 +-
arch/sparc/mm/init_64.c | 2 +-
arch/x86/kernel/check.c | 2 +-
arch/x86/kernel/e820.c | 4 ++--
arch/x86/mm/init.c | 2 +-
arch/x86/mm/init_32.c | 2 +-
arch/x86/mm/init_64.c | 2 +-
arch/x86/mm/memtest.c | 2 +-
arch/x86/mm/numa.c | 2 +-
include/linux/bootmem.h | 20 ++++----------------
include/linux/memblock.h | 6 +++---
mm/memblock.c | 43 ++++++++++++++++++++-----------------------
mm/nobootmem.c | 8 ++++----
mm/page_alloc.c | 18 +++++++++---------
mm/percpu.c | 4 ++--
17 files changed, 54 insertions(+), 69 deletions(-)

diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 2de08f4..81ec37c 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -764,7 +764,7 @@ void __init paging_init(void)

efi_memmap_walk(filter_rsvd_memory, count_node_pages);

- sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_memory_present_with_active_regions(NUMA_NO_NODE);
sparse_init();

#ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 078d3e0..817a8b5 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -142,7 +142,7 @@ static void __init get_node_active_region(unsigned long pfn,
unsigned long start_pfn, end_pfn;
int i, nid;

- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, &nid) {
if (pfn >= start_pfn && pfn < end_pfn) {
node_ar->nid = nid;
node_ar->start_pfn = start_pfn;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ad446b0..f06220f 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -126,7 +126,7 @@ void __init paging_init(void)

atomic_set(&init_mm.context.attach_count, 1);

- sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_memory_present_with_active_regions(NUMA_NO_NODE);
sparse_init();
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5322e53..5b9458a 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -1346,7 +1346,7 @@ static unsigned long __init bootmem_init(unsigned long phys_base)

/* XXX cpu notifier XXX */

- sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_memory_present_with_active_regions(NUMA_NO_NODE);
sparse_init();

return end_pfn;
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7..83a7995 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)

corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);

- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5f..050b01e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1114,13 +1114,13 @@ void __init memblock_find_dma_reserve(void)
* need to use memblock to get free size in [0, MAX_DMA_PFN]
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, NULL) {
start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
nr_pages += end_pfn - start_pfn;
}

- for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f971306..ce959fa 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -379,7 +379,7 @@ static unsigned long __init init_range_memory_mapping(
unsigned long mapped_ram_size = 0;
int i;

- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, NULL) {
u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
if (start >= end)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 4287f1f..920e3bc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -706,7 +706,7 @@ void __init paging_init(void)
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_memory_present_with_active_regions(NUMA_NO_NODE);
sparse_init();
zone_sizes_init();
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 104d56a..3d5ab67 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -649,7 +649,7 @@ void __init initmem_init(void)

void __init paging_init(void)
{
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
+ sparse_memory_present_with_active_regions(NUMA_NO_NODE);
sparse_init();

/*
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 8dabbed..1e9da79 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -74,7 +74,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
u64 i;
phys_addr_t this_start, this_end;

- for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
this_start = clamp_t(phys_addr_t, this_start, start, end);
this_end = clamp_t(phys_addr_t, this_end, start, end);
if (this_start < this_end) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 24aec58..b4ec91a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -561,7 +561,7 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, NUMA_NO_NODE));
numa_reset_distance();

ret = init_func();
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index d333ac4..b518b75 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -148,18 +148,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
#define BOOTMEM_ALLOC_ACCESSIBLE 0
#define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0)

-/*
- * FIXME: use NUMA_NO_NODE instead of MAX_NUMNODES when bootmem/nobootmem code
- * will be removed.
- * It can't be done now, because when MEMBLOCK or NO_BOOTMEM are not enabled
- * all calls of the new API will be redirected to bottmem/nobootmem where
- * MAX_NUMNODES is widely used.
- * Also, memblock core APIs __next_free_mem_range_rev() and
- * __next_free_mem_range() would need to be updated, and as result we will
- * need to re-check/update all direct calls of memblock_alloc_xxx()
- * APIs (including nobootmem).
- */
-
/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
phys_addr_t align, phys_addr_t from,
@@ -171,20 +159,20 @@ void __memblock_free_late(phys_addr_t base, phys_addr_t size);

#define memblock_virt_alloc(x) \
memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
- BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+ BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE)
#define memblock_virt_alloc_align(x, align) \
memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
- BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+ BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE)
#define memblock_virt_alloc_nopanic(x) \
memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, \
- MAX_NUMNODES)
+ NUMA_NO_NODE)
#define memblock_virt_alloc_align_nopanic(x, align) \
memblock_virt_alloc_try_nid_nopanic(x, align, \
BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, \
- MAX_NUMNODES)
+ NUMA_NO_NODE)
#define memblock_virt_alloc_node(x, nid) \
memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, nid)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 77c60e5..c3b8c1f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -69,7 +69,7 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
/**
* for_each_mem_pfn_range - early memory pfn range iterator
* @i: an integer used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to ulong for start pfn of the range, can be %NULL
* @p_end: ptr to ulong for end pfn of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
@@ -87,7 +87,7 @@ void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start,
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
@@ -107,7 +107,7 @@ void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start,
/**
* for_each_free_mem_range_reverse - rev-iterate through free memblock areas
* @i: u64 used as loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @p_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memblock.c b/mm/memblock.c
index 3311fbb..e2de30f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -94,7 +94,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -126,7 +126,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -161,7 +161,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @size: size of free area to find
* @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -242,7 +242,7 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t align)
{
return memblock_find_in_range_node(start, end, size, align,
- MAX_NUMNODES);
+ NUMA_NO_NODE);
}

static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -258,7 +258,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->cnt = 1;
type->regions[0].base = 0;
type->regions[0].size = 0;
- memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
+ memblock_set_region_node(&type->regions[0], NUMA_NO_NODE);
}
}

@@ -558,7 +558,7 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,

int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES);
+ return memblock_add_region(&memblock.memory, base, size, NUMA_NO_NODE);
}

/**
@@ -674,13 +674,13 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
(unsigned long long)base + size - 1,
(void *)_RET_IP_);

- return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
+ return memblock_add_region(_rgn, base, size, NUMA_NO_NODE);
}

/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
@@ -715,7 +715,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;

/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
continue;

/* scan areas before each reservation for intersection */
@@ -756,7 +756,7 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
/**
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
@@ -783,7 +783,7 @@ void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;

/* only memory regions are associated with nodes, check it */
- if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m))
+ if (nid != NUMA_NO_NODE && nid != memblock_get_region_node(m))
continue;

/* scan areas before each reservation for intersection */
@@ -833,7 +833,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,

if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
- if (nid == MAX_NUMNODES || nid == r->nid)
+ if (nid == NUMA_NO_NODE || nid == r->nid)
break;
}
if (*idx >= type->cnt) {
@@ -906,7 +906,7 @@ phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int n

phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
}

phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -945,7 +945,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* The @from limit is dropped if it can not be satisfied and the allocation
* will fall back to memory below @from.
@@ -971,10 +971,7 @@ static void * __init _memblock_virt_alloc_try_nid_nopanic(
void *ptr;

if (WARN_ON_ONCE(slab_is_available())) {
- if (nid == MAX_NUMNODES)
- return kzalloc(size, GFP_NOWAIT);
- else
- return kzalloc_node(size, GFP_NOWAIT, nid);
+ return kzalloc_node(size, GFP_NOWAIT, nid);
}

if (!align)
@@ -988,9 +985,9 @@ again:
if (alloc)
goto done;

- if (nid != MAX_NUMNODES) {
+ if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(from, max_addr, size,
- align, MAX_NUMNODES);
+ align, NUMA_NO_NODE);
if (alloc)
goto done;
}
@@ -1028,7 +1025,7 @@ error:
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
* additional debug information (including caller info), if enabled.
@@ -1057,7 +1054,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
* which provides debug information (including caller info), if enabled,
@@ -1320,7 +1317,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
base = rgn->base;
size = rgn->size;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- if (memblock_get_region_node(rgn) != MAX_NUMNODES)
+ if (memblock_get_region_node(rgn) != NUMA_NO_NODE)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
#endif
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d3..3bf678c 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -117,7 +117,7 @@ static unsigned long __init free_low_memory_core_early(void)
phys_addr_t start, end, size;
u64 i;

- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);

/* free range that is used for reserved array if we allocate it */
@@ -161,7 +161,7 @@ unsigned long __init free_all_bootmem(void)
reset_all_zones_managed_pages();

/*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
*/
@@ -215,7 +215,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,

restart:

- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);

if (ptr)
return ptr;
@@ -299,7 +299,7 @@ again:
if (ptr)
return ptr;

- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
goal, limit);
if (ptr)
return ptr;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 68a30f6..fff0035 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4347,7 +4347,7 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)

/**
* free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @nid: The node to free memory on. If NUMA_NO_NODE, all nodes are freed.
* @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
* If an architecture guarantees that all ranges registered with
@@ -4373,7 +4373,7 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)

/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
+ * @nid: The node to call memory_present for. If NUMA_NO_NODE, all nodes will be used.
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
@@ -4390,7 +4390,7 @@ void __init sparse_memory_present_with_active_regions(int nid)

/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @nid: The nid to return the range for. If NUMA_NO_NODE, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
@@ -4506,7 +4506,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
}

/*
- * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * Return the number of holes in a range on a node. If nid is NUMA_NO_NODE,
* then all holes in the requested range will be accounted for.
*/
unsigned long __meminit __absent_pages_in_range(int nid,
@@ -4535,7 +4535,7 @@ unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
{
- return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+ return __absent_pages_in_range(NUMA_NO_NODE, start_pfn, end_pfn);
}

/* Return the number of page frames in holes in a zone on a node */
@@ -4926,7 +4926,7 @@ unsigned long __init node_map_pfn_alignment(void)
int last_nid = -1;
int i, nid;

- for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start, &end, &nid) {
if (!start || last_nid < 0 || last_nid == nid) {
last_nid = nid;
last_end = end;
@@ -4977,7 +4977,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
- return find_min_pfn_for_node(MAX_NUMNODES);
+ return find_min_pfn_for_node(NUMA_NO_NODE);
}

/*
@@ -4991,7 +4991,7 @@ static unsigned long __init early_calculate_totalpages(void)
unsigned long start_pfn, end_pfn;
int i, nid;

- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, &nid) {
unsigned long pages = end_pfn - start_pfn;

totalpages += pages;
@@ -5231,7 +5231,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)

/* Print out the early node map */
printk("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ for_each_mem_pfn_range(i, NUMA_NO_NODE, &start_pfn, &end_pfn, &nid)
printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);

diff --git a/mm/percpu.c b/mm/percpu.c
index f74902c..f7cc387 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1853,7 +1853,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
return memblock_virt_alloc_try_nid_nopanic(size, align,
__pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE,
- MAX_NUMNODES);
+ NUMA_NO_NODE);
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
@@ -1905,7 +1905,7 @@ void __init setup_per_cpu_areas(void)
PAGE_SIZE,
__pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE,
- MAX_NUMNODES);
+ NUMA_NO_NODE);
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
--
1.7.9.5


Tejun Heo

unread,
Dec 4, 2013, 11:10:01 AM12/4/13
to
Hello,

On Wed, Dec 04, 2013 at 10:54:47AM -0500, Santosh Shilimkar wrote:
> Well, as you know, there are architectures still using bootmem even after
> this series. Changing MAX_NUMNODES to NUMA_NO_NODE is too invasive and
> actually should be done in a separate series. As commented, the best
> time to do that would be when all remaining architectures move to
> memblock.
>
> Just to give you perspective, look at the patch at the end of this email, which
> Grygorii cooked up. It doesn't cover all the users of MAX_NUMNODES,
> and we are not even sure whether the change is correct, or about its
> impact on the code, which we can't even test. I would really want to
> avoid touching all the architectures and keep the scope of the series
> to core code, as we aligned initially.
>
> Maybe you have a better idea to handle this change, so do
> let us know how to proceed with it. With such an invasive change the
> $subject series can easily get into circles again :-(

But we don't have to use MAX_NUMNODES for the new interface, no? Or
do you think that it'd be more confusing because it ends up mixing the
two? It kinda really bothers me this patchset is expanding the usage
of the wrong constant with only very far-out plan to fix that. All
archs converting to nobootmem will take a *long* time, that is, if
that happens at all. I don't really care about the order of things
happening but "this is gonna be fixed when everyone moves off
MAX_NUMNODES" really isn't good enough.

Thanks.

--
tejun

Santosh Shilimkar

unread,
Dec 4, 2013, 11:50:04 AM12/4/13
to
On Wednesday 04 December 2013 11:07 AM, Tejun Heo wrote:
> Hello,
>
> On Wed, Dec 04, 2013 at 10:54:47AM -0500, Santosh Shilimkar wrote:
>> Well, as you know, there are architectures still using bootmem even after
>> this series. Changing MAX_NUMNODES to NUMA_NO_NODE is too invasive and
>> actually should be done in a separate series. As commented, the best
>> time to do that would be when all remaining architectures move to
>> memblock.
>>
>> Just to give you perspective, look at the patch at the end of this email, which
>> Grygorii cooked up. It doesn't cover all the users of MAX_NUMNODES,
>> and we are not even sure whether the change is correct, or about its
>> impact on the code, which we can't even test. I would really want to
>> avoid touching all the architectures and keep the scope of the series
>> to core code, as we aligned initially.
>>
>> Maybe you have a better idea to handle this change, so do
>> let us know how to proceed with it. With such an invasive change the
>> $subject series can easily get into circles again :-(
>
> But we don't have to use MAX_NUMNODES for the new interface, no? Or
> do you think that it'd be more confusing because it ends up mixing the
> two?
The issue is that the memblock code is already using MAX_NUMNODES. Please
look at __next_free_mem_range() and __next_free_mem_range_rev().
The new API uses the above APIs and hence uses MAX_NUMNODES. If the
usage of this constant were consistent across bootmem and memblock
then we wouldn't have had the whole confusion.

> It kinda really bothers me this patchset is expanding the usage
> of the wrong constant with only very far-out plan to fix that. All
> archs converting to nobootmem will take a *long* time, that is, if
> that happens at all. I don't really care about the order of things
> happening but "this is gonna be fixed when everyone moves off
> MAX_NUMNODES" really isn't good enough.
>
Fair enough, though the patchset continues to use the constant
which is already used by a few memblock APIs ;-)

If we can fix __next_free_mem_range() and __next_free_mem_range_rev()
to not use MAX_NUMNODES, then we can potentially avoid the wrong
usage of the constant.

regards,
Santosh

Grygorii Strashko

unread,
Dec 5, 2013, 7:20:02 AM12/5/13
to
Hi Tejun,
I'll try to provide more technical details here.
As Santosh mentioned in previous e-mails, it's not easy to simply
get rid of using MAX_NUMNODES:
1) we introduce new interface memblock_allocX
2) our interface uses memblock APIs __next_free_mem_range_rev()
and __next_free_mem_range()
3) __next_free_mem_range_rev() and __next_free_mem_range() use MAX_NUMNODES
4) __next_free_mem_range_rev() and __next_free_mem_range() are used standalone,
outside of our interface, as part of *for_each_free_mem_range* or for_each_mem_pfn_range ...

Point [4] leads to the necessity of finding and correcting all places where memblock APIs
are used and where MAX_NUMNODES is expected as an input parameter.
The major problem is that a simple "grep" will not work, because memblock API calls
are hidden inside other MM modules and it's not always clear
what will be passed as input parameters to the APIs of these MM modules
(for example, sparse_memory_present_with_active_regions() or sparse.c).
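
For illustration, the chain in [4] looks roughly like this (simplified
from include/linux/memblock.h and mm/nobootmem.c):

	/*
	 * Callers pass nid straight through to the core iterator, so
	 * grepping for MAX_NUMNODES in memblock.c alone misses them.
	 */
	#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid)	\
		for (i = 0,						\
		     __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \
		     i != (u64)ULLONG_MAX;				\
		     __next_free_mem_range(&i, nid, p_start, p_end, p_nid))

	/* e.g. in free_low_memory_core_early(), several layers up: */
	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
		count += __free_memory_core(start, end);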

As a result, the WIP patch I did, which was posted by Santosh, illustrates
the probable size and complexity of the change.

>
>> It kinda really bothers me this patchset is expanding the usage
>> of the wrong constant with only very far-out plan to fix that. All
>> archs converting to nobootmem will take a *long* time, that is, if
>> that happens at all. I don't really care about the order of things
>> happening but "this is gonna be fixed when everyone moves off
>> MAX_NUMNODES" really isn't good enough.

Sorry, but the question here is not "do or not to do?", but rather "how to do it?",
taking into account the complexity and state of the current MM code.
For example, would it be ok if I work around the issue as in the attached patch?

Thanks for any advice.

Regards,
- grygorii

---
include/linux/bootmem.h | 8 ++++----
mm/memblock.c | 25 ++++++++++++++++++-------
mm/percpu.c | 2 +-
3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 9e67fe4..84e778d 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -171,20 +171,20 @@ void __memblock_free_late(phys_addr_t base, phys_addr_t size);

#define memblock_virt_alloc(x) \
memblock_virt_alloc_try_nid(x, 0, BOOTMEM_LOW_LIMIT, \
- BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+ BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE)
#define memblock_virt_alloc_align(x, align) \
memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
- BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
+ BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE)
#define memblock_virt_alloc_nopanic(x) \
memblock_virt_alloc_try_nid_nopanic(x, 0, \
BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, \
- MAX_NUMNODES)
+ NUMA_NO_NODE)
#define memblock_virt_alloc_align_nopanic(x, align) \
memblock_virt_alloc_try_nid_nopanic(x, align, \
BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, \
- MAX_NUMNODES)
+ NUMA_NO_NODE)
#define memblock_virt_alloc_node(x, nid) \
memblock_virt_alloc_try_nid(x, 0, BOOTMEM_LOW_LIMIT, \
BOOTMEM_ALLOC_ACCESSIBLE, nid)
diff --git a/mm/memblock.c b/mm/memblock.c
index 1503300..cae02a1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -945,7 +945,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* The @min_addr limit is dropped if it can not be satisfied and the allocation
* will fall back to memory below @min_addr.
@@ -970,16 +970,27 @@ static void * __init _memblock_virt_alloc_try_nid_nopanic(
phys_addr_t alloc;
void *ptr;

+ /*
+ * TODO: this is WA as we should get NUMA_NO_NODE as input parameter
+ * to work with any node, but there are no guarantee that we always will
+ * Remove it once memblock core is converted to use NUMA_NO_NODE.
+ */
+ nid = (nid == MAX_NUMNODES) ? NUMA_NO_NODE : nid;
+
if (WARN_ON_ONCE(slab_is_available())) {
- if (nid == MAX_NUMNODES)
- return kzalloc(size, GFP_NOWAIT);
- else
- return kzalloc_node(size, GFP_NOWAIT, nid);
+ return kzalloc_node(size, GFP_NOWAIT, nid);
}

if (!align)
align = SMP_CACHE_BYTES;

+ /*
+ * TODO: this is WA as we get NUMA_NO_NODE as input parameter, but
+ * memblock core still uses MAX_NUMNODES.
+ * Remove it once memblock core is converted to use NUMA_NO_NODE.
+ */
+ nid = (nid == NUMA_NO_NODE) ? MAX_NUMNODES : nid;
+
/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);

@@ -1028,7 +1039,7 @@ error:
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
* additional debug information (including caller info), if enabled.
@@ -1056,7 +1067,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
* @max_addr: the upper bound of the memory region from where the allocation
* is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
* Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
* which provides debug information (including caller info), if enabled,
diff --git a/mm/percpu.c b/mm/percpu.c
index f74902c..55a798e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1853,7 +1853,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
return memblock_virt_alloc_try_nid_nopanic(size, align,
__pa(MAX_DMA_ADDRESS),
BOOTMEM_ALLOC_ACCESSIBLE,
- MAX_NUMNODES);
+ NUMA_NO_NODE);
}

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
--
1.7.9.5

Grygorii Strashko

unread,
Dec 5, 2013, 10:40:02 AM12/5/13
to
Hi Tejun,

On 12/04/2013 01:24 AM, Tejun Heo wrote:
> Hello,
>
> On Mon, Dec 02, 2013 at 09:27:23PM -0500, Santosh Shilimkar wrote:
>> So we add equivalent APIs so that we can replace usage of bootmem
>> with memblock interfaces. Architectures already converted to NO_BOOTMEM
>> use these new interfaces and other which still uses bootmem, these new
>> APIs just fallback to exiting bootmem APIs. So no functional change as
>> such.
>
> The last part of the second last sentence doesn't parse too well. I
> think it'd be worthwhile to improve and preferably expand on it as
> this is a bit tricky to understand given the twisted state of early
> memory allocation.
>
>> In long run, once all the achitectures moves to NO_BOOTMEM, we can get rid of
>> bootmem layer completely. This is one step to remove the core code dependency
>> with bootmem and also gives path for architectures to move away from bootmem.
>
> Lines too long?
>

[...]

>
>> +/* FIXME: Move to memblock.h at a point where we remove nobootmem.c */
>> +void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size,
>> + phys_addr_t align, phys_addr_t from,
>> + phys_addr_t max_addr, int nid);
>
> Wouldn't @min_addr instead of @from make more sense? Ditto for other
> occurrences.
>
>> +void *memblock_virt_alloc_try_nid(phys_addr_t size, phys_addr_t align,
>> + phys_addr_t from, phys_addr_t max_addr, int nid);
>> +void __memblock_free_early(phys_addr_t base, phys_addr_t size);
>> +void __memblock_free_late(phys_addr_t base, phys_addr_t size);
>> +
>> +#define memblock_virt_alloc(x) \
>> + memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
>> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
>
> The underlying function interprets 0 as the default align, so it
> probably is a better idea to just use 0 here.
>
>> +#define memblock_virt_alloc_align(x, align) \
>> + memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
>> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
>
> Also, do we really need this align variant separate when the caller
> can simply specify 0 for the default?

Unfortunately, yes.
We need it to keep compatibility with bootmem/nobootmem,
which don't handle 0 as a default align value.

>
>> +#define memblock_virt_alloc_nopanic(x) \
>> + memblock_virt_alloc_try_nid_nopanic(x, SMP_CACHE_BYTES, \
>> + BOOTMEM_LOW_LIMIT, \
>> + BOOTMEM_ALLOC_ACCESSIBLE, \
>> + MAX_NUMNODES)
>> +#define memblock_virt_alloc_align_nopanic(x, align) \
>> + memblock_virt_alloc_try_nid_nopanic(x, align, \
>> + BOOTMEM_LOW_LIMIT, \
>> + BOOTMEM_ALLOC_ACCESSIBLE, \
>> + MAX_NUMNODES)
>> +#define memblock_virt_alloc_node(x, nid) \
>> + memblock_virt_alloc_try_nid(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, \
>> + BOOTMEM_ALLOC_ACCESSIBLE, nid)
>> +#define memblock_virt_alloc_node_nopanic(x, nid) \

Regards,
- grygorii

Tejun Heo

unread,
Dec 5, 2013, 12:00:02 PM12/5/13
to
Hello,

On Thu, Dec 05, 2013 at 06:35:00PM +0200, Grygorii Strashko wrote:
> >> +#define memblock_virt_alloc_align(x, align) \
> >> + memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
> >> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
> >
> > Also, do we really need this align variant separate when the caller
> > can simply specify 0 for the default?
>
> Unfortunately, yes.
> We need it to keep compatibility with bootmem/nobootmem,
> which don't handle 0 as a default align value.

Hmm... why wouldn't just interpreting 0 as SMP_CACHE_BYTES in the
memblock_virt*() function work?

Thanks.

--
tejun

Tejun Heo

unread,
Dec 5, 2013, 12:00:03 PM12/5/13
to
Hello,

On Thu, Dec 05, 2013 at 03:12:30PM +0200, Grygorii Strashko wrote:
> I'll try to provide more technical details here.
> As Santosh mentioned in previous e-mails, it's not easy to simply
> get rid of using MAX_NUMNODES:
> 1) we introduce new interface memblock_allocX
> 2) our interface uses memblock APIs __next_free_mem_range_rev()
> and __next_free_mem_range()
> 3) __next_free_mem_range_rev() and __next_free_mem_range() use MAX_NUMNODES
> 4) __next_free_mem_range_rev() and __next_free_mem_range() are used standalone,
> outside of our interface, as part of *for_each_free_mem_range* or for_each_mem_pfn_range ...
>
> Point [4] leads to the necessity of finding and correcting all places where memblock APIs
> are used and where MAX_NUMNODES is expected as an input parameter.
> The major problem is that a simple "grep" will not work, because memblock API calls
> are hidden inside other MM modules and it's not always clear
> what will be passed as input parameters to the APIs of these MM modules
> (for example, sparse_memory_present_with_active_regions() or sparse.c).

Isn't that kinda trivial to work around? Make those functions accept
both MAX_NUMNODES and NUMA_NO_NODE but emit a warning on MAX_NUMNODES
(preferably throttled reasonably). Given the history of the API, we'd
probably want to keep such a warning for an extended period of time, but
that's what we'd need to do no matter what.
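
A minimal sketch of that kind of compatibility shim (the helper name and
placement are illustrative, not final):

	/* Accept both constants, but nag callers still passing the old one. */
	static int memblock_normalize_nid(int nid)
	{
		if (WARN_ONCE(nid == MAX_NUMNODES,
			      "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
			return NUMA_NO_NODE;
		return nid;
	}

Each public memblock entry point taking @nid would run its argument
through this before using it, so both constants keep working while
callers are migrated.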

> As a result, the WIP patch I did, which was posted by Santosh, illustrates
> the probable size and complexity of the change.

Again, I don't really mind the order things happen but I don't think
it's a good idea to spread misusage with a new API. You gotta deal
with it one way or the other.

> Sorry, but the question here is not "do or not to do?", but rather "how to do it?",
> taking into account the complexity and state of the current MM code.
> For example, would it be ok if I work around the issue as in the attached patch?

Well, it's more of when. It's not really a technically difficult
task and all I'm saying is it better be sooner than later.

Thanks.

--
tejun

Santosh Shilimkar

unread,
Dec 5, 2013, 12:20:02 PM12/5/13
to
On Thursday 05 December 2013 11:59 AM, Tejun Heo wrote:
> Hello,
>
> On Thu, Dec 05, 2013 at 03:12:30PM +0200, Grygorii Strashko wrote:
>> I'll try to provide more technical details here.
>> As Santosh mentioned in previous e-mails, it's not easy to simply
>> get rid of using MAX_NUMNODES:
>> 1) we introduce new interface memblock_allocX
>> 2) our interface uses memblock APIs __next_free_mem_range_rev()
>> and __next_free_mem_range()
>> 3) __next_free_mem_range_rev() and __next_free_mem_range() use MAX_NUMNODES
>> 4) __next_free_mem_range_rev() and __next_free_mem_range() are used standalone,
>> outside of our interface, as part of *for_each_free_mem_range* or for_each_mem_pfn_range ...
>>
>> Point [4] leads to the necessity of finding and correcting all places where memblock APIs
>> are used and where MAX_NUMNODES is expected as an input parameter.
>> The major problem is that a simple "grep" will not work, because memblock API calls
>> are hidden inside other MM modules and it's not always clear
>> what will be passed as input parameters to the APIs of these MM modules
>> (for example, sparse_memory_present_with_active_regions() or sparse.c).
>
> Isn't that kinda trivial to work around? Make those functions accept
> both MAX_NUMNODES and NUMA_NO_NODE but emit a warning on MAX_NUMNODES
> (preferably throttled reasonably). Given the history of the API, we'd
> probably want to keep such a warning for an extended period of time, but
> that's what we'd need to do no matter what.
>
Looks like a good idea.

>> As a result, the WIP patch I did, which was posted by Santosh, illustrates
>> the probable size and complexity of the change.
>
> Again, I don't really mind the order things happen but I don't think
> it's a good idea to spread misusage with a new API. You gotta deal
> with it one way or the other.
>
>> Sorry, but the question here is not "do or not to do?", but rather "how to do it?",
>> taking into account the complexity and state of the current MM code.
>> For example, would it be ok if I work around the issue as in the attached patch?
>
> Well, it's more of when. It's not really a technically difficult
> task and all I'm saying is it better be sooner than later.
>
Fair enough. Based on your suggestion, we will try to see if
we can proceed with 4) accepting both MAX_NUMNODES and NUMA_NO_NODE.

Thanks for the suggestion.

regards,
Santosh

Strashko, Grygorii

unread,
Dec 5, 2013, 1:50:02 PM12/5/13
to
Hi Tejun,

>On Thu, Dec 05, 2013 at 06:35:00PM +0200, Grygorii Strashko wrote:
>> >> +#define memblock_virt_alloc_align(x, align) \
>> >> + memblock_virt_alloc_try_nid(x, align, BOOTMEM_LOW_LIMIT, \
>> >> + BOOTMEM_ALLOC_ACCESSIBLE, MAX_NUMNODES)
>> >
>> > Also, do we really need this align variant separate when the caller
>> > can simply specify 0 for the default?
>>
>> Unfortunately Yes.
>> We need it to keep compatibility with bootmem/nobootmem
>> which don't handle 0 as default align value.
>
>Hmm... why wouldn't just interpreting 0 as SMP_CACHE_BYTES in the
>memblock_virt*() function work?
>

The problem is not with memblock_virt*(). The issue will happen when
memblock or nobootmem is disabled, in the code below (where memblock_virt*() is disabled).

+/* Fall back to all the existing bootmem APIs */
+#define memblock_virt_alloc(x) \
+ __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)

which will be transformed to
+/* Fall back to all the existing bootmem APIs */
+#define memblock_virt_alloc(x, align) \
+ __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)

and used as

memblock_virt_alloc(size, 0);

So, by default, the bootmem code will use 0 as the alignment instead of SMP_CACHE_BYTES,
and that is wrong.

Regards,
-grygorii

Tejun Heo

unread,
Dec 5, 2013, 2:00:02 PM12/5/13
to
Hey,

On Thu, Dec 05, 2013 at 06:48:21PM +0000, Strashko, Grygorii wrote:
> +/* Fall back to all the existing bootmem APIs */
> +#define memblock_virt_alloc(x) \
> + __alloc_bootmem(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>
> which will be transformed to
> +/* Fall back to all the existing bootmem APIs */
> +#define memblock_virt_alloc(x, align) \
> + __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>
> and used as
>
> memblock_virt_alloc(size, 0);
>
> So, by default, the bootmem code will use 0 as the alignment instead of SMP_CACHE_BYTES,
> and that is wrong.

Just translate it to SMP_CACHE_BYTES? Am I missing something here?
You're defining a new API which wraps around two interfaces. Wrap
them so that they show the same desired behavior?

--
tejun

Santosh Shilimkar

unread,
Dec 5, 2013, 3:40:01 PM12/5/13
to
Grygorii,
Looks like you didn't understand the suggestion completely.
The fallback inline will look like below:

static inline void *memblock_virt_alloc(phys_addr_t size, phys_addr_t align)
{
	if (align == 0)
		align = SMP_CACHE_BYTES;
	return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT);
}
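
(On the MEMBLOCK/NO_BOOTMEM side the same normalization, i.e.

	if (!align)
		align = SMP_CACHE_BYTES;

presumably already sits inside _memblock_virt_alloc_try_nid_nopanic(), as
in the WIP patch earlier in the thread, so both paths end up behaving the
same.)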

regards,
Santosh

Grygorii Strashko

unread,
Dec 6, 2013, 9:00:03 AM12/6/13
to
I understand. Thanks.

Regards,
-grygorii

Santosh Shilimkar

unread,
Dec 9, 2013, 5:00:01 PM12/9/13
to
Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior compared to the
current code from the bootmem users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. And for the
archs which still use bootmem, these new APIs just fall back to the existing
bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
drivers/firmware/memmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index e2e04b0..17cf96c 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -324,7 +324,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
{
struct firmware_map_entry *entry;

- entry = alloc_bootmem(sizeof(struct firmware_map_entry));
+ entry = memblock_virt_alloc(sizeof(struct firmware_map_entry), 0);
if (WARN_ON(!entry))
return -ENOMEM;

--
1.7.9.5

Santosh Shilimkar

unread,
Dec 9, 2013, 5:00:01 PM12/9/13
to
From: Grygorii Strashko <grygorii...@ti.com>

Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior compared to the
current code from the bootmem users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. And for the
archs which still use bootmem, these new APIs just fall back to the existing
bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezaw...@jp.fujitsu.com>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/page_cgroup.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6d757e3a..d8bd2c5 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -54,8 +54,9 @@ static int __init alloc_node_page_cgroup(int nid)

table_size = sizeof(struct page_cgroup) * nr_pages;

- base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ base = memblock_virt_alloc_try_nid_nopanic(
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;

Santosh Shilimkar

unread,
Dec 9, 2013, 5:00:01 PM12/9/13
to
Switch to memblock interfaces for the early memory allocator instead of
the bootmem allocator. No functional change in behavior compared to the
current code from the bootmem users' point of view.

Archs already converted to NO_BOOTMEM now directly use memblock
interfaces instead of bootmem wrappers built on top of memblock. And for the
archs which still use bootmem, these new APIs just fall back to the existing
bootmem APIs.

Cc: Yinghai Lu <yin...@kernel.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Andrew Morton <ak...@linux-foundation.org>
Signed-off-by: Grygorii Strashko <grygorii...@ti.com>
Signed-off-by: Santosh Shilimkar <santosh....@ti.com>
---
mm/page_alloc.c | 27 +++++++++++++++------------
1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f0..ed64107 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4210,7 +4210,6 @@ static noinline __init_refok
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
- struct pglist_data *pgdat = zone->zone_pgdat;
size_t alloc_size;

/*
@@ -4226,7 +4225,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)

if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node_nopanic(pgdat, alloc_size);
+ memblock_virt_alloc_node_nopanic(
+ alloc_size, zone->zone_pgdat->node_id);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4346,13 +4346,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
#endif

/**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
*/
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
{
@@ -4364,9 +4365,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
end_pfn = min(end_pfn, max_low_pfn);

if (start_pfn < end_pfn)
- free_bootmem_node(NODE_DATA(this_nid),
- PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT);
+ memblock_free_early_nid(PFN_PHYS(start_pfn),
+ (end_pfn - start_pfn) << PAGE_SHIFT,
+ this_nid);
}
}

@@ -4637,8 +4638,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
- usemapsize);
+ zone->pageblock_flags =
+ memblock_virt_alloc_node_nopanic(usemapsize,
+ pgdat->node_id);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4832,7 +4834,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node_nopanic(pgdat, size);
+ map = memblock_virt_alloc_node_nopanic(size,
+ pgdat->node_id);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5858,7 +5861,7 @@ void *__init alloc_large_system_hash(const char *tablename,
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
- table = alloc_bootmem_nopanic(size);
+ table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
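
In summary, the bootmem-to-memblock mapping applied in this patch is
roughly the following (a sketch of the conversions above; the node-aware
variants take an explicit nid instead of a pgdat pointer):

	alloc_bootmem_nopanic(size)
		=> memblock_virt_alloc_nopanic(size, 0)
	alloc_bootmem_node_nopanic(pgdat, size)
		=> memblock_virt_alloc_node_nopanic(size, pgdat->node_id)
	free_bootmem_node(pgdat, base, size)
		=> memblock_free_early_nid(base, size, pgdat->node_id)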

Andrew Morton

Jan 10, 2014, 8:00:02 PM

So where do we now stand with this MAX_NUMNODES-vs-NUMA_NO_NODE mess?
Is the conversion to NUMA_NO_NODE in current linux-next completed and
nicely tested?

Thanks.

Santosh Shilimkar

Jan 10, 2014, 8:10:02 PM

From all the reports so far, there were actually only 3 places in the
x86 code using MAX_NUMNODES, and the fix for those is already in your
queue. So I guess we are good on that aspect now.

Regards,
Santosh
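
For anyone skimming this thread: the convention settled on is that
callers with no node preference pass NUMA_NO_NODE rather than
MAX_NUMNODES. An illustrative call, not from the series ('ptr' and
'size' are hypothetical):

	/* No node preference: pass NUMA_NO_NODE, not MAX_NUMNODES. */
	ptr = memblock_virt_alloc_try_nid_nopanic(size, SMP_CACHE_BYTES, 0,
				BOOTMEM_ALLOC_ACCESSIBLE, NUMA_NO_NODE);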