[PATCH 0/6] drmgr: Add NUMA based CPU removal support

1 view
Skip to first unread message

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:20 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
The current process removes CPUs starting from the last one in the
CPU dr_info list, which may result in more CPU-less NUMA nodes
that still have large amounts of memory.

This patch series adds a NUMA based CPU removal process which
removes CPUs from specific NUMA nodes based on the memory per CPU,
called the node ratio. A CPU is selected from the node which has
the lowest node ratio. This process is repeated to select nodes
for all of the requested CPUs.

If the NUMA topology is not enabled on the system, fall back
to the current process.

drmgr-Move-numa_topology-code-to-common_numa.c.patch: The
build NUMA topology code is also needed for CPU removal. So
move this code to common_numa.c.
drmgr-Move-read-lmb-size-property-code-to-common_ofd.patch:
Read LMB size property is also used for CPU removal. Add
get_dynamic_lmb_size() in common_ofdt.c
drmgr-Add-get_next_cpu-to-identify-the-removable-CPU.patch:
Move the current code to find the next removal CPU as a
separate function. It helps to add NUMA based next removal
CPU code.
drmgr-Allocate-CPU-bitmap-for-each-NUMA-node.patch: The
current code uses CPU bitmap to find the number of CPUs
for each NUMA node (for NUMA memory removal) but does not
retain for each node. In the case of CPU removal, need to
save CPU bitmap for each node which is used to determine
whether CPU belongs to a specific node.
drmgr-Add-NUMA-configuration-update-for-CPU-remove.patch:
Changes to add NUMA configuration values, such as ncpus and
node memory size, needed specifically for CPU removal.
drmgr-Add-NUMA-based-CPU-removal.patch: NUMA based
CPU removal process.

Haren Myneni (6):
drmgr: Move numa_topology code to common_numa.c
drmgr: Move read lmb-size property code to common_ofdt.c
drmgr: Add get_next_cpu() to identify the removable CPU
drmgr: Allocate CPU bitmap for each NUMA node
drmgr: Add NUMA configuration update for CPU remove
drmgr: Add NUMA based CPU removal

src/drmgr/common_numa.c | 60 +++++++++-
src/drmgr/common_numa.h | 9 +-
src/drmgr/common_ofdt.c | 17 +++
src/drmgr/drslot_chrp_cpu.c | 222 ++++++++++++++++++++++++++++++++++--
src/drmgr/drslot_chrp_mem.c | 45 ++------
src/drmgr/ofdt.h | 1 +
6 files changed, 298 insertions(+), 56 deletions(-)

--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:23 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
get_dynamic_lmb_size() is used to get lmb size from "ibm,lmb-size"
property. This lmb size is needed to determine the number of LMBs
and used to find NUMA node ratio for NUMA aware memory and CPU
removal code. So move this function to common_ofdt.c

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/common_ofdt.c | 17 +++++++++++++++++
src/drmgr/drslot_chrp_mem.c | 11 ++---------
src/drmgr/ofdt.h | 1 +
3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/drmgr/common_ofdt.c b/src/drmgr/common_ofdt.c
index 1e5fe53..0655559 100644
--- a/src/drmgr/common_ofdt.c
+++ b/src/drmgr/common_ofdt.c
@@ -28,6 +28,7 @@
#include <errno.h>
#include "dr.h"
#include "ofdt.h"
+#include "drmem.h"

#define RTAS_DIRECTORY "/proc/device-tree/rtas"
#define CHOSEN_DIRECTORY "/proc/device-tree/chosen"
@@ -932,3 +933,19 @@ int of_associativity_to_node(const char *dir, int min_common_depth)
return be32toh(prop[min_common_depth]);
}

+int get_dynamic_lmb_size(uint64_t *lmb_sz)
+{
+ int rc = 0;
+ uint64_t sz;
+
+ rc = get_property(DYNAMIC_RECONFIG_MEM, "ibm,lmb-size",
+ &sz, sizeof(sz));
+ if (rc) {
+ say(DEBUG, "Could not retrieve drconf LMB size\n");
+ return rc;
+ }
+
+ /* convert for LE systems */
+ *lmb_sz = be64toh(sz);
+ return 0;
+}
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index eb75ccf..2d22bff 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -506,16 +506,9 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list)
uint64_t lmb_sz;
int rc = 0;

- rc = get_property(DYNAMIC_RECONFIG_MEM, "ibm,lmb-size",
- &lmb_sz, sizeof(lmb_sz));
-
- /* convert for LE systems */
- lmb_sz = be64toh(lmb_sz);
-
- if (rc) {
- say(DEBUG, "Could not retrieve drconf LMB size\n");
+ rc = get_dynamic_lmb_size(&lmb_sz);
+ if (rc)
return rc;
- }

if (stat(DYNAMIC_RECONFIG_MEM_V1, &sbuf) == 0) {
rc = get_dynamic_reconfig_lmbs_v1(lmb_sz, lmb_list);
diff --git a/src/drmgr/ofdt.h b/src/drmgr/ofdt.h
index e9ebd03..c79ed65 100644
--- a/src/drmgr/ofdt.h
+++ b/src/drmgr/ofdt.h
@@ -185,6 +185,7 @@ int get_assoc_arrays(const char *dir, struct assoc_arrays *aa,
int min_common_depth);
int of_associativity_to_node(const char *dir, int min_common_depth);
int init_node(struct dr_node *);
+int get_dynamic_lmb_size(uint64_t *lmb_sz);

static inline int aa_index_to_node(struct assoc_arrays *aa, uint32_t aa_index)
{
--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:24 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
Move build_numa_topology and the sort NUMA node ratio list code to
common_numa.c. These functions will also be used for NUMA aware
CPU removal in a later patch.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/common_numa.c | 37 ++++++++++++++++++++++++++++++++++++-
src/drmgr/common_numa.h | 6 +++++-
src/drmgr/drslot_chrp_mem.c | 29 ++---------------------------
3 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
index 898aab6..6bc2ea8 100644
--- a/src/drmgr/common_numa.c
+++ b/src/drmgr/common_numa.c
@@ -27,6 +27,9 @@
#include "drmem.h" /* for DYNAMIC_RECONFIG_MEM */
#include "common_numa.h"

+int numa_enabled = 0;
+struct ppcnuma_topology numa;
+
struct ppcnuma_node *ppcnuma_fetch_node(struct ppcnuma_topology *numa, int nid)
{
struct ppcnuma_node *node;
@@ -118,7 +121,7 @@ static int read_numa_topology(struct ppcnuma_topology *numa)
return rc;
}

-int ppcnuma_get_topology(struct ppcnuma_topology *numa)
+static int ppcnuma_get_topology(struct ppcnuma_topology *numa)
{
int rc;

@@ -145,3 +148,35 @@ int ppcnuma_get_topology(struct ppcnuma_topology *numa)

return 0;
}
+
+void build_numa_topology(void)
+{
+ int rc;
+
+ rc = ppcnuma_get_topology(&numa);
+ if (rc)
+ return;
+
+ numa_enabled = 1;
+}
+
+void order_numa_node_ratio_list(void)
+{
+ int nid;
+ struct ppcnuma_node *node, *n, **p;
+
+ numa.ratio = NULL;
+
+ /* Create an ordered link of the nodes */
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ p = &numa.ratio;
+ for (n = numa.ratio;
+ n && n->ratio < node->ratio; n = n->ratio_next)
+ p = &n->ratio_next;
+ *p = node;
+ node->ratio_next = n;
+ }
+}
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index c209a3e..2b0901e 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -44,7 +44,11 @@ struct ppcnuma_topology {
struct assoc_arrays aa;
};

-int ppcnuma_get_topology(struct ppcnuma_topology *numa);
+extern int numa_enabled;
+extern struct ppcnuma_topology numa;
+void build_numa_topology(void);
+void order_numa_node_ratio_list(void);
+
struct ppcnuma_node *ppcnuma_fetch_node(struct ppcnuma_topology *numa,
int node_id);

diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 4a36c73..eb75ccf 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -38,9 +38,6 @@ static char *state_strs[] = {"offline", "online"};

static char *usagestr = "-c mem {-a | -r} {-q <quantity> -p {variable_weight | ent_capacity} | {-q <quantity> | -s [<drc_name> | <drc_index>]}}";

-static struct ppcnuma_topology numa;
-static int numa_enabled = 0;
-
/**
* mem_usage
* @brief return usage string
@@ -1605,7 +1602,7 @@ static int remove_cpuless_lmbs(uint32_t count)
static void update_node_ratio(void)
{
int nid;
- struct ppcnuma_node *node, *n, **p;
+ struct ppcnuma_node *node;
uint32_t cpu_ratio, mem_ratio;

/*
@@ -1626,18 +1623,7 @@ static void update_node_ratio(void)
node->ratio = (cpu_ratio * 9 + mem_ratio) / 10;
}

- /* Create an ordered link of the nodes */
- ppcnuma_foreach_node(&numa, nid, node) {
- if (!node->n_lmbs || !node->n_cpus)
- continue;
-
- p = &numa.ratio;
- for (n = numa.ratio;
- n && n->ratio < node->ratio; n = n->ratio_next)
- p = &n->ratio_next;
- *p = node;
- node->ratio_next = n;
- }
+ order_numa_node_ratio_list();
}

/*
@@ -1693,17 +1679,6 @@ static int remove_cpu_lmbs(uint32_t count)
return done;
}

-static void build_numa_topology(void)
-{
- int rc;
-
- rc = ppcnuma_get_topology(&numa);
- if (rc)
- return;
-
- numa_enabled = 1;
-}
-
static void clear_numa_lmb_links(void)
{
int nid;
--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:26 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
Move the code which identifies the removable CPU to get_next_cpu().
This function is used only for the current non-NUMA based CPU
removal, but it helps to add the NUMA based CPU removal code in a
later patch.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/drslot_chrp_cpu.c | 34 ++++++++++++++++++++++++----------
1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index 3ef24f4..6a21663 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -160,11 +160,33 @@ static struct dr_node *get_available_cpu_by_index(struct dr_info *dr_info)
return cpu;
}

+/*
+ * Scan all CPUs from the last one for the next available CPU.
+ * Used only for non-NUMA based CPU removal.
+ */
+static struct dr_node *get_next_cpu(struct dr_info *dr_info)
+{
+ struct dr_node *cpu = NULL;
+ struct thread *t;
+
+ /* Find the first cpu with an online thread */
+ for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
+ if (cpu->unusable)
+ continue;
+
+ for (t = cpu->cpu_threads; t; t = t->next) {
+ if (get_thread_state(t) == ONLINE)
+ return cpu;
+ }
+ }
+
+ return NULL;
+}
+
static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)
{
struct dr_node *cpu = NULL;
struct dr_node *survivor = NULL;
- struct thread *t;

if (usr_action == ADD) {
for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
@@ -177,15 +199,7 @@ static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)
cpu = survivor;
} else if (usr_action == REMOVE) {
/* Find the first cpu with an online thread */
- for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
- if (cpu->unusable)
- continue;
-
- for (t = cpu->cpu_threads; t; t = t->next) {
- if (get_thread_state(t) == ONLINE)
- return cpu;
- }
- }
+ cpu = get_next_cpu(dr_info);
}

if (!cpu)
--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:27 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
This patch adds a NUMA node config update for CPU removal. It updates
the number of LMBs and CPUs for each node and also calculates the
total number of CPUs from memory-less nodes. This node configuration
is used to identify the node, based on the node ratio, from which
the CPU is selected for removal.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/common_numa.h | 1 +
src/drmgr/drslot_chrp_cpu.c | 47 +++++++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+)

diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index 7aea026..edf4349 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -38,6 +38,7 @@ struct ppcnuma_topology {
unsigned int lmb_count;
unsigned int cpuless_node_count;
unsigned int cpuless_lmb_count;
+ unsigned int memless_cpu_count;
unsigned int node_count, node_min, node_max;
struct ppcnuma_node *nodes[MAX_NUMNODES];
struct ppcnuma_node *ratio;
diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index 6a21663..89a346a 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -26,10 +26,12 @@
#include <sys/types.h>
#include <dirent.h>
#include <librtas.h>
+#include <numa.h>
#include "dr.h"
#include "drcpu.h"
#include "drpci.h"
#include "ofdt.h"
+#include "common_numa.h"

struct cpu_operation;
typedef int (cpu_op_func_t) (void);
@@ -395,6 +397,39 @@ static int smt_threads_func(struct dr_info *dr_info)
return rc;
}

+/*
+ * Per node CPUs are defined as part of build_numa_topology().
+ * This function calculates number of LMBs per node based on
+ * node memory / lmb-size.
+ * n_cpus and n_lmbs are used to determine node ratio.
+ */
+static int cpu_update_numa_config(void)
+{
+ struct ppcnuma_node *node;
+ unsigned long long node_size;
+ int rc, nid;
+ uint64_t lmb_sz;
+
+ rc = get_dynamic_lmb_size(&lmb_sz);
+ /* Use the default value if lmb-size property is not available */
+ if (rc)
+ lmb_sz = 0x10000000;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ node_size = numa_node_size(nid, 0);
+ /*
+ * Node has memory
+ * n_lmbs = Total memory / lmb-size
+ */
+ if (node_size) {
+ node->n_lmbs = node_size / lmb_sz;
+ } else
+ numa.memless_cpu_count += node->n_cpus;
+ }
+
+ return 0;
+}
+
int valid_cpu_options(void)
{
/* default to a quantity of 1 */
@@ -442,6 +477,15 @@ int drslot_chrp_cpu(void)
return -1;
}

+ /*
+ * Maintain NUMA aware hotplug only for remove and with count request.
+ */
+ if (usr_drc_count && (usr_action == REMOVE)) {
+ build_numa_topology();
+ if (numa_enabled)
+ cpu_update_numa_config();
+ }
+
/* If a user specifies a drc name, the quantity to add/remove is
* one. Enforce that here so the loops in add/remove code behave
* accordingly.
@@ -473,6 +517,9 @@ int drslot_chrp_cpu(void)
if (usr_action == ADD || usr_action == REMOVE)
run_hooks(DRC_TYPE_CPU, usr_action, HOOK_POST, count);

+ if ((usr_action == REMOVE) && numa_enabled)
+ free_numa_topology();
+
free_cpu_drc_info(&dr_info);
return rc;
}
--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:27 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
The current code allocates one bitmap and uses it to determine
number of valid CPUs for each NUMA node and then frees the bitmap.
The NUMA based CPU removal needs this bitmap per node to determine
the valid CPU in that node. So this patch retains bitmap per node
and frees it after DLPAR memory / CPU removal.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/common_numa.c | 23 ++++++++++++++++++-----
src/drmgr/common_numa.h | 2 ++
src/drmgr/drslot_chrp_mem.c | 5 +++--
3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
index 6bc2ea8..e836364 100644
--- a/src/drmgr/common_numa.c
+++ b/src/drmgr/common_numa.c
@@ -84,9 +84,6 @@ static int read_numa_topology(struct ppcnuma_topology *numa)

rc = 0;

- /* In case of allocation error, the libnuma is calling exit() */
- cpus = numa_allocate_cpumask();
-
for (nid = 0; nid <= max_node; nid++) {

if (!numa_bitmask_isbitset(numa_nodes_ptr, nid))
@@ -98,6 +95,9 @@ static int read_numa_topology(struct ppcnuma_topology *numa)
break;
}

+ /* In case of allocation error, the libnuma is calling exit() */
+ cpus = numa_allocate_cpumask();
+
rc = numa_node_to_cpus(nid, cpus);
if (rc < 0)
break;
@@ -107,11 +107,10 @@ static int read_numa_topology(struct ppcnuma_topology *numa)
if (numa_bitmask_isbitset(cpus, i))
node->n_cpus++;

+ node->cpus = cpus;
numa->cpu_count += node->n_cpus;
}

- numa_bitmask_free(cpus);
-
if (rc) {
ppcnuma_foreach_node(numa, nid, node)
node->n_cpus = 0;
@@ -160,6 +159,20 @@ void build_numa_topology(void)
numa_enabled = 1;
}

+void free_numa_topology(void)
+{
+ struct ppcnuma_node *node;
+ int i;
+
+ for (i=0; i < numa.node_count; i++) {
+ node = numa.nodes[i];
+ if (node) {
+ numa_bitmask_free(node->cpus);
+ free(node);
+ }
+ }
+}
+
void order_numa_node_ratio_list(void)
{
int nid;
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index 2b0901e..7aea026 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -30,6 +30,7 @@ struct ppcnuma_node {
unsigned int ratio;
struct dr_node *lmbs; /* linked by lmb_numa_next */
struct ppcnuma_node *ratio_next;
+ struct bitmask *cpus;
};

struct ppcnuma_topology {
@@ -48,6 +49,7 @@ extern int numa_enabled;
extern struct ppcnuma_topology numa;
void build_numa_topology(void);
void order_numa_node_ratio_list(void);
+void free_numa_topology(void);

struct ppcnuma_node *ppcnuma_fetch_node(struct ppcnuma_topology *numa,
int node_id);
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 2d22bff..fe04ad1 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -1731,9 +1731,10 @@ int do_mem_kernel_dlpar(void)
if (usr_action == REMOVE && usr_drc_count && !usr_drc_index) {
build_numa_topology();
if (numa_enabled) {
- if (!numa_based_remove(usr_drc_count))
+ rc = numa_based_remove(usr_drc_count);
+ free_numa_topology();
+ if (!rc)
return 0;
-
/*
* If the NUMA based removal failed, lets try the legacy
* way.
--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 4, 2026, 5:29:30 PM (10 days ago) Apr 4
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
The current CPU removal process reads the CPU list from the last CPU
and removes CPUs based on the userspace request. This process can
result in CPU-less NUMA nodes even though these nodes have more
memory, which can affect system performance.

This patch adds NUMA aware CPU removal process to remove CPUs from
specific NUMA nodes and maintains NUMA balance. The selection of
node from which the CPU to be removed is based on the available
memory per CPU in that node, called the node ratio. So the CPU is
selected from the node which has the lowest ratio.

If the NUMA topology can't be read, fall back to the current
process.

The node selection process is as follows:
- For each CPU removal request, update node ratios and sort the list.
- Select the next removable CPU from the dr_info CPU list and it
should belong to the first node.
- CPU associated to memory less nodes is considered first and then
the first node that has memory in the list.
- Repeat all CPUs in dr_info list until the next removable CPU is
matched with node CPU bitmap.
- The total number of CPU threads in the selected node is
decremented and cleared in the node CPU bitmap.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/drslot_chrp_cpu.c | 143 +++++++++++++++++++++++++++++++++++-
1 file changed, 141 insertions(+), 2 deletions(-)

diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index 89a346a..6002bbf 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -162,6 +162,141 @@ static struct dr_node *get_available_cpu_by_index(struct dr_info *dr_info)
return cpu;
}

+/*
+ * Return node if CPU ID matches in node CPU bitmap.
+ */
+static struct ppcnuma_node *match_cpu_node(struct ppcnuma_node *node,
+ struct dr_node *cpu)
+{
+ int nid;
+
+ if (cpu->cpu_threads) {
+ nid = numa_node_of_cpu(cpu->cpu_threads->id);
+ if (nid == node->node_id) {
+ if (numa_bitmask_isbitset(node->cpus,
+ cpu->cpu_threads->id))
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Return node if CPU belongs to any memoryless NUMA node.
+ */
+static struct ppcnuma_node *find_cpu_memless_node(struct dr_node *cpu)
+{
+ struct ppcnuma_node *node = NULL;
+ int nid;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (node->n_lmbs)
+ continue;
+
+ if (match_cpu_node(node, cpu))
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * The node list is sorted by node ratio (less memory per CPU).
+ * So consider the first node
+ * Return node if CPU belongs to the first NUMA node which
+ * has memory.
+ */
+static struct ppcnuma_node *find_cpu_numa_node(struct dr_node *cpu)
+{
+ struct ppcnuma_node *node = NULL;
+ int found = 0;
+
+ ppcnuma_foreach_node_by_ratio(&numa, node) {
+ if (node->n_cpus && node->n_lmbs) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found && match_cpu_node(node, cpu))
+ return node;
+
+ return NULL;
+}
+
+/*
+ * Calculate node ratio based on amount of memory per CPU and sort
+ * the node ratio list.
+ */
+static void cpu_update_node_ratio(void)
+{
+ struct ppcnuma_node *node;
+ int nid;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ /*
+ * Node ratio = n_lmbs per CPU
+ */
+ node->ratio = (node->n_lmbs * 100) / node->n_cpus;
+ }
+
+ order_numa_node_ratio_list();
+}
+
+/*
+ * Scan CPUs from the last one in the list and select the first CPU
+ * based on:
+ * - CPU from memory less node
+ * - If no CPUs are available in memory less nodes, CPU belongs to
+ * the first node from node ratio list.
+ */
+static struct dr_node *numa_get_next_cpu(struct dr_info *dr_info)
+{
+ struct ppcnuma_node *node;
+ struct dr_node *cpu = NULL;
+ struct thread *t;
+ int i, found = 0;
+
+ /*
+ * Update node ratio for each CPU removal request
+ */
+ cpu_update_node_ratio();
+
+ /* Find the first cpu with an online thread */
+ for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
+ if (cpu->unusable)
+ continue;
+
+ if (numa.memless_cpu_count)
+ node = find_cpu_memless_node(cpu);
+ else
+ node = find_cpu_numa_node(cpu);
+
+ if (!node)
+ continue;
+
+ t = cpu->cpu_threads;
+ for (i = 0; i < cpu->cpu_nthreads && t; i++, t = t->next) {
+ if (get_thread_state(t) == ONLINE)
+ found = 1;
+ numa_bitmask_clearbit(node->cpus, t->id);
+ }
+ if (found) {
+ node->n_cpus -= cpu->cpu_nthreads;
+ numa.cpu_count -= cpu->cpu_nthreads;
+ if (!node->n_lmbs)
+ numa.memless_cpu_count -= cpu->cpu_nthreads;
+ return cpu;
+ }
+ }
+
+ return NULL;
+}
+
/*
* Scan all CPUs from the last one for the next available CPU.
* Used only for non-NUMA based CPU removal.
@@ -200,8 +335,12 @@ static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)

cpu = survivor;
} else if (usr_action == REMOVE) {
- /* Find the first cpu with an online thread */
- cpu = get_next_cpu(dr_info);
+ if (numa_enabled)
+ /* Find the first CPU from NUMA nodes */
+ cpu = numa_get_next_cpu(dr_info);
+ else
+ /* Find the first cpu with an online thread */

Dave Marquardt

<davemarq@linux.ibm.com>
unread,
Apr 6, 2026, 11:16:08 AM (9 days ago) Apr 6
to Haren Myneni, powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com
If numa_node_to_cpus() returns a negative number, do you have a memory
leak?

-Dave

Dave Marquardt

<davemarq@linux.ibm.com>
unread,
Apr 6, 2026, 4:27:45 PM (8 days ago) Apr 6
to Haren Myneni, powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com
Is there a #define for this somewhere? If not, can we create one? Or
initialize something from PHYP?

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 6, 2026, 5:56:35 PM (8 days ago) Apr 6
to Dave Marquardt, powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, mmc@linux.ibm.com
I can add macro definition, Got this value from lmb-size property and
can use in case if LPAR does not have this property (Generally older
FWs). For CPU removal case, we need just memory size per CPU. So does
not matter if the system has this property. 

Thanks
Haren

>

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 6, 2026, 6:01:52 PM (8 days ago) Apr 6
to Dave Marquardt, powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, mmc@linux.ibm.com
Thanks for your comments.

Generally we should see numa_node_to_cpus() failure (from librtas) if
CPUs bitmap is not enough or no NUMA nodes configured which should not
happen. But will move numa_allocate_cpumask() later so that the code is
easy to understand.

Thanks
Haren

>
> -Dave

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:12 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
Move build_numa_topology and the sort NUMA node ratio list code to
common_numa.c. These functions will also be used for NUMA aware
CPU removal in a later patch.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/common_numa.c | 37 ++++++++++++++++++++++++++++++++++++-
src/drmgr/common_numa.h | 6 +++++-
src/drmgr/drslot_chrp_mem.c | 29 ++---------------------------
3 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
index 898aab6..6bc2ea8 100644
--- a/src/drmgr/common_numa.c
+++ b/src/drmgr/common_numa.c
+{
+ int nid;
+ struct ppcnuma_node *node, *n, **p;
+
+ numa.ratio = NULL;
+
+ /* Create an ordered link of the nodes */
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ p = &numa.ratio;
+ for (n = numa.ratio;
+ n && n->ratio < node->ratio; n = n->ratio_next)
+ p = &n->ratio_next;
+ *p = node;
+ node->ratio_next = n;
+ }
+}
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index c209a3e..2b0901e 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -44,7 +44,11 @@ struct ppcnuma_topology {
struct assoc_arrays aa;
};

-int ppcnuma_get_topology(struct ppcnuma_topology *numa);
+extern int numa_enabled;
+extern struct ppcnuma_topology numa;
+void build_numa_topology(void);
+void order_numa_node_ratio_list(void);
+
struct ppcnuma_node *ppcnuma_fetch_node(struct ppcnuma_topology *numa,
int node_id);

diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 4a36c73..eb75ccf 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:13 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
Changelog:
v2:
The following changes are added based on comments from Dave Marquardt
- Free node CPU bitmap when numa_node_to_cpus() returns failure
- Define DEFAULT_LMB_SIZE macro

Haren Myneni (6):
drmgr: Move numa_topology code to common_numa.c
drmgr: Move read lmb-size property code to common_ofdt.c
drmgr: Add get_next_cpu() to identify the removable CPU
drmgr: Allocate CPU bitmap for each NUMA node
drmgr: Add NUMA configuration update for CPU remove
drmgr: Add NUMA based CPU removal

src/drmgr/common_numa.c | 65 ++++++++--
src/drmgr/common_numa.h | 9 +-
src/drmgr/common_ofdt.c | 17 +++
src/drmgr/drslot_chrp_cpu.c | 228 ++++++++++++++++++++++++++++++++++--
src/drmgr/drslot_chrp_mem.c | 45 ++-----
src/drmgr/ofdt.h | 1 +
6 files changed, 308 insertions(+), 57 deletions(-)

--
2.50.1

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:14 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
get_dynamic_lmb_size() is used to get lmb size from "ibm,lmb-size"
property. This lmb size is needed to determine the number of LMBs
and used to find NUMA node ratio for NUMA aware memory and CPU
removal code. So move this function to common_ofdt.c

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index eb75ccf..2d22bff 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c
@@ -506,16 +506,9 @@ get_dynamic_reconfig_lmbs(struct lmb_list_head *lmb_list)
uint64_t lmb_sz;
int rc = 0;

- rc = get_property(DYNAMIC_RECONFIG_MEM, "ibm,lmb-size",
- &lmb_sz, sizeof(lmb_sz));
-
- /* convert for LE systems */
- lmb_sz = be64toh(lmb_sz);
-
- if (rc) {
- say(DEBUG, "Could not retrieve drconf LMB size\n");
+ rc = get_dynamic_lmb_size(&lmb_sz);

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:16 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
Move the code which identifies the removable CPU to get_next_cpu().
This function is used only for the current non-NUMA based CPU
removal, but it helps to add the NUMA based CPU removal code in a
later patch.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/drslot_chrp_cpu.c | 34 ++++++++++++++++++++++++----------
1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index 3ef24f4..6a21663 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -160,11 +160,33 @@ static struct dr_node *get_available_cpu_by_index(struct dr_info *dr_info)
return cpu;
}

+/*
+ * Scan all CPUs from the last one for the next available CPU.
+ * Used only for non-NUMA based CPU removal.
+ */
+static struct dr_node *get_next_cpu(struct dr_info *dr_info)
+{
+ struct dr_node *cpu = NULL;
+ struct thread *t;
+
+ /* Find the first cpu with an online thread */
+ for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
+ if (cpu->unusable)
+ continue;
+
+ for (t = cpu->cpu_threads; t; t = t->next) {
+ if (get_thread_state(t) == ONLINE)
+ return cpu;
+ }
+ }
+
+ return NULL;
+}
+
static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)
{
struct dr_node *cpu = NULL;
struct dr_node *survivor = NULL;
- struct thread *t;

if (usr_action == ADD) {
for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
@@ -177,15 +199,7 @@ static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)
cpu = survivor;
} else if (usr_action == REMOVE) {
/* Find the first cpu with an online thread */
- for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
- if (cpu->unusable)
- continue;
-
- for (t = cpu->cpu_threads; t; t = t->next) {
- if (get_thread_state(t) == ONLINE)
- return cpu;
- }
- }

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:17 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
The current code allocates one bitmap and uses it to determine
number of valid CPUs for each NUMA node and then frees the bitmap.
The NUMA based CPU removal needs this bitmap per node to determine
the valid CPU in that node. So this patch retains bitmap per node
and frees it after DLPAR memory / CPU removal.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
Reviewed-by: Dave Marquardt <dave...@linux.ibm.com>
---
src/drmgr/common_numa.c | 28 ++++++++++++++++++++++------
src/drmgr/common_numa.h | 2 ++
src/drmgr/drslot_chrp_mem.c | 5 +++--
3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/src/drmgr/common_numa.c b/src/drmgr/common_numa.c
index 6bc2ea8..edf693d 100644
--- a/src/drmgr/common_numa.c
+++ b/src/drmgr/common_numa.c
@@ -84,9 +84,6 @@ static int read_numa_topology(struct ppcnuma_topology *numa)

rc = 0;

- /* In case of allocation error, the libnuma is calling exit() */
- cpus = numa_allocate_cpumask();
-
for (nid = 0; nid <= max_node; nid++) {

if (!numa_bitmask_isbitset(numa_nodes_ptr, nid))
@@ -98,20 +95,24 @@ static int read_numa_topology(struct ppcnuma_topology *numa)
break;
}

+ /* In case of allocation error, the libnuma is calling exit() */
+ cpus = numa_allocate_cpumask();
+
rc = numa_node_to_cpus(nid, cpus);
- if (rc < 0)
+ if (rc < 0) {
+ numa_bitmask_free(cpus);
break;
+ }

/* Count the CPUs in that node */
for (i = 0; i < cpus->size; i++)
if (numa_bitmask_isbitset(cpus, i))
node->n_cpus++;

+ node->cpus = cpus;
numa->cpu_count += node->n_cpus;
}

- numa_bitmask_free(cpus);
-
if (rc) {
ppcnuma_foreach_node(numa, nid, node)
node->n_cpus = 0;
@@ -160,6 +161,21 @@ void build_numa_topology(void)
numa_enabled = 1;
}

+void free_numa_topology(void)
+{
+ struct ppcnuma_node *node;
+ int i;
+
+ for (i=0; i < numa.node_count; i++) {
+ node = numa.nodes[i];
+ if (node) {
+ if (node->cpus)
+ numa_bitmask_free(node->cpus);
+ free(node);
+ }
+ }
+}
+
void order_numa_node_ratio_list(void)
{
int nid;
diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index 2b0901e..7aea026 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -30,6 +30,7 @@ struct ppcnuma_node {
unsigned int ratio;
struct dr_node *lmbs; /* linked by lmb_numa_next */
struct ppcnuma_node *ratio_next;
+ struct bitmask *cpus;
};

struct ppcnuma_topology {
@@ -48,6 +49,7 @@ extern int numa_enabled;
extern struct ppcnuma_topology numa;
void build_numa_topology(void);
void order_numa_node_ratio_list(void);
+void free_numa_topology(void);

struct ppcnuma_node *ppcnuma_fetch_node(struct ppcnuma_topology *numa,
int node_id);
diff --git a/src/drmgr/drslot_chrp_mem.c b/src/drmgr/drslot_chrp_mem.c
index 2d22bff..fe04ad1 100644
--- a/src/drmgr/drslot_chrp_mem.c
+++ b/src/drmgr/drslot_chrp_mem.c

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:19 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
This patch adds a NUMA node config update for CPU removal. It updates
the number of LMBs and CPUs for each node and also calculates the total
number of CPUs from memory-less nodes. This node configuration is
used to identify, based on the node ratio, the node from which the
CPU is selected for removal.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
Reviewed-by: Dave Marquardt <dave...@linux.ibm.com>
---
src/drmgr/common_numa.h | 1 +
src/drmgr/drslot_chrp_cpu.c | 53 +++++++++++++++++++++++++++++++++++++
2 files changed, 54 insertions(+)

diff --git a/src/drmgr/common_numa.h b/src/drmgr/common_numa.h
index 7aea026..edf4349 100644
--- a/src/drmgr/common_numa.h
+++ b/src/drmgr/common_numa.h
@@ -38,6 +38,7 @@ struct ppcnuma_topology {
unsigned int lmb_count;
unsigned int cpuless_node_count;
unsigned int cpuless_lmb_count;
+ unsigned int memless_cpu_count;
unsigned int node_count, node_min, node_max;
struct ppcnuma_node *nodes[MAX_NUMNODES];
struct ppcnuma_node *ratio;
diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index 6a21663..e367634 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -26,10 +26,14 @@
#include <sys/types.h>
#include <dirent.h>
#include <librtas.h>
+#include <numa.h>
#include "dr.h"
#include "drcpu.h"
#include "drpci.h"
#include "ofdt.h"
+#include "common_numa.h"
+
+#define DEFAULT_LMB_SIZE 0x10000000 /* 256MB */

struct cpu_operation;
typedef int (cpu_op_func_t) (void);
@@ -395,6 +399,43 @@ static int smt_threads_func(struct dr_info *dr_info)
return rc;
}

+/*
+ * Per node CPUs are defined as part of build_numa_topology().
+ * This function calculates number of LMBs per node based on
+ * node memory / lmb-size.
+ * n_cpus and n_lmbs are used to determine node ratio.
+ */
+static int cpu_update_numa_config(void)
+{
+ struct ppcnuma_node *node;
+ unsigned long long node_size;
+ int rc, nid;
+ uint64_t lmb_sz;
+
+ rc = get_dynamic_lmb_size(&lmb_sz);
+ /*
+ * Use the default value if lmb-size property is not available.
+ * For CPU removal, node ratio will be calculated based on
+ * total n_lmbs per CPU.
+ */
+ if (rc)
+ lmb_sz = DEFAULT_LMB_SIZE;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ node_size = numa_node_size(nid, 0);
+ /*
+ * Node has memory
+ * n_lmbs = Total memory / lmb-size
+ */
+ if (node_size) {
+ node->n_lmbs = node_size / lmb_sz;
+ } else
+ numa.memless_cpu_count += node->n_cpus;
+ }
+
+ return 0;
+}
+
int valid_cpu_options(void)
{
/* default to a quantity of 1 */
@@ -442,6 +483,15 @@ int drslot_chrp_cpu(void)
return -1;
}

+ /*
+ * Maintain NUMA aware hotplug only for remove and with count request.
+ */
+ if (usr_drc_count && (usr_action == REMOVE)) {
+ build_numa_topology();
+ if (numa_enabled)
+ cpu_update_numa_config();
+ }
+
/* If a user specifies a drc name, the quantity to add/remove is
* one. Enforce that here so the loops in add/remove code behave
* accordingly.
@@ -473,6 +523,9 @@ int drslot_chrp_cpu(void)

Haren Myneni

<haren@linux.ibm.com>
unread,
Apr 9, 2026, 4:30:20 AM (6 days ago) Apr 9
to powerpc-utils-devel@googlegroups.com, tyreld@linux.ibm.com, davemarq@linux.ibm.com, mmc@linux.ibm.com, hbabu@us.ibm.com, haren@linux.ibm.com
The current CPU removal process reads the CPU list from the last CPU
and removes CPUs based on the userspace request. This process can
result in CPU-less NUMA nodes even though these nodes have more
memory, which can affect system performance.

This patch adds a NUMA-aware CPU removal process to remove CPUs from
specific NUMA nodes and maintain NUMA balance. The selection of the
node from which the CPU is to be removed is based on the available
memory per CPU in that node, called the node ratio. So the CPU is
selected from the node which has the lowest ratio.

If the NUMA topology can't be read, fall back to using the current
process.

The node selection process is as follows:
- For each CPU removal request, update node ratios and sort the list.
- Select the next removable CPU from the dr_info CPU list and it
should belong to the first node.
- CPUs associated with memory-less nodes are considered first, and then
  the first node that has memory in the list.
- Repeat all CPUs in dr_info list until the next removable CPU is
matched with node CPU bitmap.
- The total number of CPU threads in the selected node is
decremented and cleared in the node CPU bitmap.

Signed-off-by: Haren Myneni <ha...@linux.ibm.com>
---
src/drmgr/drslot_chrp_cpu.c | 143 +++++++++++++++++++++++++++++++++++-
1 file changed, 141 insertions(+), 2 deletions(-)

diff --git a/src/drmgr/drslot_chrp_cpu.c b/src/drmgr/drslot_chrp_cpu.c
index e367634..60956a0 100644
--- a/src/drmgr/drslot_chrp_cpu.c
+++ b/src/drmgr/drslot_chrp_cpu.c
@@ -164,6 +164,141 @@ static struct dr_node *get_available_cpu_by_index(struct dr_info *dr_info)
return cpu;
}

+/*
+ * Return node if CPU ID matches in node CPU bitmap.
+ */
+static struct ppcnuma_node *match_cpu_node(struct ppcnuma_node *node,
+ struct dr_node *cpu)
+{
+ int nid;
+
+ if (cpu->cpu_threads) {
+ nid = numa_node_of_cpu(cpu->cpu_threads->id);
+ if (nid == node->node_id) {
+ if (numa_bitmask_isbitset(node->cpus,
+ cpu->cpu_threads->id))
+ return node;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Return node if CPU belongs to any memoryless NUMA node.
+ */
+static struct ppcnuma_node *find_cpu_memless_node(struct dr_node *cpu)
+{
+ struct ppcnuma_node *node = NULL;
+ int nid;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (node->n_lmbs)
+ continue;
+
+ if (match_cpu_node(node, cpu))
+ return node;
+ }
+
+ return NULL;
+}
+
+/*
+ * The node list is sorted by node ratio (less memory per CPU).
+ * So consider the first node
+ * Return node if CPU belongs to the first NUMA node which
+ * has memory.
+ */
+static struct ppcnuma_node *find_cpu_numa_node(struct dr_node *cpu)
+{
+ struct ppcnuma_node *node = NULL;
+ int found = 0;
+
+ ppcnuma_foreach_node_by_ratio(&numa, node) {
+ if (node->n_cpus && node->n_lmbs) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (found && match_cpu_node(node, cpu))
+ return node;
+
+ return NULL;
+}
+
+/*
+ * Calculate node ratio based on amount of memory per CPU and sort
+ * the node ratio list.
+ */
+static void cpu_update_node_ratio(void)
+{
+ struct ppcnuma_node *node;
+ int nid;
+
+ ppcnuma_foreach_node(&numa, nid, node) {
+ if (!node->n_lmbs || !node->n_cpus)
+ continue;
+
+ /*
+ * Node ratio = n_lmbs per CPU
+ */
+ node->ratio = (node->n_lmbs * 100) / node->n_cpus;
+ }
+
+ order_numa_node_ratio_list();
+}
+
+/*
+ * Scan CPUs from the last one in the list and select the first CPU
+ * based on:
+ * - CPU from memory less node
+ * - If no CPUs are available in memory less nodes, CPU belongs to
+ * the first node from node ratio list.
+ */
+static struct dr_node *numa_get_next_cpu(struct dr_info *dr_info)
+{
+ struct ppcnuma_node *node;
+ struct dr_node *cpu = NULL;
+ struct thread *t;
+ int i, found = 0;
+
+ /*
+ * Update node ratio for each CPU removal request
+ */
+ cpu_update_node_ratio();
+
+ /* Find the first cpu with an online thread */
+ for (cpu = dr_info->all_cpus; cpu; cpu = cpu->next) {
+ if (cpu->unusable)
+ continue;
+
+ if (numa.memless_cpu_count)
+ node = find_cpu_memless_node(cpu);
+ else
+ node = find_cpu_numa_node(cpu);
+
+ if (!node)
+ continue;
+
+ t = cpu->cpu_threads;
+ for (i = 0; i < cpu->cpu_nthreads && t; i++, t = t->next) {
+ if (get_thread_state(t) == ONLINE)
+ found = 1;
+ numa_bitmask_clearbit(node->cpus, t->id);
+ }
+ if (found) {
+ node->n_cpus -= cpu->cpu_nthreads;
+ numa.cpu_count -= cpu->cpu_nthreads;
+ if (!node->n_lmbs)
+ numa.memless_cpu_count -= cpu->cpu_nthreads;
+ return cpu;
+ }
+ }
+
+ return NULL;
+}
+
/*
* Scan all CPUs from the last one for the next available CPU.
* Used only for non-NUMA based CPU removal.
@@ -202,8 +337,12 @@ static struct dr_node *get_next_available_cpu(struct dr_info *dr_info)

cpu = survivor;
} else if (usr_action == REMOVE) {
- /* Find the first cpu with an online thread */
- cpu = get_next_cpu(dr_info);
+ if (numa_enabled)
+ /* Find the first CPU from NUMA nodes */
+ cpu = numa_get_next_cpu(dr_info);
+ else
+ /* Find the first cpu with an online thread */
Reply all
Reply to author
Forward
0 new messages