From ad43d23701df932aa94a6573ea3ec71dbe66f6c5 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Apr 25 2023 08:46:41 +0000 Subject: powerpc/numa: Restrict possible nodes based on platform (bsc#1209999 ltc#202140 bsc#1142685 ltc#179509 FATE#327775 git-fixes). --- diff --git a/patches.suse/powerpc-numa-Restrict-possible-nodes-based-on-platfo.patch b/patches.suse/powerpc-numa-Restrict-possible-nodes-based-on-platfo.patch new file mode 100644 index 0000000..5a598ba --- /dev/null +++ b/patches.suse/powerpc-numa-Restrict-possible-nodes-based-on-platfo.patch @@ -0,0 +1,150 @@ +From 67df77845c181166d4bc324cbb0382f7e81c7631 Mon Sep 17 00:00:00 2001 +From: Srikar Dronamraju +Date: Mon, 17 Aug 2020 11:22:57 +0530 +Subject: [PATCH] powerpc/numa: Restrict possible nodes based on platform + +References: bsc#1209999 ltc#202140 bsc#1142685 ltc#179509 FATE#327775 git-fixes +Patch-mainline: v5.10-rc1 +Git-commit: 67df77845c181166d4bc324cbb0382f7e81c7631 + +As per draft LoPAPR (Revision 2.9_pre7), section B.5.3 "Run Time +Abstraction Services (RTAS) Node" available at: + https://openpowerfoundation.org/wp-content/uploads/2020/07/LoPAR-20200611.pdf + +... there are 2 device tree properties: + + "ibm,max-associativity-domains" + which defines the maximum number of domains that the firmware, i.e. + PowerVM, can support. + +and: + + "ibm,current-associativity-domains" + which defines the maximum number of domains that the current + platform can support. + +The value of "ibm,max-associativity-domains" is always greater than or +equal to "ibm,current-associativity-domains" property. If the latter +property is not available, use "ibm,max-associativity-domains" as a +fallback. In this yet to be released LoPAPR, "ibm,current-associativity-domains" +is mentioned on page 833 / B.5.3 which is covered under the +"Appendix B. System Binding" section. + +Currently powerpc uses the "ibm,max-associativity-domains" property +while setting the possible number of nodes. This is currently set at +32.
However the possible number of nodes for a platform may be +significantly less. Hence set the possible number of nodes based on +the "ibm,current-associativity-domains" property. + +Nathan Lynch had raised a valid concern that post LPM (Live Partition +Migration), a user could DLPAR add processors and memory after LPM +with "new" associativity properties: + https://lore.kernel.org/linuxppc-dev/871rljfet9.fsf@linux.ibm.com/t/#u + +He also pointed out that "ibm,max-associativity-domains" has the same +contents on all currently available PowerVM systems, unlike +"ibm,current-associativity-domains" and hence may be better able to +handle the new NUMA associativity properties. + +However with the recent commit dbce45628085 ("powerpc/numa: Limit +possible nodes to within num_possible_nodes"), all new NUMA +associativity properties are capped to the initially set nr_node_ids. +Hence this commit should be safe with any new DLPAR add post LPM. + + $ lsprop /proc/device-tree/rtas/ibm,*associ*-domains + /proc/device-tree/rtas/ibm,current-associativity-domains + 00000005 00000001 00000002 00000002 00000002 00000010 + /proc/device-tree/rtas/ibm,max-associativity-domains + 00000005 00000001 00000008 00000020 00000020 00000100 + + $ cat /sys/devices/system/node/possible ##Before patch + 0-31 + + $ cat /sys/devices/system/node/possible ##After patch + 0-1 + +Note the maximum nodes this platform can support is only 2 but the +possible nodes are set to 32. + +This is important because a lot of kernel and user space code allocates +structures for all possible nodes leading to a lot of memory that is +allocated but not used. + +I ran a simple experiment to create and destroy 100 memory cgroups on +boot on an 8 node machine (Power8 Alpine).
+ +Before patch: + free -k at boot + total used free shared buff/cache available + Mem: 523498176 4106816 518820608 22272 570752 516606720 + Swap: 4194240 0 4194240 + + free -k after creating 100 memory cgroups + total used free shared buff/cache available + Mem: 523498176 4628416 518246464 22336 623296 516058688 + Swap: 4194240 0 4194240 + + free -k after destroying 100 memory cgroups + total used free shared buff/cache available + Mem: 523498176 4697408 518173760 22400 627008 515987904 + Swap: 4194240 0 4194240 + +After patch: + free -k at boot + total used free shared buff/cache available + Mem: 523498176 3969472 518933888 22272 594816 516731776 + Swap: 4194240 0 4194240 + + free -k after creating 100 memory cgroups + total used free shared buff/cache available + Mem: 523498176 4181888 518676096 22208 640192 516496448 + Swap: 4194240 0 4194240 + + free -k after destroying 100 memory cgroups + total used free shared buff/cache available + Mem: 523498176 4232320 518619904 22272 645952 516443264 + Swap: 4194240 0 4194240 + +Observations: + Fixed kernel takes 137344 kb (4106816-3969472) less to boot. + Fixed kernel takes 309184 kb (4628416-4181888-137344) less to create 100 memcgs. 
+ +Signed-off-by: Srikar Dronamraju +[mpe: Reformat change log a bit for readability] +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20200817055257.110873-1-srikar@linux.vnet.ibm.com +Acked-by: Michal Suchanek +--- + arch/powerpc/mm/numa.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index 1f61fa2148b5..5ddc83ba20f4 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -900,10 +900,19 @@ static void __init find_possible_nodes(void) + if (!rtas) + return; + +- if (of_property_read_u32_index(rtas, +- "ibm,max-associativity-domains", ++ if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains", ++ min_common_depth, &numnodes)) { ++ /* ++ * ibm,current-associativity-domains is a fairly recent ++ * property. If it doesn't exist, then fallback on ++ * ibm,max-associativity-domains. Current denotes what the ++ * platform can support compared to max which denotes what the ++ * Hypervisor can support. ++ */ ++ if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains", + min_common_depth, &numnodes)) +- goto out; ++ goto out; ++ } + + for (i = 0; i < numnodes; i++) { + if (!node_possible(i)) +-- +2.40.0 + diff --git a/series.conf b/series.conf index bfbf493..71fb4e2 100644 --- a/series.conf +++ b/series.conf @@ -58547,6 +58547,7 @@ patches.suse/powerpc-pseries-explicitly-reschedule-during-drmem_l.patch patches.suse/pseries-drmem-don-t-cache-node-id-in-drmem_lmb-struc.patch patches.suse/powerepc-book3s64-hash-Align-start-end-address-corre.patch + patches.suse/powerpc-numa-Restrict-possible-nodes-based-on-platfo.patch patches.suse/powerpc-numa-Prefer-node-id-queried-from-vphn.patch patches.suse/powerpc-powernv-elog-Fix-race-while-processing-OPAL-.patch patches.suse/powerpc-pseries-add-new-branch-prediction-security-b.patch