From: Vlastimil Babka <vbabka@suse.cz>
Subject: kabi: fix struct zone kabi after adding unaccepted_pages and NR_UNACCEPTED
Patch-mainline: Never, KABI
References: jsc#PED-7167 bsc#1218643
To add unaccepted_pages, we can move it to the hole before pad1. Keep exposing
it to the kabi checker outside of x86_64 so the hole assumption is forced to be
re-checked in case e.g. arm64 tries to enable it later.
Adding NR_UNACCEPTED to enum zone_stat_item breaks KABI because it increments
NR_VM_ZONE_STAT_ITEMS and enlarges zone.vm_stat[] thus shifts
zone.vm_numa_event[], which is visible to everyone, most helpers are static
inline etc.
Solve the KABI issue by creating zone_stat_item_2 for NR_UNACCEPTED, adding
vm_stat_2 to the end of struct zone and duplicating just enough helpers to work
with NR_UNACCEPTED. Also vm_zone_stat_2 is added for the global counters.
Since this is not a hot counter and updates are done under zone->lock anyway,
we can skip the pcp vmstat diffs. Also ignore !CONFIG_SMP and !CONFIG_NUMA
helper variants.
Caveat: any future patch adding code that uses NR_UNACCEPTED via the normal
helpers will still compile, but will silently modify NR_FREE_PAGES instead,
because the two constants share the same numeric value and C enum typing is not
strong enough to catch the mismatch.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
[mkoutny: Strictly speaking vmstat_text[] is as much a part of KABI as enum
zone_stat_item. Instead of adding a secondary vmstat_text[] translation
table, carefully account for the added member in idx->name helpers. The
correction reduces to 0 when !CONFIG_UNACCEPTED_MEMORY.
Helpers inlined in 3rd party code remain broken (but assume this code won't
depend on/expose the string representations).]
---
drivers/base/node.c | 2 +-
fs/proc/meminfo.c | 2 +-
include/linux/mmzone.h | 30 ++++++++++++++++++++++++------
include/linux/vmstat.h | 40 ++++++++++++++++++++++++++++++++++++----
mm/page_alloc.c | 6 +++---
mm/vmstat.c | 26 ++++++++++++++++++++++++++
6 files changed, 91 insertions(+), 15 deletions(-)
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -480,7 +480,7 @@ static ssize_t node_read_meminfo(struct
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
,
- nid, K(sum_zone_node_page_state(nid, NR_UNACCEPTED))
+ nid, K(sum_zone_node_page_state_2(nid, NR_UNACCEPTED))
#endif
);
len += hugetlb_report_node_meminfo(buf, len, nid);
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -148,7 +148,7 @@ static int meminfo_proc_show(struct seq_
#ifdef CONFIG_UNACCEPTED_MEMORY
show_val_kb(m, "Unaccepted: ",
- global_zone_page_state(NR_UNACCEPTED));
+ global_zone_page_state_2(NR_UNACCEPTED));
#endif
hugetlb_report_meminfo(m);
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -160,10 +160,14 @@ enum zone_stat_item {
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
NR_FREE_CMA_PAGES,
+ NR_VM_ZONE_STAT_ITEMS };
+
+enum zone_stat_item_2 {
#ifdef CONFIG_UNACCEPTED_MEMORY
NR_UNACCEPTED,
#endif
- NR_VM_ZONE_STAT_ITEMS };
+ NR_VM_ZONE_STAT_ITEMS_2
+};
enum node_stat_item {
NR_LRU_BASE,
@@ -612,17 +616,24 @@ struct zone {
int initialized;
+/*
+ * There is a hole on x86_64 thanks to _pad1_ but haven't checked other
+ * architectures so restrict this to CONFIG_X86_64. In case we later enable this
+ * on e.g. arm64, kabi check will fail and we'll need to re-evaluate.
+ */
+#if !defined(__GENKSYMS__) && defined(CONFIG_X86_64)
+#ifdef CONFIG_UNACCEPTED_MEMORY
+ /* Pages to be accepted. All pages on the list are MAX_ORDER */
+ struct list_head unaccepted_pages;
+#endif
+#endif
+
/* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
-#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Pages to be accepted. All pages on the list are MAX_ORDER */
- struct list_head unaccepted_pages;
-#endif
-
/* zone flags, see below */
unsigned long flags;
@@ -671,7 +682,14 @@ struct zone {
/* Zone statistics */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
+#ifndef __GENKSYMS__
+ union {
+ atomic_long_t vm_stat_2[NR_VM_ZONE_STAT_ITEMS_2];
+ void *suse_kabi_padding;
+ };
+#else
void *suse_kabi_padding;
+#endif
} ____cacheline_internodealigned_in_smp;
enum pgdat_flags {
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -138,6 +138,7 @@ static inline void vm_events_fold_cpu(in
* Zone and node-based page accounting with per cpu differentials.
*/
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_zone_stat_2[NR_VM_ZONE_STAT_ITEMS_2];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
@@ -169,6 +170,13 @@ static inline void zone_page_state_add(l
atomic_long_add(x, &vm_zone_stat[item]);
}
+static inline void zone_page_state_add_2(long x, struct zone *zone,
+ enum zone_stat_item_2 item)
+{
+ atomic_long_add(x, &zone->vm_stat_2[item]);
+ atomic_long_add(x, &vm_zone_stat_2[item]);
+}
+
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
enum node_stat_item item)
{
@@ -186,6 +194,16 @@ static inline unsigned long global_zone_
return x;
}
+static inline unsigned long global_zone_page_state_2(enum zone_stat_item_2 item)
+{
+ long x = atomic_long_read(&vm_zone_stat_2[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
@@ -215,6 +233,17 @@ static inline unsigned long zone_page_st
return x;
}
+static inline unsigned long zone_page_state_2(struct zone *zone,
+ enum zone_stat_item_2 item)
+{
+ long x = atomic_long_read(&zone->vm_stat_2[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/*
* More accurate version that also considers the currently pending
* deltas. For that we need to loop over all cpus to find the current
@@ -257,6 +286,8 @@ __count_numa_events(struct zone *zone, e
extern unsigned long sum_zone_node_page_state(int node,
enum zone_stat_item item);
+extern unsigned long sum_zone_node_page_state_2(int node,
+ enum zone_stat_item_2 item);
extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
@@ -274,6 +305,7 @@ static inline void fold_vm_numa_events(v
#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
+void __mod_zone_page_state_2(struct zone *, enum zone_stat_item_2 item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);
@@ -433,14 +465,14 @@ static inline const char *zone_stat_name
#ifdef CONFIG_NUMA
static inline const char *numa_stat_name(enum numa_stat_item item)
{
- return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_ZONE_STAT_ITEMS_2 +
item];
}
#endif /* CONFIG_NUMA */
static inline const char *node_stat_name(enum node_stat_item item)
{
- return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_ZONE_STAT_ITEMS_2 +
NR_VM_NUMA_EVENT_ITEMS +
item];
}
@@ -452,7 +484,7 @@ static inline const char *lru_list_name(
static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
- return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_ZONE_STAT_ITEMS_2 +
NR_VM_NUMA_EVENT_ITEMS +
NR_VM_NODE_STAT_ITEMS +
item];
@@ -461,7 +493,7 @@ static inline const char *writeback_stat
#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
static inline const char *vm_event_name(enum vm_event_item item)
{
- return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+ return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_ZONE_STAT_ITEMS_2 +
NR_VM_NUMA_EVENT_ITEMS +
NR_VM_NODE_STAT_ITEMS +
NR_VM_WRITEBACK_STAT_ITEMS +
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3834,7 +3834,7 @@ static inline long __zone_watermark_unus
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
+ unusable_free += zone_page_state_2(z, NR_UNACCEPTED);
#endif
return unusable_free;
@@ -9682,7 +9682,7 @@ static bool try_to_accept_memory_one(str
last = list_empty(&zone->unaccepted_pages);
__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
- __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ __mod_zone_page_state_2(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
accept_page(page, MAX_ORDER - 1);
@@ -9734,7 +9734,7 @@ static bool __free_unaccepted(struct pag
first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
- __mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ __mod_zone_page_state_2(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
if (first)
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -160,6 +160,7 @@ void vm_events_fold_cpu(int cpu)
* vm_stat contains the global counters
*/
atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_zone_stat_2[NR_VM_ZONE_STAT_ITEMS_2] __cacheline_aligned_in_smp;
atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
@@ -372,6 +373,18 @@ void __mod_zone_page_state(struct zone *
}
EXPORT_SYMBOL(__mod_zone_page_state);
+void __mod_zone_page_state_2(struct zone *zone, enum zone_stat_item_2 item,
+ long delta)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
+
+ zone_page_state_add_2(delta, zone, item);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+}
+
void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long delta)
{
@@ -1003,6 +1016,19 @@ unsigned long sum_zone_node_page_state(i
return count;
}
+
+unsigned long sum_zone_node_page_state_2(int node,
+ enum zone_stat_item_2 item)
+{
+ struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state_2(zones + i, item);
+
+ return count;
+}
/* Determine the per node value of a numa stat item. */
unsigned long sum_zone_numa_event_state(int node,