From a664f8e985d3e72bf57ebe38cf02af929155ee5f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Jul 09 2019 09:29:10 +0000 Subject: Merge remote-tracking branch 'origin/SLE15-SP1' into SLE12-SP5 Conflicts: series.conf --- diff --git a/patches.arch/x86-amd_nb-add-pci-device-ids-for-family-17h-model-30h.patch b/patches.arch/x86-amd_nb-add-pci-device-ids-for-family-17h-model-30h.patch index f45d898..6652c25 100644 --- a/patches.arch/x86-amd_nb-add-pci-device-ids-for-family-17h-model-30h.patch +++ b/patches.arch/x86-amd_nb-add-pci-device-ids-for-family-17h-model-30h.patch @@ -1,9 +1,11 @@ +From be3518a16ef270e3b030a6ae96055f83f51bd3dd Mon Sep 17 00:00:00 2001 From: "Woods, Brian" Date: Tue, 6 Nov 2018 20:08:18 +0000 -Subject: x86/amd_nb: Add PCI device IDs for family 17h, model 30h -Git-commit: be3518a16ef270e3b030a6ae96055f83f51bd3dd -Patch-mainline: v5.0-rc1 +Subject: [PATCH] x86/amd_nb: Add PCI device IDs for family 17h, model 30h + References: fate#326884 +Patch-mainline: v5.0-rc1 +Git-commit: be3518a16ef270e3b030a6ae96055f83f51bd3dd Add the PCI device IDs for family 17h model 30h, since they are needed for accessing various registers via the data fabric/SMN interface. @@ -24,10 +26,12 @@ CC: Thomas Gleixner CC: x86-ml Link: http://lkml.kernel.org/r/20181106200754.60722-4-brian.woods@amd.com --- - arch/x86/kernel/amd_nb.c | 6 ++++++ - include/linux/pci_ids.h | 1 + + arch/x86/kernel/amd_nb.c | 6 ++++++ + include/linux/pci_ids.h | 1 + 2 files changed, 7 insertions(+) +diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c +index cc34266e3c62..cc51275c8759 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -16,8 +16,10 @@ @@ -53,7 +57,7 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-4-brian.woods@amd.com #define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 const struct pci_device_id amd_nb_misc_ids[] = { -@@ -43,6 +47,7 @@ const struct pci_device_id amd_nb_misc_i +@@ -43,6 +47,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, @@ -61,7 +65,7 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-4-brian.woods@amd.com { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, {} }; -@@ -56,6 +61,7 @@ static const struct pci_device_id amd_nb +@@ -56,6 +61,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, @@ -69,9 +73,11 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-4-brian.woods@amd.com { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; +diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h +index 78d5cd29778a..349276fbd269 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h -@@ -542,6 +542,7 @@ +@@ -547,6 +547,7 @@ #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F4 0x1584 #define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F3 0x15eb @@ -79,3 +85,6 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-4-brian.woods@amd.com #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 +-- +2.21.0 + diff --git a/patches.arch/x86-amd_nb-add-support-for-newer-pci-topologies.patch 
b/patches.arch/x86-amd_nb-add-support-for-newer-pci-topologies.patch index 400d7c2..9ff61a0 100644 --- a/patches.arch/x86-amd_nb-add-support-for-newer-pci-topologies.patch +++ b/patches.arch/x86-amd_nb-add-support-for-newer-pci-topologies.patch @@ -1,9 +1,11 @@ +From 556e4c62baffa71e2045a298379db7e57dd47f3d Mon Sep 17 00:00:00 2001 From: "Woods, Brian" Date: Tue, 6 Nov 2018 20:08:16 +0000 -Subject: x86/amd_nb: Add support for newer PCI topologies -Git-commit: 556e4c62baffa71e2045a298379db7e57dd47f3d -Patch-mainline: v5.0-rc1 +Subject: [PATCH] x86/amd_nb: Add support for newer PCI topologies + References: fate#326884 +Patch-mainline: v5.0-rc1 +Git-commit: 556e4c62baffa71e2045a298379db7e57dd47f3d Add support for new processors which have multiple PCI root complexes per data fabric/system management network interface. If there are (N) @@ -37,16 +39,15 @@ CC: Pu Wen CC: Thomas Gleixner CC: x86-ml Link: http://lkml.kernel.org/r/20181106200754.60722-3-brian.woods@amd.com - -[ bp: Fix build without the hygon stuff from c6babb5806b77 ] - --- - arch/x86/kernel/amd_nb.c | 44 ++++++++++++++++++++++++++++++++++++++------ + arch/x86/kernel/amd_nb.c | 44 ++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) +diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c +index 19d489ee2b1e..cc34266e3c62 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c -@@ -208,7 +208,10 @@ int amd_cache_northbridges(void) +@@ -213,7 +213,10 @@ int amd_cache_northbridges(void) const struct pci_device_id *root_ids = amd_root_ids; struct pci_dev *root, *misc, *link; struct amd_northbridge *nb; @@ -58,7 +59,7 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-3-brian.woods@amd.com if (amd_northbridges.num) return 0; -@@ -221,26 +224,55 @@ int amd_cache_northbridges(void) +@@ -226,26 +229,55 @@ int amd_cache_northbridges(void) misc = NULL; while ((misc = next_northbridge(misc, misc_ids)) != NULL) @@ -71,7 +72,7 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-3-brian.woods@amd.com - nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL); + root = NULL; -+ while ((root = next_northbridge(root, amd_root_ids)) != NULL) ++ while ((root = next_northbridge(root, root_ids)) != NULL) + root_count++; + + if (root_count) { @@ -115,7 +116,10 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-3-brian.woods@amd.com + * correct PCI roots. 
+ */ + for (j = 1; j < roots_per_misc; j++) -+ root = next_northbridge(root, amd_root_ids); ++ root = next_northbridge(root, root_ids); } if (amd_gart_present()) +-- +2.21.0 + diff --git a/patches.arch/x86-cpu-get-cache-info-and-setup-cache-cpumap-for-hygon-dhyana.patch b/patches.arch/x86-cpu-get-cache-info-and-setup-cache-cpumap-for-hygon-dhyana.patch index fa08958..94dda05 100644 --- a/patches.arch/x86-cpu-get-cache-info-and-setup-cache-cpumap-for-hygon-dhyana.patch +++ b/patches.arch/x86-cpu-get-cache-info-and-setup-cache-cpumap-for-hygon-dhyana.patch @@ -27,45 +27,43 @@ Cc: x86@kernel.org Cc: thomas.lendacky@amd.com Link: https://lkml.kernel.org/r/2a686b2ac0e2f5a1f2f5f101124d9dd44f949731.1537533369.git.puwen@hygon.cn --- - arch/x86/include/asm/cacheinfo.h | 1 + - arch/x86/kernel/cpu/cacheinfo.c | 31 +++++++++++++++++++++++++++++-- - arch/x86/kernel/cpu/cpu.h | 1 + - arch/x86/kernel/cpu/hygon.c | 3 +++ - 4 files changed, 34 insertions(+), 2 deletions(-) + arch/x86/kernel/cpu/cpu.h | 2 ++ + arch/x86/kernel/cpu/hygon.c | 3 +++ + arch/x86/kernel/cpu/cacheinfo.c | 31 +++++++++++++++++++++++++++++-- + 3 files changed, 34 insertions(+), 2 deletions(-) ---- a/arch/x86/include/asm/cacheinfo.h -+++ b/arch/x86/include/asm/cacheinfo.h -@@ -3,5 +3,6 @@ - #define _ASM_X86_CACHEINFO_H - - void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +--- a/arch/x86/kernel/cpu/cpu.h ++++ b/arch/x86/kernel/cpu/cpu.h +@@ -50,5 +50,7 @@ extern void x86_spec_ctrl_setup_ap(void) + extern int detect_extended_topology_early(struct cpuinfo_x86 *c); + extern int detect_extended_topology(struct cpuinfo_x86 *c); + extern int detect_ht_early(struct cpuinfo_x86 *c); ++extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); +void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); - #endif /* _ASM_X86_CACHEINFO_H */ + #endif /* ARCH_X86_CPU_H */ +--- a/arch/x86/kernel/cpu/hygon.c ++++ b/arch/x86/kernel/cpu/hygon.c +@@ -86,6 +86,7 @@ static void hygon_get_topology(struct cp + if (!err) + c->x86_coreid_bits = get_count_order(c->x86_max_cores); + ++ cacheinfo_hygon_init_llc_id(c, cpu, node_id); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + +@@ -320,6 +321,8 @@ static void init_hygon(struct cpuinfo_x8 + hygon_get_topology(c); + srat_detect_node(c); + ++ init_hygon_cacheinfo(c); ++ + if (cpu_has(c, X86_FEATURE_XMM2)) { + unsigned long long val; + int ret; --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c -@@ -599,6 +599,10 @@ cpuid4_cache_lookup_regs(int index, stru - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); -+ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { -+ cpuid_count(0x8000001d, index, &eax.full, -+ &ebx.full, &ecx.full, &edx); -+ amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } -@@ -622,7 +626,8 @@ static int find_num_cache_leaves(struct - union _cpuid4_leaf_eax cache_eax; - int i = -1; - -- if (c->x86_vendor == X86_VENDOR_AMD) -+ if (c->x86_vendor == X86_VENDOR_AMD || -+ c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; -@@ -675,6 +680,22 @@ void cacheinfo_amd_init_llc_id(struct cp +@@ -395,6 +395,22 @@ static void amd_l3_disable_index(struct } } @@ -85,10 +83,31 @@ Link: https://lkml.kernel.org/r/2a686b2ac0e2f5a1f2f5f101124d9dd44f949731.1537533 + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; +} + - void init_amd_cacheinfo(struct cpuinfo_x86 *c) - { + /* + * disable a L3 cache index by using a 
disable-slot + * +@@ -599,6 +615,10 @@ cpuid4_cache_lookup_regs(int index, stru + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); ++ } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { ++ cpuid_count(0x8000001d, index, &eax.full, ++ &ebx.full, &ecx.full, &edx); ++ amd_init_l3_cache(this_leaf, index); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } +@@ -622,7 +642,8 @@ static int find_num_cache_leaves(struct + union _cpuid4_leaf_eax cache_eax; + int i = -1; -@@ -688,6 +709,11 @@ void init_amd_cacheinfo(struct cpuinfo_x +- if (c->x86_vendor == X86_VENDOR_AMD) ++ if (c->x86_vendor == X86_VENDOR_AMD || ++ c->x86_vendor == X86_VENDOR_HYGON) + op = 0x8000001d; + else + op = 4; +@@ -649,6 +670,11 @@ void init_amd_cacheinfo(struct cpuinfo_x } } @@ -100,7 +119,7 @@ Link: https://lkml.kernel.org/r/2a686b2ac0e2f5a1f2f5f101124d9dd44f949731.1537533 unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ -@@ -910,7 +936,8 @@ static void __cache_cpumap_setup(unsigne +@@ -871,7 +897,8 @@ static void __cache_cpumap_setup(unsigne int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -110,32 +129,3 @@ Link: https://lkml.kernel.org/r/2a686b2ac0e2f5a1f2f5f101124d9dd44f949731.1537533 if (__cache_amd_cpumap_setup(cpu, index, base)) return; } ---- a/arch/x86/kernel/cpu/cpu.h -+++ b/arch/x86/kernel/cpu/cpu.h -@@ -27,6 +27,7 @@ struct cpu_dev { - } legacy_models[5]; - #endif - }; -+extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c); - - struct _tlb_table { - unsigned char descriptor; ---- a/arch/x86/kernel/cpu/hygon.c -+++ b/arch/x86/kernel/cpu/hygon.c -@@ -87,6 +87,7 @@ static void hygon_get_topology(struct cp - if (!err) - c->x86_coreid_bits = get_count_order(c->x86_max_cores); - -+ cacheinfo_hygon_init_llc_id(c, cpu, node_id); - } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { - u64 value; - -@@ -321,6 +322,8 @@ static void init_hygon(struct cpuinfo_x8 - hygon_get_topology(c); - srat_detect_node(c); - -+ init_hygon_cacheinfo(c); -+ - if (cpu_has(c, X86_FEATURE_XMM2)) { - unsigned long long val; - int ret; diff --git a/patches.drivers/hwmon-k10temp-add-support-for-amd-family-17h-model-30h-cpus.patch b/patches.drivers/hwmon-k10temp-add-support-for-amd-family-17h-model-30h-cpus.patch index ff0d205..3fc875e 100644 --- a/patches.drivers/hwmon-k10temp-add-support-for-amd-family-17h-model-30h-cpus.patch +++ b/patches.drivers/hwmon-k10temp-add-support-for-amd-family-17h-model-30h-cpus.patch @@ -1,9 +1,11 @@ +From 210ba1201ff950b3d05bfd8fa5d47540cea393c0 Mon Sep 17 00:00:00 2001 From: "Woods, Brian" Date: Tue, 6 Nov 2018 20:08:21 +0000 -Subject: hwmon/k10temp: Add support for AMD family 17h, model 30h CPUs -Git-commit: 210ba1201ff950b3d05bfd8fa5d47540cea393c0 -Patch-mainline: v5.0-rc1 +Subject: [PATCH] hwmon/k10temp: Add support for AMD family 17h, model 30h CPUs + References: fate#326884 +Patch-mainline: v5.0-rc1 +Git-commit: 210ba1201ff950b3d05bfd8fa5d47540cea393c0 Add support for AMD family 17h model 30h processors for k10temp. 
Model 30h is functionally the same as model 01h processors (as far as k10temp @@ -25,12 +27,14 @@ CC: Thomas Gleixner CC: x86-ml Link: http://lkml.kernel.org/r/20181106200754.60722-5-brian.woods@amd.com --- - drivers/hwmon/k10temp.c | 1 + + drivers/hwmon/k10temp.c | 1 + 1 file changed, 1 insertion(+) +diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c +index bc6871c8dd4e..9790f1f5eb98 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c -@@ -215,6 +215,7 @@ static const struct pci_device_id k10tem +@@ -360,6 +360,7 @@ static const struct pci_device_id k10temp_id_table[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, @@ -38,3 +42,6 @@ Link: http://lkml.kernel.org/r/20181106200754.60722-5-brian.woods@amd.com {} }; MODULE_DEVICE_TABLE(pci, k10temp_id_table); +-- +2.21.0 + diff --git a/patches.fixes/0001-mm-hwpoison-fix-thp-split-handing-in-soft_offline_in.patch b/patches.fixes/0001-mm-hwpoison-fix-thp-split-handing-in-soft_offline_in.patch index 3d03f6b..1b43492 100644 --- a/patches.fixes/0001-mm-hwpoison-fix-thp-split-handing-in-soft_offline_in.patch +++ b/patches.fixes/0001-mm-hwpoison-fix-thp-split-handing-in-soft_offline_in.patch @@ -48,7 +48,7 @@ Signed-off-by: Michal Hocko --- a/mm/memory-failure.c +++ b/mm/memory-failure.c -@@ -1826,19 +1826,17 @@ static int soft_offline_in_use_page(stru +@@ -1823,19 +1823,17 @@ static int soft_offline_in_use_page(stru struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { @@ -73,4 +73,4 @@ Signed-off-by: Michal Hocko + unlock_page(page); } - if (PageHuge(page)) + /* diff --git a/patches.fixes/fs-hugetlbfs-inode.c-fix-hwpoison-reserve-accounting.patch b/patches.fixes/fs-hugetlbfs-inode.c-fix-hwpoison-reserve-accounting.patch new file mode 100644 index 0000000..3461f00 --- /dev/null +++ b/patches.fixes/fs-hugetlbfs-inode.c-fix-hwpoison-reserve-accounting.patch @@ -0,0 +1,64 @@ +From ab615a5b879292e83653be60aa82113f7c6f462d Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Thu, 2 Nov 2017 15:59:41 -0700 +Subject: [PATCH] fs/hugetlbfs/inode.c: fix hwpoison reserve accounting +Patch-mainline: v4.14-rc8 +Git-commit: ab615a5b879292e83653be60aa82113f7c6f462d +References: bsc#1139712 + +Calling madvise(MADV_HWPOISON) on a hugetlbfs page will result in bad +(negative) reserved huge page counts. This may not happen immediately, +but may happen later when the underlying file is removed or filesystem +unmounted. For example: + + AnonHugePages: 0 kB + ShmemHugePages: 0 kB + HugePages_Total: 1 + HugePages_Free: 0 + HugePages_Rsvd: 18446744073709551615 + HugePages_Surp: 0 + Hugepagesize: 2048 kB + +In routine hugetlbfs_error_remove_page(), hugetlb_fix_reserve_counts is +called after remove_huge_page. hugetlb_fix_reserve_counts is designed +to only be called/used only if a failure is returned from +hugetlb_unreserve_pages. Therefore, call hugetlb_unreserve_pages as +required and only call hugetlb_fix_reserve_counts in the unlikely event +that hugetlb_unreserve_pages returns an error. 
+
+Link: http://lkml.kernel.org/r/20171019230007.17043-2-mike.kravetz@oracle.com
+Fixes: 78bb920344b8 ("mm: hwpoison: dissolve in-use hugepage in unrecoverable memory error")
+Signed-off-by: Mike Kravetz
+Acked-by: Naoya Horiguchi
+Cc: Michal Hocko
+Cc: Aneesh Kumar
+Cc: Anshuman Khandual
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Oscar Salvador
+---
+ fs/hugetlbfs/inode.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
+index 59073e9f01a4..ed113ea17aff 100644
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -842,9 +842,12 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping,
+ struct page *page)
+ {
+ struct inode *inode = mapping->host;
++ pgoff_t index = page->index;
+
+ remove_huge_page(page);
+- hugetlb_fix_reserve_counts(inode);
++ if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
++ hugetlb_fix_reserve_counts(inode);
++
+ return 0;
+ }
+
+--
+2.12.3
+
diff --git a/patches.fixes/mm-fix-race-on-soft-offlining-free-huge-pages.patch b/patches.fixes/mm-fix-race-on-soft-offlining-free-huge-pages.patch
new file mode 100644
index 0000000..30918a1
--- /dev/null
+++ b/patches.fixes/mm-fix-race-on-soft-offlining-free-huge-pages.patch
@@ -0,0 +1,194 @@
+From 6bc9b56433b76e40d11099338d27fbc5cd2935ca Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Thu, 23 Aug 2018 17:00:38 -0700
+Subject: [PATCH] mm: fix race on soft-offlining free huge pages
+Patch-mainline: v4.19-rc1
+Git-commit: 6bc9b56433b76e40d11099338d27fbc5cd2935ca
+References: bsc#1139712
+
+Patch series "mm: soft-offline: fix race against page allocation".
+
+Xishi recently reported an issue about a race on reusing the target
+pages of soft offlining. Discussion and analysis showed that we need to
+make sure that setting PG_hwpoison is done in the right place under
+zone->lock for soft offline. 1/2 handles the free hugepage case, and
+2/2 handles the free buddy page case.
+
+This patch (of 2):
+
+There's a race condition between soft offline and hugetlb_fault which
+causes unexpected process killing and/or hugetlb allocation failure.
+
+The process killing is caused by the following flow:
+
+  CPU 0               CPU 1              CPU 2
+
+  soft offline
+    get_any_page
+    // find the hugetlb is free
+                      mmap a hugetlb file
+                      page fault
+                      ...
+                        hugetlb_fault
+                          hugetlb_no_page
+                            alloc_huge_page
+                            // succeed
+    soft_offline_free_page
+    // set hwpoison flag
+                                         mmap the hugetlb file
+                                         page fault
+                                         ...
+                                           hugetlb_fault
+                                             hugetlb_no_page
+                                               find_lock_page
+                                                 return VM_FAULT_HWPOISON
+                                           mm_fault_error
+                                             do_sigbus
+                                             // kill the process
+
+The hugetlb allocation failure comes from the following flow:
+
+  CPU 0               CPU 1
+
+                      mmap a hugetlb file
+                      // reserve all free page but don't fault-in
+  soft offline
+    get_any_page
+    // find the hugetlb is free
+    soft_offline_free_page
+    // set hwpoison flag
+    dissolve_free_huge_page
+    // fail because all free hugepages are reserved
+                      page fault
+                      ...
+                        hugetlb_fault
+                          hugetlb_no_page
+                            alloc_huge_page
+                              ...
+                                dequeue_huge_page_node_exact
+                                // ignore hwpoisoned hugepage
+                                // and finally fail due to no-mem
+
+The root cause of this is that the current soft-offline code is written
+based on the assumption that the PageHWPoison flag should be set first to
+avoid accessing the corrupted data. This makes sense for memory_failure()
+or hard offline, but does not for soft offline, because soft offline is
+about corrected (not uncorrected) errors and is safe from data loss.
This patch +changes soft offline semantics where it sets PageHWPoison flag only after +containment of the error page completes successfully. + +Link: http://lkml.kernel.org/r/1531452366-11661-2-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Reported-by: Xishi Qiu +Suggested-by: Xishi Qiu +Tested-by: Mike Kravetz +Cc: Michal Hocko +Cc: +Cc: Mike Kravetz +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/hugetlb.c | 11 +++++------ + mm/memory-failure.c | 22 ++++++++++++++++------ + mm/migrate.c | 2 -- + 3 files changed, 21 insertions(+), 14 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 47566bb0b4b1..9f1c853f67b5 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + /* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the +- * number of free hugepages would be reduced below the number of reserved +- * hugepages. ++ * dissolution fails because a give page is not a free hugepage, or because ++ * free hugepages are fully reserved. + */ + int dissolve_free_huge_page(struct page *page) + { +- int rc = 0; ++ int rc = -EBUSY; + + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); +- if (h->free_huge_pages - h->resv_huge_pages == 0) { +- rc = -EBUSY; ++ if (h->free_huge_pages - h->resv_huge_pages == 0) + goto out; +- } + /* + * Move PageHWPoison flag from head page to the raw error page, + * which makes any subpages rather than the error page reusable. +@@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page) + h->free_huge_pages_node[nid]--; + h->max_huge_pages--; + update_and_free_page(h, head); ++ rc = 0; + } + out: + spin_unlock(&hugetlb_lock); +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index c83a1746812f..49dc32c61137 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1598,8 +1598,18 @@ static int soft_offline_huge_page(struct page *page, int flags) + if (ret > 0) + ret = -EIO; + } else { +- if (PageHuge(page)) +- dissolve_free_huge_page(page); ++ /* ++ * We set PG_hwpoison only when the migration source hugepage ++ * was successfully dissolved, because otherwise hwpoisoned ++ * hugepage remains on free hugepage list, then userspace will ++ * find it as SIGBUS by allocation failure. That's not expected ++ * in soft-offlining. 
++ */ ++ ret = dissolve_free_huge_page(page); ++ if (!ret) { ++ if (set_hwpoison_free_buddy_page(page)) ++ num_poisoned_pages_inc(); ++ } + } + return ret; + } +@@ -1715,13 +1725,13 @@ static int soft_offline_in_use_page(struct page *page, int flags) + + static void soft_offline_free_page(struct page *page) + { ++ int rc = 0; + struct page *head = compound_head(page); + +- if (!TestSetPageHWPoison(head)) { ++ if (PageHuge(head)) ++ rc = dissolve_free_huge_page(page); ++ if (!rc && !TestSetPageHWPoison(page)) + num_poisoned_pages_inc(); +- if (PageHuge(head)) +- dissolve_free_huge_page(page); +- } + } + + /** +diff --git a/mm/migrate.c b/mm/migrate.c +index c27e97b5b69d..91a99457127c 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1331,8 +1331,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, + out: + if (rc != -EAGAIN) + putback_active_hugepage(hpage); +- if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) +- num_poisoned_pages_inc(); + + /* + * If migration was not successful and there's a freeing callback, use +-- +2.12.3 + diff --git a/patches.fixes/mm-hugetlb-delete-dequeue_hwpoisoned_huge_page.patch b/patches.fixes/mm-hugetlb-delete-dequeue_hwpoisoned_huge_page.patch new file mode 100644 index 0000000..10aa1e3 --- /dev/null +++ b/patches.fixes/mm-hugetlb-delete-dequeue_hwpoisoned_huge_page.patch @@ -0,0 +1,115 @@ +From f9d7d3742626e815756d2da0f9b92d9407bf7961 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:53 -0700 +Subject: [PATCH 7/8] mm: hugetlb: delete dequeue_hwpoisoned_huge_page() +Patch-mainline: v4.13-rc1 +Git-commit: ddd40d8a2c4ef8f2152ea6d227e11475cf7e5bfa +References: bsc#1139712 + +dequeue_hwpoisoned_huge_page() is no longer used, so let's remove it. + +Link: http://lkml.kernel.org/r/1496305019-5493-9-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + include/linux/hugetlb.h | 6 +----- + mm/hugetlb.c | 34 ---------------------------------- + mm/memory-failure.c | 11 ----------- + 3 files changed, 1 insertion(+), 50 deletions(-) + +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index e495fcc1c38a..3620acbf64f8 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -92,7 +92,6 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, + vm_flags_t vm_flags); + long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed); +-int dequeue_hwpoisoned_huge_page(struct page *page); + bool isolate_huge_page(struct page *page, struct list_head *list); + void putback_active_hugepage(struct page *page); + void free_huge_page(struct page *page); +@@ -158,10 +157,7 @@ static inline void hugetlb_show_meminfo(void) + #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ + src_addr, pagep) ({ BUG(); 0; }) + #define huge_pte_offset(mm, address) 0 +-static inline int dequeue_hwpoisoned_huge_page(struct page *page) +-{ +- return 0; +-} ++ + + static inline bool isolate_huge_page(struct page *page, struct list_head *list) + { +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index dfebd5a2592d..9411e40d0e45 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -4734,40 +4734,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); + } + +-#ifdef CONFIG_MEMORY_FAILURE +- +-/* +- * This function is called from memory failure code. 
+- */ +-int dequeue_hwpoisoned_huge_page(struct page *hpage) +-{ +- struct hstate *h = page_hstate(hpage); +- int nid = page_to_nid(hpage); +- int ret = -EBUSY; +- +- spin_lock(&hugetlb_lock); +- /* +- * Just checking !page_huge_active is not enough, because that could be +- * an isolated/hwpoisoned hugepage (which have >0 refcount). +- */ +- if (!page_huge_active(hpage) && !page_count(hpage)) { +- /* +- * Hwpoisoned hugepage isn't linked to activelist or freelist, +- * but dangling hpage->lru can trigger list-debug warnings +- * (this happens when we call unpoison_memory() on it), +- * so let it point to itself with list_del_init(). +- */ +- list_del_init(&hpage->lru); +- set_page_refcounted(hpage); +- h->free_huge_pages--; +- h->free_huge_pages_node[nid]--; +- ret = 0; +- } +- spin_unlock(&hugetlb_lock); +- return ret; +-} +-#endif +- + bool isolate_huge_page(struct page *page, struct list_head *list) + { + bool ret = true; +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 4a95c12f39d2..ba1f316e923d 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1458,17 +1458,6 @@ int unpoison_memory(unsigned long pfn) + } + + if (!get_hwpoison_page(p)) { +- /* +- * Since HWPoisoned hugepage should have non-zero refcount, +- * race between memory failure and unpoison seems to happen. +- * In such case unpoison fails and memory failure runs +- * to the end. +- */ +- if (PageHuge(page)) { +- unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", +- pfn, &unpoison_rs); +- return 0; +- } + if (TestClearPageHWPoison(p)) + num_poisoned_pages_dec(); + unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", +-- +2.12.3 + diff --git a/patches.fixes/mm-hugetlb-prevent-reuse-of-hwpoisoned-free-hugepage.patch b/patches.fixes/mm-hugetlb-prevent-reuse-of-hwpoisoned-free-hugepage.patch new file mode 100644 index 0000000..d9dd284 --- /dev/null +++ b/patches.fixes/mm-hugetlb-prevent-reuse-of-hwpoisoned-free-hugepage.patch @@ -0,0 +1,79 @@ +From 8a61ffcfee8495f9315449da5c0e4c856c953d6c Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:32 -0700 +Subject: [PATCH 1/8] mm: hugetlb: prevent reuse of hwpoisoned free hugepages +Patch-mainline: v4.13-rc1 +Git-commit: 243abd5b7803d540280f029bc5224a4a2892579a +References: bsc#1139712 + +Patch series "mm: hwpoison: fixlet for hugetlb migration". + +This patchset updates the hwpoison/hugetlb code to address 2 reported +issues. + +One is madvise(MADV_HWPOISON) failure reported by Intel's lkp robot (see +http://lkml.kernel.org/r/20170417055948.GM31394@yexl-desktop.) First +half was already fixed in mainline, and another half about hugetlb cases +are solved in this series. + +Another issue is "narrow-down error affected region into a single 4kB +page instead of a whole hugetlb page" issue, which was tried by Anshuman +(http://lkml.kernel.org/r/20170420110627.12307-1-khandual@linux.vnet.ibm.com) +and I updated it to apply it more widely. + +This patch (of 9): + +We no longer use MIGRATE_ISOLATE to prevent reuse of hwpoison hugepages +as we did before. So current dequeue_huge_page_node() doesn't work as +intended because it still uses is_migrate_isolate_page() for this check. +This patch fixes it with PageHWPoison flag. 
+
+Link: http://lkml.kernel.org/r/1496305019-5493-2-git-send-email-n-horiguchi@ah.jp.nec.com
+Signed-off-by: Naoya Horiguchi
+Cc: Michal Hocko
+Cc: "Aneesh Kumar K.V"
+Cc: Anshuman Khandual
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Oscar Salvador
+---
+ mm/hugetlb.c | 3 +--
+ mm/memory-failure.c | 1 -
+ 2 files changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/mm/hugetlb.c b/mm/hugetlb.c
+index f69400b723c1..0f13a44c8ae7 100644
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -22,7 +22,6 @@
+ #include
+ #include
+ #include
+-#include
+ #include
+
+ #include
+@@ -872,7 +871,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+ struct page *page;
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+- if (!is_migrate_isolate_page(page))
++ if (!PageHWPoison(page))
+ break;
+ /*
+ * if 'non-isolated free hugepage' not found on the list,
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index 717ab9d751c3..36ee53834924 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -49,7 +49,6 @@
+ #include
+ #include
+ #include
+-#include
+ #include
+ #include
+ #include
+--
+2.12.3
+
diff --git a/patches.fixes/mm-hugetlb-soft-offline-dissolve-source-hugepage-aft.patch b/patches.fixes/mm-hugetlb-soft-offline-dissolve-source-hugepage-aft.patch
new file mode 100644
index 0000000..8db6bf7
--- /dev/null
+++ b/patches.fixes/mm-hugetlb-soft-offline-dissolve-source-hugepage-aft.patch
@@ -0,0 +1,149 @@
+From 5e38401c39b1d2cf3ffbc30b7d798c2d9dacd177 Mon Sep 17 00:00:00 2001
+From: Anshuman Khandual
+Date: Mon, 10 Jul 2017 15:47:41 -0700
+Subject: [PATCH 3/8] mm: hugetlb: soft-offline: dissolve source hugepage after
+ successful migration
+Patch-mainline: v4.13-rc1
+Git-commit: c3114a84f7f96c9d5c73c8bfa7e21ff42fda97e2
+References: bsc#1139712
+
+Currently hugepage migrated by soft-offline (i.e. due to correctable
+memory errors) is contained as a hugepage, which means many non-error
+pages in it are unreusable, i.e. wasted.
+
+This patch solves this issue by dissolving source hugepages into buddy.
+As done in the previous patch, PageHWPoison is set only on the head page
+of the error hugepage. Then in dissolving we move the PageHWPoison flag
+to the raw error page so that all healthy subpages return back to buddy.
+ +[arnd@arndb.de: fix warnings: replace some macros with inline functions] + Link: http://lkml.kernel.org/r/20170609102544.2947326-1-arnd@arndb.de +Link: http://lkml.kernel.org/r/1496305019-5493-5-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Anshuman Khandual +Signed-off-by: Naoya Horiguchi +Signed-off-by: Arnd Bergmann +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + include/linux/hugetlb.h | 31 +++++++++++++++++++++++++++---- + mm/hugetlb.c | 10 +++++++++- + mm/memory-failure.c | 5 +---- + mm/migrate.c | 2 ++ + 4 files changed, 39 insertions(+), 9 deletions(-) + +diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h +index bb1de231f84a..e495fcc1c38a 100644 +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -462,6 +462,7 @@ static inline pgoff_t basepage_index(struct page *page) + return __basepage_index(page); + } + ++extern int dissolve_free_huge_page(struct page *page); + extern int dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); + static inline bool hugepage_migration_supported(struct hstate *h) +@@ -524,15 +525,37 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) + { + return 1; + } +-#define hstate_index_to_shift(index) 0 +-#define hstate_index(h) 0 ++ ++static inline unsigned hstate_index_to_shift(unsigned index) ++{ ++ return 0; ++} ++ ++static inline int hstate_index(struct hstate *h) ++{ ++ return 0; ++} + + static inline pgoff_t basepage_index(struct page *page) + { + return page->index; + } +-#define dissolve_free_huge_pages(s, e) 0 +-#define hugepage_migration_supported(h) false ++ ++static inline int dissolve_free_huge_page(struct page *page) ++{ ++ return 0; ++} ++ ++static inline int dissolve_free_huge_pages(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ return 0; ++} ++ ++static inline bool hugepage_migration_supported(struct hstate *h) ++{ ++ return false; ++} + + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index 0f13a44c8ae7..dfebd5a2592d 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1445,7 +1445,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + * number of free hugepages would be reduced below the number of reserved + * hugepages. + */ +-static int dissolve_free_huge_page(struct page *page) ++int dissolve_free_huge_page(struct page *page) + { + int rc = 0; + +@@ -1458,6 +1458,14 @@ static int dissolve_free_huge_page(struct page *page) + rc = -EBUSY; + goto out; + } ++ /* ++ * Move PageHWPoison flag from head page to the raw error page, ++ * which makes any subpages rather than the error page reusable. 
++ */
++ if (PageHWPoison(head) && page != head) {
++ SetPageHWPoison(page);
++ ClearPageHWPoison(head);
++ }
+ list_del(&head->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index cbf872432bcc..c5f3411f8011 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1570,11 +1570,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
+ if (ret > 0)
+ ret = -EIO;
+ } else {
+- /* overcommit hugetlb page will be freed to buddy */
+- SetPageHWPoison(page);
+ if (PageHuge(page))
+- dequeue_hwpoisoned_huge_page(hpage);
+- num_poisoned_pages_inc();
++ dissolve_free_huge_page(page);
+ }
+ return ret;
+ }
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 9bb22a4a5132..5ae662112f96 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1252,6 +1252,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
+ out:
+ if (rc != -EAGAIN)
+ putback_active_hugepage(hpage);
++ if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
++ num_poisoned_pages_inc();
+
+ /*
+ * If migration was not successful and there's a freeing callback, use
+--
+2.12.3
+
diff --git a/patches.fixes/mm-hugetlb-soft-offline-dissolve_free_huge_page-retu.patch b/patches.fixes/mm-hugetlb-soft-offline-dissolve_free_huge_page-retu.patch
new file mode 100644
index 0000000..22a795f
--- /dev/null
+++ b/patches.fixes/mm-hugetlb-soft-offline-dissolve_free_huge_page-retu.patch
@@ -0,0 +1,139 @@
+From faf53def3b143df11062d87c12afe6afeb6f8cc7 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Fri, 28 Jun 2019 12:06:56 -0700
+Subject: [PATCH 2/2] mm: hugetlb: soft-offline: dissolve_free_huge_page()
+ return zero on !PageHuge
+Patch-mainline: v5.2-rc6
+Git-commit: faf53def3b143df11062d87c12afe6afeb6f8cc7
+References: bsc#1139712
+
+madvise(MADV_SOFT_OFFLINE) often returns -EBUSY when calling soft offline
+for hugepages with overcommitting enabled. That was caused by
+suboptimal logic in the current soft-offline code. See the following part:
+
+ ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+ MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (ret) {
+ ...
+ } else {
+ /*
+ * We set PG_hwpoison only when the migration source hugepage
+ * was successfully dissolved, because otherwise hwpoisoned
+ * hugepage remains on free hugepage list, then userspace will
+ * find it as SIGBUS by allocation failure. That's not expected
+ * in soft-offlining.
+ */
+ ret = dissolve_free_huge_page(page);
+ if (!ret) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ }
+ }
+ return ret;
+
+Here dissolve_free_huge_page() returns -EBUSY if the migration source page
+was freed into buddy in migrate_pages(), but even in that case we actually
+have a chance that set_hwpoison_free_buddy_page() succeeds. So that means
+the current code gives up offlining too early.
+
+dissolve_free_huge_page() checks that a given hugepage is suitable for
+dissolving, where we should return success for the !PageHuge() case
+because the given hugepage is considered already dissolved.
+
+This change also affects other callers of dissolve_free_huge_page(),
+which are cleaned up together.
+ +[n-horiguchi@ah.jp.nec.com: v3] + Link: http://lkml.kernel.org/r/1560761476-4651-3-git-send-email-n-horiguchi@ah.jp.nec.comLink: http://lkml.kernel.org/r/1560154686-18497-3-git-send-email-n-horiguchi@ah.jp.nec.com +Fixes: 6bc9b56433b76 ("mm: fix race on soft-offlining") +Signed-off-by: Naoya Horiguchi +Reported-by: Chen, Jerry T +Tested-by: Chen, Jerry T +Reviewed-by: Mike Kravetz +Reviewed-by: Oscar Salvador +Cc: Michal Hocko +Cc: Xishi Qiu +Cc: "Chen, Jerry T" +Cc: "Zhuo, Qiuxu" +Cc: [4.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/hugetlb.c | 29 ++++++++++++++++++++--------- + mm/memory-failure.c | 5 +---- + 2 files changed, 21 insertions(+), 13 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index ac843d3..ede7e7f 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -1510,16 +1510,29 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + + /* + * Dissolve a given free hugepage into free buddy pages. This function does +- * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the +- * dissolution fails because a give page is not a free hugepage, or because +- * free hugepages are fully reserved. ++ * nothing for in-use hugepages and non-hugepages. ++ * This function returns values like below: ++ * ++ * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use ++ * (allocated or reserved.) ++ * 0: successfully dissolved free hugepages or the page is not a ++ * hugepage (considered as already dissolved) + */ + int dissolve_free_huge_page(struct page *page) + { + int rc = -EBUSY; + ++ /* Not to disrupt normal path by vainly holding hugetlb_lock */ ++ if (!PageHuge(page)) ++ return 0; ++ + spin_lock(&hugetlb_lock); +- if (PageHuge(page) && !page_count(page)) { ++ if (!PageHuge(page)) { ++ rc = 0; ++ goto out; ++ } ++ ++ if (!page_count(page)) { + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); +@@ -1564,11 +1577,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) + + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + page = pfn_to_page(pfn); +- if (PageHuge(page) && !page_count(page)) { +- rc = dissolve_free_huge_page(page); +- if (rc) +- break; +- } ++ rc = dissolve_free_huge_page(page); ++ if (rc) ++ break; + } + + return rc; +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 8ee7b16..d9cc660 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1856,11 +1856,8 @@ static int soft_offline_in_use_page(struct page *page, int flags) + + static int soft_offline_free_page(struct page *page) + { +- int rc = 0; +- struct page *head = compound_head(page); ++ int rc = dissolve_free_huge_page(page); + +- if (PageHuge(head)) +- rc = dissolve_free_huge_page(page); + if (!rc) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); +-- +1.7.12.4 + diff --git a/patches.fixes/mm-hugetlb-soft_offline-save-compound-page-order-bef.patch b/patches.fixes/mm-hugetlb-soft_offline-save-compound-page-order-bef.patch new file mode 100644 index 0000000..a615595 --- /dev/null +++ b/patches.fixes/mm-hugetlb-soft_offline-save-compound-page-order-bef.patch @@ -0,0 +1,77 @@ +From 19bfbe22f59a207417b2679e7e83c180419c9ec5 Mon Sep 17 00:00:00 2001 +From: Alexandru Moise <00moses.alexander00@gmail.com> +Date: Tue, 3 Oct 2017 16:14:31 -0700 +Subject: [PATCH] mm, hugetlb, soft_offline: save compound page order before + page migration +Patch-mainline: 
v4.14-rc4 +Git-commit: 19bfbe22f59a207417b2679e7e83c180419c9ec5 +References: bsc#1139712 + +This fixes a bug in madvise() where if you'd try to soft offline a +hugepage via madvise(), while walking the address range you'd end up, +using the wrong page offset due to attempting to get the compound order +of a former but presently not compound page, due to dissolving the huge +page (since commit c3114a84f7f9: "mm: hugetlb: soft-offline: dissolve +source hugepage after successful migration"). + +As a result I ended up with all my free pages except one being offlined. + +Link: http://lkml.kernel.org/r/20170912204306.GA12053@gmail.com +Fixes: c3114a84f7f9 ("mm: hugetlb: soft-offline: dissolve source hugepage after successful migration") +Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com> +Cc: Anshuman Khandual +Cc: Michal Hocko +Cc: Andrea Arcangeli +Cc: Minchan Kim +Cc: Hillf Danton +Cc: Shaohua Li +Cc: Mike Rapoport +Cc: "Kirill A. Shutemov" +Cc: Mel Gorman +Cc: David Rientjes +Cc: Rik van Riel +Cc: Naoya Horiguchi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/madvise.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/mm/madvise.c b/mm/madvise.c +index 21261ff0466f..25bade36e9ca 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior, + { + struct page *page; + struct zone *zone; ++ unsigned int order; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +- for (; start < end; start += PAGE_SIZE << +- compound_order(compound_head(page))) { ++ ++ for (; start < end; start += PAGE_SIZE << order) { + int ret; + + ret = get_user_pages_fast(start, 1, 0, &page); + if (ret != 1) + return ret; + ++ /* ++ * When soft offlining hugepages, after migrating the page ++ * we dissolve it, therefore in the second loop "page" will ++ * no longer be a compound page, and order will be 0. ++ */ ++ order = compound_order(compound_head(page)); ++ + if (PageHWPoison(page)) { + put_page(page); + continue; +-- +2.12.3 + diff --git a/patches.fixes/mm-hwpoison-change-PageHWPoison-behavior-on-hugetlb-.patch b/patches.fixes/mm-hwpoison-change-PageHWPoison-behavior-on-hugetlb-.patch new file mode 100644 index 0000000..bc159fc --- /dev/null +++ b/patches.fixes/mm-hwpoison-change-PageHWPoison-behavior-on-hugetlb-.patch @@ -0,0 +1,248 @@ +From 4d3eca6841a22ac10ac31af101e6f51340579615 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:38 -0700 +Subject: [PATCH 2/8] mm: hwpoison: change PageHWPoison behavior on hugetlb + pages +Patch-mainline: v4.13-rc1 +Git-commit: b37ff71cc626a0c1b5e098ff9a0b723815f6aaeb +References: bsc#1139712 + +We'd like to narrow down the error region in memory error on hugetlb +pages. However, currently we set PageHWPoison flags on all subpages in +the error hugepage and add # of subpages to num_hwpoison_pages, which +doesn't fit our purpose. + +So this patch changes the behavior and we only set PageHWPoison on the +head page then increase num_hwpoison_pages only by 1. This is a +preparation for narrow-down part which comes in later patches. 
+ +Link: http://lkml.kernel.org/r/1496305019-5493-4-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Cc: Michal Hocko +Cc: "Aneesh Kumar K.V" +Cc: Anshuman Khandual +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + include/linux/swapops.h | 9 ----- + mm/memory-failure.c | 87 ++++++++++++++----------------------------------- + 2 files changed, 24 insertions(+), 72 deletions(-) + +diff --git a/include/linux/swapops.h b/include/linux/swapops.h +index 5c3a5f3e7eec..c5ff7b217ee6 100644 +--- a/include/linux/swapops.h ++++ b/include/linux/swapops.h +@@ -196,15 +196,6 @@ static inline void num_poisoned_pages_dec(void) + atomic_long_dec(&num_poisoned_pages); + } + +-static inline void num_poisoned_pages_add(long num) +-{ +- atomic_long_add(num, &num_poisoned_pages); +-} +- +-static inline void num_poisoned_pages_sub(long num) +-{ +- atomic_long_sub(num, &num_poisoned_pages); +-} + #else + + static inline swp_entry_t make_hwpoison_entry(struct page *page) +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 36ee53834924..cbf872432bcc 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1009,22 +1009,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, + return unmap_success; + } + +-static void set_page_hwpoison_huge_page(struct page *hpage) +-{ +- int i; +- int nr_pages = 1 << compound_order(hpage); +- for (i = 0; i < nr_pages; i++) +- SetPageHWPoison(hpage + i); +-} +- +-static void clear_page_hwpoison_huge_page(struct page *hpage) +-{ +- int i; +- int nr_pages = 1 << compound_order(hpage); +- for (i = 0; i < nr_pages; i++) +- ClearPageHWPoison(hpage + i); +-} +- + /** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page +@@ -1050,7 +1034,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + struct page *hpage; + struct page *orig_head; + int res; +- unsigned int nr_pages; + unsigned long page_flags; + + if (!sysctl_memory_failure_recovery) +@@ -1064,24 +1047,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + + p = pfn_to_page(pfn); + orig_head = hpage = compound_head(p); ++ ++ /* tmporary check code, to be updated in later patches */ ++ if (PageHuge(p)) { ++ if (TestSetPageHWPoison(hpage)) { ++ pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); ++ return 0; ++ } ++ goto tmp; ++ } + if (TestSetPageHWPoison(p)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); + return 0; + } + +- /* +- * Currently errors on hugetlbfs pages are measured in hugepage units, +- * so nr_pages should be 1 << compound_order. OTOH when errors are on +- * transparent hugepages, they are supposed to be split and error +- * measurement is done in normal page units. So nr_pages should be one +- * in this case. +- */ +- if (PageHuge(p)) +- nr_pages = 1 << compound_order(hpage); +- else /* normal page or thp */ +- nr_pages = 1; +- num_poisoned_pages_add(nr_pages); ++tmp: ++ num_poisoned_pages_inc(); + + /* + * We need/can do nothing about count=0 pages. 
+@@ -1109,12 +1091,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + if (PageHWPoison(hpage)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { +- num_poisoned_pages_sub(nr_pages); ++ num_poisoned_pages_dec(); + unlock_page(hpage); + return 0; + } + } +- set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, MF_MSG_FREE_HUGE, + res ? MF_IGNORED : MF_DELAYED); +@@ -1137,7 +1118,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + pr_err("Memory failure: %#lx: thp split failed\n", + pfn); + if (TestClearPageHWPoison(p)) +- num_poisoned_pages_sub(nr_pages); ++ num_poisoned_pages_dec(); + put_hwpoison_page(p); + return -EBUSY; + } +@@ -1193,14 +1174,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + */ + if (!PageHWPoison(p)) { + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); +- num_poisoned_pages_sub(nr_pages); ++ num_poisoned_pages_dec(); + unlock_page(hpage); + put_hwpoison_page(hpage); + return 0; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) +- num_poisoned_pages_sub(nr_pages); ++ num_poisoned_pages_dec(); + unlock_page(hpage); + put_hwpoison_page(hpage); + return 0; +@@ -1219,14 +1200,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + put_hwpoison_page(hpage); + return 0; + } +- /* +- * Set PG_hwpoison on all pages in an error hugepage, +- * because containment is done in hugepage unit for now. +- * Since we have done TestSetPageHWPoison() for the head page with +- * page lock held, we can safely set PG_hwpoison bits on tail pages. +- */ +- if (PageHuge(p)) +- set_page_hwpoison_huge_page(hpage); + + /* + * It's very difficult to mess with pages currently under IO +@@ -1397,7 +1370,6 @@ int unpoison_memory(unsigned long pfn) + struct page *page; + struct page *p; + int freeit = 0; +- unsigned int nr_pages; + static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + +@@ -1442,8 +1414,6 @@ int unpoison_memory(unsigned long pfn) + return 0; + } + +- nr_pages = 1 << compound_order(page); +- + if (!get_hwpoison_page(p)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, +@@ -1473,10 +1443,8 @@ int unpoison_memory(unsigned long pfn) + if (TestClearPageHWPoison(page)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + pfn, &unpoison_rs); +- num_poisoned_pages_sub(nr_pages); ++ num_poisoned_pages_dec(); + freeit = 1; +- if (PageHuge(page)) +- clear_page_hwpoison_huge_page(page); + } + unlock_page(page); + +@@ -1603,14 +1571,10 @@ static int soft_offline_huge_page(struct page *page, int flags) + ret = -EIO; + } else { + /* overcommit hugetlb page will be freed to buddy */ +- if (PageHuge(page)) { +- set_page_hwpoison_huge_page(hpage); ++ SetPageHWPoison(page); ++ if (PageHuge(page)) + dequeue_hwpoisoned_huge_page(hpage); +- num_poisoned_pages_add(1 << compound_order(hpage)); +- } else { +- SetPageHWPoison(page); +- num_poisoned_pages_inc(); +- } ++ num_poisoned_pages_inc(); + } + return ret; + } +@@ -1726,15 +1690,12 @@ static int soft_offline_in_use_page(struct page *page, int flags) + + static void soft_offline_free_page(struct page *page) + { +- if (PageHuge(page)) { +- struct page *hpage = compound_head(page); ++ struct page *head = compound_head(page); + +- set_page_hwpoison_huge_page(hpage); +- if (!dequeue_hwpoisoned_huge_page(hpage)) +- num_poisoned_pages_add(1 << compound_order(hpage)); +- } else { +- if 
(!TestSetPageHWPoison(page))
+- num_poisoned_pages_inc();
++ if (!TestSetPageHWPoison(head)) {
++ num_poisoned_pages_inc();
++ if (PageHuge(head))
++ dequeue_hwpoisoned_huge_page(head);
+ }
+ }
+
+--
+2.12.3
+
diff --git a/patches.fixes/mm-hwpoison-dissolve-in-use-hugepage-in-unrecoverabl.patch b/patches.fixes/mm-hwpoison-dissolve-in-use-hugepage-in-unrecoverabl.patch
new file mode 100644
index 0000000..509364e
--- /dev/null
+++ b/patches.fixes/mm-hwpoison-dissolve-in-use-hugepage-in-unrecoverabl.patch
@@ -0,0 +1,212 @@
+From ffd800e5a2472fa046618c6217cd1e75f32e8855 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Mon, 10 Jul 2017 15:47:50 -0700
+Subject: [PATCH 6/8] mm: hwpoison: dissolve in-use hugepage in unrecoverable
+ memory error
+Patch-mainline: v4.13-rc1
+Git-commit: 78bb920344b8a6f04b79a7c254041723b931c94f
+References: bsc#1139712
+
+Currently me_huge_page() relies on dequeue_hwpoisoned_huge_page() to
+keep the error hugepage away from the system, which is OK but not good
+enough because the hugepage still has a refcount and unpoison doesn't
+work on the error hugepage (PageHWPoison flags are cleared but pages are
+still leaked). And there's a "wasting healthy subpages" issue too. This
+patch reworks me_huge_page() to solve these issues.
+
+For hugetlb files, we recently added truncating code, so let's use it
+in the hugetlbfs-specific ->error_remove_page().
+
+For anonymous hugepages, it's helpful to dissolve the error page after
+freeing it into the free hugepage list. The migration entry and
+PageHWPoison in the head page prevent access to it.
+
+TODO: dissolve_free_huge_page() can fail, but we haven't considered
+that yet. It's not critical (and at least no worse than now) because in
+such a case the error hugepage just stays in the free hugepage list
+without being dissolved. By virtue of PageHWPoison in the head page,
+it's never allocated to processes.
+ +[akpm@linux-foundation.org: fix unused var warnings] +Fixes: 23a003bfd23ea9ea0b7756b920e51f64b284b468 ("mm/madvise: pass return code of memory_failure() to userspace") +Link: http://lkml.kernel.org/r/20170417055948.GM31394@yexl-desktop +Link: http://lkml.kernel.org/r/1496305019-5493-8-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Reported-by: kernel test robot +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + fs/hugetlbfs/inode.c | 11 +++++++ + mm/memory-failure.c | 93 ++++++++++++++++++++++++++++++---------------------- + 2 files changed, 64 insertions(+), 40 deletions(-) + +diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c +index d44f5456eb9b..52388611635e 100644 +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -851,6 +851,16 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, + return MIGRATEPAGE_SUCCESS; + } + ++static int hugetlbfs_error_remove_page(struct address_space *mapping, ++ struct page *page) ++{ ++ struct inode *inode = mapping->host; ++ ++ remove_huge_page(page); ++ hugetlb_fix_reserve_counts(inode); ++ return 0; ++} ++ + static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) + { + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); +@@ -966,6 +976,7 @@ static const struct address_space_operations hugetlbfs_aops = { + .write_end = hugetlbfs_write_end, + .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, ++ .error_remove_page = hugetlbfs_error_remove_page, + }; + + +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 3ee4b3b694a2..4a95c12f39d2 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -554,6 +554,39 @@ static int delete_from_lru_cache(struct page *p) + return -EIO; + } + ++static int truncate_error_page(struct page *p, unsigned long pfn, ++ struct address_space *mapping) ++{ ++ int ret = MF_FAILED; ++ ++ if (mapping->a_ops->error_remove_page) { ++ int err = mapping->a_ops->error_remove_page(mapping, p); ++ ++ if (err != 0) { ++ pr_info("Memory failure: %#lx: Failed to punch page: %d\n", ++ pfn, err); ++ } else if (page_has_private(p) && ++ !try_to_release_page(p, GFP_NOIO)) { ++ pr_info("Memory failure: %#lx: failed to release buffers\n", ++ pfn); ++ } else { ++ ret = MF_RECOVERED; ++ } ++ } else { ++ /* ++ * If the file system doesn't support it just invalidate ++ * This fails on dirty or anything with private pages ++ */ ++ if (invalidate_inode_page(p)) ++ ret = MF_RECOVERED; ++ else ++ pr_info("Memory failure: %#lx: Failed to invalidate\n", ++ pfn); ++ } ++ ++ return ret; ++} ++ + /* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we +@@ -578,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn) + */ + static int me_pagecache_clean(struct page *p, unsigned long pfn) + { +- int err; +- int ret = MF_FAILED; + struct address_space *mapping; + + delete_from_lru_cache(p); +@@ -611,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) + * + * Open: to take i_mutex or not for this? Right now we don't. 
+ */ +- if (mapping->a_ops->error_remove_page) { +- err = mapping->a_ops->error_remove_page(mapping, p); +- if (err != 0) { +- pr_info("Memory failure: %#lx: Failed to punch page: %d\n", +- pfn, err); +- } else if (page_has_private(p) && +- !try_to_release_page(p, GFP_NOIO)) { +- pr_info("Memory failure: %#lx: failed to release buffers\n", +- pfn); +- } else { +- ret = MF_RECOVERED; +- } +- } else { +- /* +- * If the file system doesn't support it just invalidate +- * This fails on dirty or anything with private pages +- */ +- if (invalidate_inode_page(p)) +- ret = MF_RECOVERED; +- else +- pr_info("Memory failure: %#lx: Failed to invalidate\n", +- pfn); +- } +- return ret; ++ return truncate_error_page(p, pfn, mapping); + } + + /* +@@ -740,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn) + { + int res = 0; + struct page *hpage = compound_head(p); ++ struct address_space *mapping; + + if (!PageHuge(hpage)) + return MF_DELAYED; + +- /* +- * We can safely recover from error on free or reserved (i.e. +- * not in-use) hugepage by dequeuing it from freelist. +- * To check whether a hugepage is in-use or not, we can't use +- * page->lru because it can be used in other hugepage operations, +- * such as __unmap_hugepage_range() and gather_surplus_pages(). +- * So instead we use page_mapping() and PageAnon(). +- */ +- if (!(page_mapping(hpage) || PageAnon(hpage))) { +- res = dequeue_hwpoisoned_huge_page(hpage); +- if (!res) +- return MF_RECOVERED; ++ mapping = page_mapping(hpage); ++ if (mapping) { ++ res = truncate_error_page(hpage, pfn, mapping); ++ } else { ++ unlock_page(hpage); ++ /* ++ * migration entry prevents later access on error anonymous ++ * hugepage, so we can free and dissolve it into buddy to ++ * save healthy subpages. ++ */ ++ if (PageAnon(hpage)) ++ put_page(hpage); ++ dissolve_free_huge_page(p); ++ res = MF_RECOVERED; ++ lock_page(hpage); + } +- return MF_DELAYED; ++ ++ return res; + } + + /* +@@ -856,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p, + count = page_count(p) - 1; + if (ps->action == me_swapcache_dirty && result == MF_DELAYED) + count--; +- if (count != 0) { ++ if (count > 0) { + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + pfn, action_page_types[ps->type], count); + result = MF_FAILED; +-- +2.12.3 + diff --git a/patches.fixes/mm-hwpoison-introduce-idenfity_page_state.patch b/patches.fixes/mm-hwpoison-introduce-idenfity_page_state.patch new file mode 100644 index 0000000..1317fa5 --- /dev/null +++ b/patches.fixes/mm-hwpoison-introduce-idenfity_page_state.patch @@ -0,0 +1,113 @@ +From 594053d43d61e2096aca29b15c6d44264ee436e2 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:56 -0700 +Subject: [PATCH 8/8] mm: hwpoison: introduce idenfity_page_state +Patch-mainline: v4.13-rc1 +Git-commit: 0348d2ebec9b00ea87b42dffdb3f393007303b82 +References: bsc#1139712 + +Factoring duplicate code into a function. 
+ +Link: http://lkml.kernel.org/r/1496305019-5493-10-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/memory-failure.c | 57 +++++++++++++++++++++++------------------------------ + 1 file changed, 25 insertions(+), 32 deletions(-) + +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index ba1f316e923d..b413fce59602 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1022,9 +1022,31 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, + return unmap_success; + } + +-static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) ++static int identify_page_state(unsigned long pfn, struct page *p, ++ unsigned long page_flags) + { + struct page_state *ps; ++ ++ /* ++ * The first check uses the current page flags which may not have any ++ * relevant information. The second check with the saved page flags is ++ * carried out only if the first check can't determine the page status. ++ */ ++ for (ps = error_states;; ps++) ++ if ((p->flags & ps->mask) == ps->res) ++ break; ++ ++ page_flags |= (p->flags & (1UL << PG_dirty)); ++ ++ if (!ps->mask) ++ for (ps = error_states;; ps++) ++ if ((page_flags & ps->mask) == ps->res) ++ break; ++ return page_action(ps, p, pfn); ++} ++ ++static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) ++{ + struct page *p = pfn_to_page(pfn); + struct page *head = compound_head(p); + int res; +@@ -1074,19 +1096,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) + goto out; + } + +- res = -EBUSY; +- +- for (ps = error_states;; ps++) +- if ((p->flags & ps->mask) == ps->res) +- break; +- +- page_flags |= (p->flags & (1UL << PG_dirty)); +- +- if (!ps->mask) +- for (ps = error_states;; ps++) +- if ((page_flags & ps->mask) == ps->res) +- break; +- res = page_action(ps, p, pfn); ++ res = identify_page_state(pfn, p, page_flags); + out: + unlock_page(head); + return res; +@@ -1112,7 +1122,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) + */ + int memory_failure(unsigned long pfn, int trapno, int flags) + { +- struct page_state *ps; + struct page *p; + struct page *hpage; + struct page *orig_head; +@@ -1273,23 +1282,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + } + + identify_page_state: +- res = -EBUSY; +- /* +- * The first check uses the current page flags which may not have any +- * relevant information. The second check with the saved page flagss is +- * carried out only if the first check can't determine the page status. 
+- */ +- for (ps = error_states;; ps++) +- if ((p->flags & ps->mask) == ps->res) +- break; +- +- page_flags |= (p->flags & (1UL << PG_dirty)); +- +- if (!ps->mask) +- for (ps = error_states;; ps++) +- if ((page_flags & ps->mask) == ps->res) +- break; +- res = page_action(ps, p, pfn); ++ res = identify_page_state(pfn, p, page_flags); + out: + unlock_page(p); + return res; +-- +2.12.3 + diff --git a/patches.fixes/mm-hwpoison-introduce-memory_failure_hugetlb.patch b/patches.fixes/mm-hwpoison-introduce-memory_failure_hugetlb.patch new file mode 100644 index 0000000..31dbba8 --- /dev/null +++ b/patches.fixes/mm-hwpoison-introduce-memory_failure_hugetlb.patch @@ -0,0 +1,234 @@ +From 1661392c29aef225172daae79555ec120718cc88 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:47 -0700 +Subject: [PATCH 5/8] mm: hwpoison: introduce memory_failure_hugetlb() +Patch-mainline: v4.13-rc1 +Git-commit: 761ad8d7c7b5485bb66fd5bccb58a891fe784544 +References: bsc#1139712 + +memory_failure() is a big function and hard to maintain. Handling +hugetlb- and non-hugetlb- case in a single function is not good, so this +patch separates PageHuge() branch into a new function, which saves many +PageHuge() check. + +Link: http://lkml.kernel.org/r/1496305019-5493-7-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/memory-failure.c | 134 ++++++++++++++++++++++++++++++++-------------------- + 1 file changed, 82 insertions(+), 52 deletions(-) + +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index f1c85217adf8..3ee4b3b694a2 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1009,6 +1009,76 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, + return unmap_success; + } + ++static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) ++{ ++ struct page_state *ps; ++ struct page *p = pfn_to_page(pfn); ++ struct page *head = compound_head(p); ++ int res; ++ unsigned long page_flags; ++ ++ if (TestSetPageHWPoison(head)) { ++ pr_err("Memory failure: %#lx: already hardware poisoned\n", ++ pfn); ++ return 0; ++ } ++ ++ num_poisoned_pages_inc(); ++ ++ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { ++ /* ++ * Check "filter hit" and "race with other subpage." ++ */ ++ lock_page(head); ++ if (PageHWPoison(head)) { ++ if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) ++ || (p != head && TestSetPageHWPoison(head))) { ++ num_poisoned_pages_dec(); ++ unlock_page(head); ++ return 0; ++ } ++ } ++ unlock_page(head); ++ dissolve_free_huge_page(p); ++ action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); ++ return 0; ++ } ++ ++ lock_page(head); ++ page_flags = head->flags; ++ ++ if (!PageHWPoison(head)) { ++ pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); ++ num_poisoned_pages_dec(); ++ unlock_page(head); ++ put_hwpoison_page(head); ++ return 0; ++ } ++ ++ if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) { ++ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); ++ res = -EBUSY; ++ goto out; ++ } ++ ++ res = -EBUSY; ++ ++ for (ps = error_states;; ps++) ++ if ((p->flags & ps->mask) == ps->res) ++ break; ++ ++ page_flags |= (p->flags & (1UL << PG_dirty)); ++ ++ if (!ps->mask) ++ for (ps = error_states;; ps++) ++ if ((page_flags & ps->mask) == ps->res) ++ break; ++ res = page_action(ps, p, pfn); ++out: ++ unlock_page(head); ++ return res; ++} ++ + /** + * memory_failure - Handle memory failure of a page. 
+ * @pfn: Page Number of the corrupted page +@@ -1046,33 +1116,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + } + + p = pfn_to_page(pfn); +- orig_head = hpage = compound_head(p); +- +- /* tmporary check code, to be updated in later patches */ +- if (PageHuge(p)) { +- if (TestSetPageHWPoison(hpage)) { +- pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); +- return 0; +- } +- goto tmp; +- } ++ if (PageHuge(p)) ++ return memory_failure_hugetlb(pfn, trapno, flags); + if (TestSetPageHWPoison(p)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); + return 0; + } + +-tmp: ++ orig_head = hpage = compound_head(p); + num_poisoned_pages_inc(); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. +- * 2) it's a free hugepage, which is also safe: +- * an affected hugepage will be dequeued from hugepage freelist, +- * so there's no concern about reusing it ever after. +- * 3) it's part of a non-compound high order page. ++ * 2) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. +@@ -1083,31 +1142,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + if (is_free_buddy_page(p)) { + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); + return 0; +- } else if (PageHuge(hpage)) { +- /* +- * Check "filter hit" and "race with other subpage." +- */ +- lock_page(hpage); +- if (PageHWPoison(hpage)) { +- if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) +- || (p != hpage && TestSetPageHWPoison(hpage))) { +- num_poisoned_pages_dec(); +- unlock_page(hpage); +- return 0; +- } +- } +- res = dequeue_hwpoisoned_huge_page(hpage); +- action_result(pfn, MF_MSG_FREE_HUGE, +- res ? MF_IGNORED : MF_DELAYED); +- unlock_page(hpage); +- return res; + } else { + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); + return -EBUSY; + } + } + +- if (!PageHuge(p) && PageTransHuge(hpage)) { ++ if (PageTransHuge(hpage)) { + lock_page(p); + if (!PageAnon(p) || unlikely(split_huge_page(p))) { + unlock_page(p); +@@ -1145,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + return 0; + } + +- lock_page(hpage); ++ lock_page(p); + + /* + * The page could have changed compound pages during the locking. +@@ -1175,33 +1216,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + if (!PageHWPoison(p)) { + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); + num_poisoned_pages_dec(); +- unlock_page(hpage); +- put_hwpoison_page(hpage); ++ unlock_page(p); ++ put_hwpoison_page(p); + return 0; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + num_poisoned_pages_dec(); +- unlock_page(hpage); +- put_hwpoison_page(hpage); ++ unlock_page(p); ++ put_hwpoison_page(p); + return 0; + } + +- if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) ++ if (!PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + + /* +- * For error on the tail page, we should set PG_hwpoison +- * on the head page to show that the hugepage is hwpoisoned +- */ +- if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { +- action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); +- unlock_page(hpage); +- put_hwpoison_page(hpage); +- return 0; +- } +- +- /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. 
+ */ +@@ -1248,7 +1278,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) + break; + res = page_action(ps, p, pfn); + out: +- unlock_page(hpage); ++ unlock_page(p); + return res; + } + EXPORT_SYMBOL_GPL(memory_failure); +-- +2.12.3 + diff --git a/patches.fixes/mm-madvise_inject_error-Let-memory_failure-optionall.patch b/patches.fixes/mm-madvise_inject_error-Let-memory_failure-optionall.patch index 1ccebf5..b3ddac7 100644 --- a/patches.fixes/mm-madvise_inject_error-Let-memory_failure-optionall.patch +++ b/patches.fixes/mm-madvise_inject_error-Let-memory_failure-optionall.patch @@ -30,15 +30,15 @@ Signed-off-by: Dave Jiang Acked-by: Jan Kara --- - mm/madvise.c | 16 +++++++++++++--- - 1 file changed, 13 insertions(+), 3 deletions(-) + mm/madvise.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) --- a/mm/madvise.c +++ b/mm/madvise.c -@@ -615,11 +615,13 @@ static int madvise_inject_error(int beha +@@ -616,11 +616,13 @@ static int madvise_inject_error(int beha - for (; start < end; start += PAGE_SIZE << - compound_order(compound_head(page))) { + + for (; start < end; start += PAGE_SIZE << order) { + unsigned long pfn; int ret; @@ -47,9 +47,9 @@ Acked-by: Jan Kara return ret; + pfn = page_to_pfn(page); - if (PageHWPoison(page)) { - put_page(page); -@@ -628,17 +630,25 @@ static int madvise_inject_error(int beha + /* + * When soft offlining hugepages, after migrating the page +@@ -636,7 +638,7 @@ static int madvise_inject_error(int beha if (behavior == MADV_SOFT_OFFLINE) { pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", @@ -58,10 +58,9 @@ Acked-by: Jan Kara ret = soft_offline_page(page, MF_COUNT_INCREASED); if (ret) - return ret; +@@ -644,9 +646,16 @@ static int madvise_inject_error(int beha continue; } -+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", - page_to_pfn(page), start); + pfn, start); diff --git a/patches.fixes/mm-memory_failure-Teach-memory_failure-about-dev_pag.patch b/patches.fixes/mm-memory_failure-Teach-memory_failure-about-dev_pag.patch index 917f7cf..41a5a41 100644 --- a/patches.fixes/mm-memory_failure-Teach-memory_failure-about-dev_pag.patch +++ b/patches.fixes/mm-memory_failure-Teach-memory_failure-about-dev_pag.patch @@ -51,7 +51,7 @@ Acked-by: Jan Kara --- a/include/linux/mm.h +++ b/include/linux/mm.h -@@ -2607,6 +2607,7 @@ enum mf_action_page_type { +@@ -2612,6 +2612,7 @@ enum mf_action_page_type { MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_BUDDY_2ND, @@ -61,14 +61,14 @@ Acked-by: Jan Kara --- a/mm/memory-failure.c +++ b/mm/memory-failure.c -@@ -56,6 +56,7 @@ +@@ -55,6 +55,7 @@ #include #include #include +#include #include #include - #include "internal.h" + #include @@ -272,6 +273,40 @@ void shake_page(struct page *p, int acce } EXPORT_SYMBOL_GPL(shake_page); @@ -139,8 +139,8 @@ Acked-by: Jan Kara [MF_MSG_UNKNOWN] = "unknown page", }; -@@ -1027,6 +1066,84 @@ static void clear_page_hwpoison_huge_pag - ClearPageHWPoison(hpage + i); +@@ -1104,6 +1143,84 @@ out: + return res; } +static int memory_failure_dev_pagemap(unsigned long pfn, int trapno, int flags, @@ -224,15 +224,15 @@ Acked-by: Jan Kara /** * memory_failure - Handle memory failure of a page. 
* @pfn: Page Number of the corrupted page -@@ -1051,6 +1168,7 @@ int memory_failure(unsigned long pfn, in +@@ -1127,6 +1244,7 @@ int memory_failure(unsigned long pfn, in struct page *p; struct page *hpage; struct page *orig_head; + struct dev_pagemap *pgmap; int res; - unsigned int nr_pages; unsigned long page_flags; -@@ -1064,6 +1182,10 @@ int memory_failure(unsigned long pfn, in + +@@ -1139,6 +1257,10 @@ int memory_failure(unsigned long pfn, in return -ENXIO; } @@ -241,5 +241,5 @@ Acked-by: Jan Kara + return memory_failure_dev_pagemap(pfn, trapno, flags, pgmap); + p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); - if (TestSetPageHWPoison(p)) { + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); diff --git a/patches.fixes/mm-soft-offline-close-the-race-against-page-allocati.patch b/patches.fixes/mm-soft-offline-close-the-race-against-page-allocati.patch new file mode 100644 index 0000000..85437dd --- /dev/null +++ b/patches.fixes/mm-soft-offline-close-the-race-against-page-allocati.patch @@ -0,0 +1,203 @@ +From d4ae9916ea2947341180d2b538f48875ff393a86 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Thu, 23 Aug 2018 17:00:42 -0700 +Subject: [PATCH] mm: soft-offline: close the race against page allocation +Patch-mainline: v4.19-rc1 +Git-commit: d4ae9916ea2947341180d2b538f48875ff393a86 +References: bsc#1139712 + +A process can be killed with SIGBUS(BUS_MCEERR_AR) when it tries to +allocate a page that was just freed on the way of soft-offline. This is +undesirable because soft-offline (which is about corrected error) is +less aggressive than hard-offline (which is about uncorrected error), +and we can make soft-offline fail and keep using the page for good +reason like "system is busy." + +Two main changes of this patch are: + +- setting migrate type of the target page to MIGRATE_ISOLATE. As done + in free_unref_page_commit(), this makes kernel bypass pcplist when + freeing the page. So we can assume that the page is in freelist just + after put_page() returns, + +- setting PG_hwpoison on free page under zone->lock which protects + freelists, so this allows us to avoid setting PG_hwpoison on a page + that is decided to be allocated soon. 
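+
+(For illustration only: the two changes above amount to making a
+check-then-act sequence atomic. The poison flag is set only after
+confirming, under the same lock the allocator takes, that the page is still
+free. A toy user-space sketch of that locking discipline; the names are
+invented and this is not the kernel implementation:
+
+  #include <pthread.h>
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  /* Toy page; both fields are protected by zone_lock. */
+  struct toy_page { bool free; bool poisoned; };
+
+  static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;
+
+  /* Poison the page only while it is demonstrably still free. */
+  static bool poison_if_free(struct toy_page *p)
+  {
+      bool done = false;
+
+      pthread_mutex_lock(&zone_lock);
+      if (p->free && !p->poisoned) {
+          p->poisoned = true;
+          done = true;
+      }
+      pthread_mutex_unlock(&zone_lock);
+      return done;              /* false: caller reports -EBUSY */
+  }
+
+  /* The allocator honours the flag under the very same lock. */
+  static bool alloc_page(struct toy_page *p)
+  {
+      bool ok = false;
+
+      pthread_mutex_lock(&zone_lock);
+      if (p->free && !p->poisoned) {
+          p->free = false;
+          ok = true;
+      }
+      pthread_mutex_unlock(&zone_lock);
+      return ok;
+  }
+
+  int main(void)
+  {
+      struct toy_page pg = { .free = true, .poisoned = false };
+
+      printf("poisoned=%d\n", poison_if_free(&pg));   /* 1 */
+      printf("allocated=%d\n", alloc_page(&pg));      /* 0 */
+      return 0;
+  }
+
+The MIGRATE_ISOLATE change complements the locked check: freed pages go
+straight to the buddy freelist instead of the per-cpu lists, so the check
+can actually observe them.)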
+ +[akpm@linux-foundation.org: tweak set_hwpoison_free_buddy_page() comment] +Link: http://lkml.kernel.org/r/1531452366-11661-3-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Reported-by: Xishi Qiu +Tested-by: Mike Kravetz +Cc: Michal Hocko +Cc: +Cc: Mike Kravetz +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + include/linux/page-flags.h | 5 +++++ + include/linux/swapops.h | 10 ---------- + mm/memory-failure.c | 26 +++++++++++++++++++++----- + mm/migrate.c | 2 +- + mm/page_alloc.c | 30 ++++++++++++++++++++++++++++++ + 5 files changed, 57 insertions(+), 16 deletions(-) + +--- a/include/linux/page-flags.h ++++ b/include/linux/page-flags.h +@@ -358,8 +358,13 @@ PAGEFLAG_FALSE(Uncached) + PAGEFLAG(HWPoison, hwpoison, PF_ANY) + TESTSCFLAG(HWPoison, hwpoison, PF_ANY) + #define __PG_HWPOISON (1UL << PG_hwpoison) ++extern bool set_hwpoison_free_buddy_page(struct page *page); + #else + PAGEFLAG_FALSE(HWPoison) ++static inline bool set_hwpoison_free_buddy_page(struct page *page) ++{ ++ return 0; ++} + #define __PG_HWPOISON 0 + #endif + +--- a/include/linux/swapops.h ++++ b/include/linux/swapops.h +@@ -249,11 +249,6 @@ static inline int is_hwpoison_entry(swp_ + return swp_type(entry) == SWP_HWPOISON; + } + +-static inline bool test_set_page_hwpoison(struct page *page) +-{ +- return TestSetPageHWPoison(page); +-} +- + static inline void num_poisoned_pages_inc(void) + { + atomic_long_inc(&num_poisoned_pages); +@@ -276,11 +271,6 @@ static inline int is_hwpoison_entry(swp_ + return 0; + } + +-static inline bool test_set_page_hwpoison(struct page *page) +-{ +- return false; +-} +- + static inline void num_poisoned_pages_inc(void) + { + } +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + #include "internal.h" + #include "ras/ras_event.h" + +@@ -1694,6 +1695,7 @@ static int __soft_offline_page(struct pa + static int soft_offline_in_use_page(struct page *page, int flags) + { + int ret; ++ int mt; + struct page *hpage = compound_head(page); + + if (!PageHuge(page) && PageTransHuge(hpage)) { +@@ -1712,23 +1714,37 @@ static int soft_offline_in_use_page(stru + put_hwpoison_page(hpage); + } + ++ /* ++ * Setting MIGRATE_ISOLATE here ensures that the page will be linked ++ * to free list immediately (not via pcplist) when released after ++ * successful page migration. Otherwise we can't guarantee that the ++ * page is really free after put_page() returns, so ++ * set_hwpoison_free_buddy_page() highly likely fails. 
++ */ ++ mt = get_pageblock_migratetype(page); ++ set_pageblock_migratetype(page, MIGRATE_ISOLATE); + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); +- ++ set_pageblock_migratetype(page, mt); + return ret; + } + +-static void soft_offline_free_page(struct page *page) ++static int soft_offline_free_page(struct page *page) + { + int rc = 0; + struct page *head = compound_head(page); + + if (PageHuge(head)) + rc = dissolve_free_huge_page(page); +- if (!rc && !TestSetPageHWPoison(page)) +- num_poisoned_pages_inc(); ++ if (!rc) { ++ if (set_hwpoison_free_buddy_page(page)) ++ num_poisoned_pages_inc(); ++ else ++ rc = -EBUSY; ++ } ++ return rc; + } + + /** +@@ -1772,7 +1788,7 @@ int soft_offline_page(struct page *page, + if (ret > 0) + ret = soft_offline_in_use_page(page, flags); + else if (ret == 0) +- soft_offline_free_page(page); ++ ret = soft_offline_free_page(page); + + return ret; + } +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1173,7 +1173,7 @@ out: + * intentionally. Although it's rather weird, + * it's how HWPoison flag works at the moment. + */ +- if (!test_set_page_hwpoison(page)) ++ if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); + } + } else { +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -7877,3 +7877,33 @@ bool is_free_buddy_page(struct page *pag + + return order < MAX_ORDER; + } ++ ++#ifdef CONFIG_MEMORY_FAILURE ++/* ++ * Set PG_hwpoison flag if a given page is confirmed to be a free page. This ++ * test is performed under the zone lock to prevent a race against page ++ * allocation. ++ */ ++bool set_hwpoison_free_buddy_page(struct page *page) ++{ ++ struct zone *zone = page_zone(page); ++ unsigned long pfn = page_to_pfn(page); ++ unsigned long flags; ++ unsigned int order; ++ bool hwpoisoned = false; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ for (order = 0; order < MAX_ORDER; order++) { ++ struct page *page_head = page - (pfn & ((1 << order) - 1)); ++ ++ if (PageBuddy(page_head) && page_order(page_head) >= order) { ++ if (!TestSetPageHWPoison(page)) ++ hwpoisoned = true; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&zone->lock, flags); ++ ++ return hwpoisoned; ++} ++#endif diff --git a/patches.fixes/mm-soft-offline-dissolve-free-hugepage-if-soft-offli.patch b/patches.fixes/mm-soft-offline-dissolve-free-hugepage-if-soft-offli.patch new file mode 100644 index 0000000..e71a493 --- /dev/null +++ b/patches.fixes/mm-soft-offline-dissolve-free-hugepage-if-soft-offli.patch @@ -0,0 +1,36 @@ +From 515dce8214e35b90baa3967d5defa92e71392073 Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Mon, 10 Jul 2017 15:47:44 -0700 +Subject: [PATCH 4/8] mm: soft-offline: dissolve free hugepage if soft-offlined +Patch-mainline: v4.13-rc1 +Git-commit: d4a3a60b37bf4609f9b17961a0db2f6e7ec746cd +References: bsc#1139712 + +Now we have code to rescue most of healthy pages from a hwpoisoned +hugepage. So let's apply it to soft_offline_free_page too. 
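+
+(For scale, and for illustration only: dissolving returns the hugepage to
+the buddy allocator as base pages, so only the single poisoned base page is
+lost instead of the whole unit. A back-of-the-envelope sketch assuming
+x86-64's 2 MiB hugepages and 4 KiB base pages:
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+      const unsigned long huge = 2UL << 20;         /* 2 MiB hugepage */
+      const unsigned long base = 4UL << 10;         /* 4 KiB base page */
+      const unsigned long subpages = huge / base;   /* 512 */
+
+      /* Everything except the poisoned base page is rescued. */
+      printf("rescued %lu of %lu pages (%lu KiB)\n",
+             subpages - 1, subpages, (subpages - 1) * base / 1024);
+      return 0;
+  }
+
+The bigger the hugepage, the more memory dissolving saves over discarding
+the unit wholesale.)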
+
+Link: http://lkml.kernel.org/r/1496305019-5493-6-git-send-email-n-horiguchi@ah.jp.nec.com
+Signed-off-by: Naoya Horiguchi
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Oscar Salvador
+---
+ mm/memory-failure.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index c5f3411f8011..f1c85217adf8 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1692,7 +1692,7 @@ static void soft_offline_free_page(struct page *page)
+ 	if (!TestSetPageHWPoison(head)) {
+ 		num_poisoned_pages_inc();
+ 		if (PageHuge(head))
+-			dequeue_hwpoisoned_huge_page(head);
++			dissolve_free_huge_page(page);
+ 	}
+ }
+
+--
+2.12.3
+
diff --git a/patches.fixes/mm-soft-offline-return-EBUSY-if-set_hwpoison_free_bu.patch b/patches.fixes/mm-soft-offline-return-EBUSY-if-set_hwpoison_free_bu.patch
new file mode 100644
index 0000000..fbc2539
--- /dev/null
+++ b/patches.fixes/mm-soft-offline-return-EBUSY-if-set_hwpoison_free_bu.patch
@@ -0,0 +1,51 @@
+From b38e5962f8ed0d2a2b28a887fc2221f7f41db119 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi
+Date: Fri, 28 Jun 2019 12:06:53 -0700
+Subject: [PATCH 1/2] mm: soft-offline: return -EBUSY if
+ set_hwpoison_free_buddy_page() fails
+Patch-mainline: v5.2-rc6
+Git-commit: b38e5962f8ed0d2a2b28a887fc2221f7f41db119
+References: bsc#1139712
+
+The pass/fail of soft offline should be judged by checking whether the
+raw error page was finally contained or not (i.e. the result of
+set_hwpoison_free_buddy_page()), but the current code does not work
+like that. It might lead us to misjudge the test result when
+set_hwpoison_free_buddy_page() fails.
+
+Without this fix, there are cases where madvise(MADV_SOFT_OFFLINE) may
+not offline the original page and will not return an error.
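+
+(For illustration only: the bug class here is a dropped error. The
+containment step reports success or failure, and a failure used to be
+silently ignored. A minimal sketch of the corrected flow; the helpers are
+invented stand-ins, not the kernel functions:
+
+  #include <errno.h>
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  /* Pretend containment lost the race: the page was already reused. */
+  static bool contain_raw_page(void)
+  {
+      return false;
+  }
+
+  static int soft_offline(void)
+  {
+      int ret = 0;    /* the preceding steps succeeded */
+
+      /* Previously the result of containment was thrown away. */
+      if (ret == 0 && !contain_raw_page())
+          ret = -EBUSY;    /* now the failure is surfaced */
+      return ret;
+  }
+
+  int main(void)
+  {
+      printf("soft_offline() = %d\n", soft_offline());    /* -16 */
+      return 0;
+  }
+
+Callers such as madvise(MADV_SOFT_OFFLINE) then see -EBUSY instead of a
+spurious success.)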
+ +Link: http://lkml.kernel.org/r/1560154686-18497-2-git-send-email-n-horiguchi@ah.jp.nec.com +Signed-off-by: Naoya Horiguchi +Fixes: 6bc9b56433b76 ("mm: fix race on soft-offlining") +Reviewed-by: Mike Kravetz +Reviewed-by: Oscar Salvador +Cc: Michal Hocko +Cc: Xishi Qiu +Cc: "Chen, Jerry T" +Cc: "Zhuo, Qiuxu" +Cc: [4.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Oscar Salvador +--- + mm/memory-failure.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 8da0334..8ee7b16 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -1730,6 +1730,8 @@ static int soft_offline_huge_page(struct page *page, int flags) + if (!ret) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); ++ else ++ ret = -EBUSY; + } + } + return ret; +-- +1.7.12.4 + diff --git a/patches.fixes/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch b/patches.fixes/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch index 977d41d..18abdcb 100644 --- a/patches.fixes/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch +++ b/patches.fixes/x86-mm-mm-hwpoison-clear-present-bit-for-kernel-1-1-mappings-of-poison-pages.patch @@ -138,12 +138,12 @@ Acked-by: Borislav Petkov #endif --- a/mm/memory-failure.c +++ b/mm/memory-failure.c -@@ -1063,6 +1063,8 @@ int memory_failure(unsigned long pfn, in +@@ -1137,6 +1137,8 @@ int memory_failure(unsigned long pfn, in return -ENXIO; } + arch_unmap_kpfn(pfn); + p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); - if (TestSetPageHWPoison(p)) { + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); diff --git a/patches.fixes/x86-mm-mm-hwpoison-don-t-unconditionally-unmap-kernel-1-1-pages.patch b/patches.fixes/x86-mm-mm-hwpoison-don-t-unconditionally-unmap-kernel-1-1-pages.patch index 267c629..4d56a37 100644 --- a/patches.fixes/x86-mm-mm-hwpoison-don-t-unconditionally-unmap-kernel-1-1-pages.patch +++ b/patches.fixes/x86-mm-mm-hwpoison-don-t-unconditionally-unmap-kernel-1-1-pages.patch @@ -76,6 +76,30 @@ Acked-by: Borislav Petkov #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION +--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h ++++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h +@@ -13,6 +13,21 @@ enum severity_level { + MCE_PANIC_SEVERITY, + }; + ++#ifndef CONFIG_X86_64 ++/* ++ * On 32-bit systems it would be difficult to safely unmap a poison page ++ * from the kernel 1:1 map because there are no non-canonical addresses that ++ * we can use to refer to the address without risking a speculative access. ++ * However, this isn't much of an issue because: ++ * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which ++ * are only mapped into the kernel as needed ++ * 2) Few people would run a 32-bit kernel on a machine that supports ++ * recoverable errors because they have too much memory to boot 32-bit. ++ */ ++static inline void mce_unmap_kpfn(unsigned long pfn) {} ++#define mce_unmap_kpfn mce_unmap_kpfn ++#endif ++ + extern struct blocking_notifier_head x86_mce_decoder_chain; + + #define ATTR_LEN 16 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -106,6 +106,10 @@ static struct irq_work mce_irq_work; @@ -89,7 +113,7 @@ Acked-by: Borislav Petkov /* * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
-@@ -582,7 +586,8 @@ static int srao_decode_notifier(struct n +@@ -591,7 +595,8 @@ static int srao_decode_notifier(struct n if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) { pfn = mce->addr >> PAGE_SHIFT; @@ -99,7 +123,7 @@ Acked-by: Borislav Petkov } return NOTIFY_OK; -@@ -1054,12 +1059,13 @@ static int do_memory_failure(struct mce +@@ -1063,12 +1068,13 @@ static int do_memory_failure(struct mce ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); if (ret) pr_err("Memory error not recovered"); @@ -116,7 +140,7 @@ Acked-by: Borislav Petkov { unsigned long decoy_addr; -@@ -1070,7 +1076,7 @@ void arch_unmap_kpfn(unsigned long pfn) +@@ -1079,7 +1085,7 @@ void arch_unmap_kpfn(unsigned long pfn) * We would like to just call: * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a @@ -125,7 +149,7 @@ Acked-by: Borislav Petkov * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address -@@ -1095,7 +1101,6 @@ void arch_unmap_kpfn(unsigned long pfn) +@@ -1104,7 +1110,6 @@ void arch_unmap_kpfn(unsigned long pfn) if (set_memory_np(decoy_addr, 1)) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); @@ -133,30 +157,6 @@ Acked-by: Borislav Petkov } #endif ---- a/arch/x86/kernel/cpu/mcheck/mce-internal.h -+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h -@@ -13,6 +13,21 @@ enum severity_level { - MCE_PANIC_SEVERITY, - }; - -+#ifndef CONFIG_X86_64 -+/* -+ * On 32-bit systems it would be difficult to safely unmap a poison page -+ * from the kernel 1:1 map because there are no non-canonical addresses that -+ * we can use to refer to the address without risking a speculative access. -+ * However, this isn't much of an issue because: -+ * 1) Few unmappable pages are in the 1:1 map. Most are in HIGHMEM which -+ * are only mapped into the kernel as needed -+ * 2) Few people would run a 32-bit kernel on a machine that supports -+ * recoverable errors because they have too much memory to boot 32-bit. 
-+ */ -+static inline void mce_unmap_kpfn(unsigned long pfn) {} -+#define mce_unmap_kpfn mce_unmap_kpfn -+#endif -+ - extern struct blocking_notifier_head x86_mce_decoder_chain; - - #define ATTR_LEN 16 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -126,10 +126,4 @@ static __always_inline enum lru_list pag @@ -172,12 +172,12 @@ Acked-by: Borislav Petkov #endif --- a/mm/memory-failure.c +++ b/mm/memory-failure.c -@@ -1063,8 +1063,6 @@ int memory_failure(unsigned long pfn, in +@@ -1137,8 +1137,6 @@ int memory_failure(unsigned long pfn, in return -ENXIO; } - arch_unmap_kpfn(pfn); - p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); - if (TestSetPageHWPoison(p)) { + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); diff --git a/series.conf b/series.conf index 8c200f7..8c479e0 100644 --- a/series.conf +++ b/series.conf @@ -5160,7 +5160,15 @@ patches.drivers/0005-device-property-Add-FW-type-agnostic-fwnode_graph_ge.patch patches.drivers/device-property-Add-fwnode_graph_get_port_parent.patch patches.suse/mm-page_alloc-fallback-to-smallest-page-when-not-stealing-whole-pageblock.patch + patches.fixes/mm-hugetlb-prevent-reuse-of-hwpoisoned-free-hugepage.patch patches.fixes/mm-hugetlb-return-immediately-for-hugetlb-page-in-__.patch + patches.fixes/mm-hwpoison-change-PageHWPoison-behavior-on-hugetlb-.patch + patches.fixes/mm-hugetlb-soft-offline-dissolve-source-hugepage-aft.patch + patches.fixes/mm-soft-offline-dissolve-free-hugepage-if-soft-offli.patch + patches.fixes/mm-hwpoison-introduce-memory_failure_hugetlb.patch + patches.fixes/mm-hwpoison-dissolve-in-use-hugepage-in-unrecoverabl.patch + patches.fixes/mm-hugetlb-delete-dequeue_hwpoisoned_huge_page.patch + patches.fixes/mm-hwpoison-introduce-idenfity_page_state.patch patches.fixes/mm-make-PR_SET_THP_DISABLE-immediately-active.patch patches.fixes/mm-improve-readability-of-transparent_hugepage_enabl.patch patches.fixes/mm-always-enable-thp-for-dax-mappings.patch @@ -10956,6 +10964,7 @@ patches.fixes/lsm-fix-smack_inode_removexattr-and-xattr_getsecurit.patch patches.arch/include-linux-mm.h-fix-typo-in-VM_MPX-definition.patch patches.fixes/ksm-fix-unlocked-iteration-over-vmas-in-cmp_and_merge_page.patch + patches.fixes/mm-hugetlb-soft_offline-save-compound-page-order-bef.patch patches.suse/0001-mm-oom_reaper-skip-mm-structs-with-mmu-notifiers.patch patches.fixes/mm-compaction-serialize-waitqueue_active-checks-for-real.patch patches.fixes/mm-meminit-mark-init_reserved_page-as-__meminit.patch @@ -11546,6 +11555,7 @@ patches.arch/0001-arm64-ensure-__dump_instr-checks-addr_limit.patch patches.fixes/userfaultfd-hugetlbfs-prevent-UFFDIO_COPY-to-fill-be.patch patches.fixes/ocfs2-fstrim-Fix-start-offset-of-first-cluster-group.patch + patches.fixes/fs-hugetlbfs-inode.c-fix-hwpoison-reserve-accounting.patch patches.fixes/initramfs-fix-initramfs-rebuilds-w-compression-after-disabling.patch patches.fixes/mm-swap-fix-race-between-swap-count-continuation-operations.patch patches.suse/tcp_nv-fix-division-by-zero-in-tcpnv_acked.patch @@ -39335,6 +39345,8 @@ patches.fixes/hfsplus-fix-null-dereference-in-hfsplus_lookup.patch patches.fixes/hfs-prevent-crash-on-exit-from-failed-search.patch patches.fixes/namei-allow-restricted-O_CREAT-of-FIFOs-and-regular-.patch + patches.fixes/mm-fix-race-on-soft-offlining-free-huge-pages.patch + patches.fixes/mm-soft-offline-close-the-race-against-page-allocati.patch patches.drivers/fs-proc-vmcore.c-hide-vmcoredd_mmap_dumps-for-nommu-.patch 
patches.drivers/virtio-pci-legacy-Validate-queue-pfn.patch patches.drm/8809-drm-panel-simple-tv123wam-add-unprepare-delay @@ -47530,6 +47542,8 @@ patches.drivers/Bluetooth-Fix-regression-with-minimum-encryption-key.patch patches.arch/powerpc-mm-64s-hash-Reallocate-context-ids-on-fork.patch patches.drivers/ppp-mppe-Add-softdep-to-arc4.patch + patches.fixes/mm-soft-offline-return-EBUSY-if-set_hwpoison_free_bu.patch + patches.fixes/mm-hugetlb-soft-offline-dissolve_free_huge_page-retu.patch patches.arch/perf-x86-disable-extended-registers-for-non-supported-pmus.patch patches.arch/perf-x86-regs-check-reserved-bits.patch patches.fixes/Bluetooth-Fix-faulty-expression-for-minimum-encrypti.patch