From a7f40cfe3b7ada57af9b62fd28430eeb4a7cfcb7 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linux.alibaba.com>
Date: Thu, 28 Mar 2019 20:43:55 -0700
Subject: [PATCH] mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT
 is specified
Git-commit: a7f40cfe3b7ada57af9b62fd28430eeb4a7cfcb7
Patch-mainline: v5.1-rc3
References: bsc#1185906

mhocko@suse.com:
This kernel splits THPs rather than migrating them, so we have to check
for the mode and bail out early if this is a pure MPOL_MF_STRICT mode
without move. Otherwise we would just split the page and potentially
end up on a different node, breaking the syscall contract.
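
The "pure strict" check added below masks out everything but the three
relevant flags and compares the result against MPOL_MF_STRICT alone.
Note the parenthesization: == binds more tightly than & in C, so the
whole mask expression has to be wrapped before the comparison:

	/* true iff MPOL_MF_STRICT is set and neither MPOL_MF_MOVE
	 * nor MPOL_MF_MOVE_ALL is */
	if ((flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ==
	    MPOL_MF_STRICT)
		return -EIO;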

When MPOL_MF_STRICT was specified and an existing page was already on a
node that does not follow the policy, mbind() should return -EIO.  But
commit 6f4576e3687b ("mempolicy: apply page table walker on
queue_pages_range()") broke the rule.

And commit c8633798497c ("mm: mempolicy: mbind and migrate_pages support
thp migration") didn't return the correct value for THP mbind() either.

If MPOL_MF_STRICT is set, ignore vma_migratable() to make sure the walk
reaches queue_pages_pte_range() or queue_pages_pmd(), which check
whether an existing page is already on a node that does not follow the
policy. And since a non-migratable vma may now be walked, return -EIO
there as well if MPOL_MF_MOVE or MPOL_MF_MOVE_ALL was specified.

Tested with https://github.com/metan-ucw/ltp/blob/master/testcases/kernel/syscalls/mbind/mbind02.c
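
The test boils down to the following scenario (a minimal sketch of the
idea, not the LTP test itself; it assumes a NUMA machine with at least
two nodes and that the touched pages land on a node other than node 1):

	#include <numaif.h>	/* mbind(), MPOL_*; link with -lnuma */
	#include <sys/mman.h>
	#include <string.h>
	#include <stdio.h>
	#include <errno.h>

	int main(void)
	{
		size_t len = 2UL * 1024 * 1024;
		unsigned long nodemask = 1UL << 1;	/* node 1 only */
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0, len);	/* fault pages in on the local node */

		/*
		 * Pure MPOL_MF_STRICT: no MPOL_MF_MOVE* flag is given, so
		 * the kernel must not migrate (or split) anything and has
		 * to report the misplaced pages with EIO.
		 */
		if (mbind(p, len, MPOL_BIND, &nodemask,
			  sizeof(nodemask) * 8, MPOL_MF_STRICT) == -1 &&
		    errno == EIO)
			printf("mbind() returned EIO as expected\n");
		return 0;
	}

Adding MPOL_MF_MOVE to the flags makes the same call queue the pages
for migration instead of failing.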

[akpm@linux-foundation.org: tweak code comment]
Link: http://lkml.kernel.org/r/1553020556-38583-1-git-send-email-yang.shi@linux.alibaba.com
Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()")
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
Reported-by: Cyril Hrubis <chrubis@suse.cz>
Suggested-by: Kirill A. Shutemov <kirill@shutemov.name>
Acked-by: Rafael Aquini <aquini@redhat.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>

---
 mm/mempolicy.c |   53 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 6 deletions(-)
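
A note on the queue_pages_pte_range() hunks below: the new "break"
statements leave the pte loop early with addr != end, which the
function's new tail translates into the -EIO result:

	return addr != end ? -EIO : 0;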

--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -499,6 +499,22 @@ static int queue_pages_pte_range(pmd_t *
 		ptl = pmd_lock(walk->mm, pmd);
 		if (pmd_trans_huge(*pmd)) {
 			page = pmd_page(*pmd);
+
+			nid = page_to_nid(page);
+			if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) {
+				spin_unlock(ptl);
+				return 0;
+			}
+
+			/*
+			 * We cannot modify (split) a THP in pure strict mode; a
+			 * misplaced page has to be reported with -EIO
+			 */
+			if ((flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT) {
+				spin_unlock(ptl);
+				return -EIO;
+			}
+
 			if (is_huge_zero_page(page)) {
 				spin_unlock(ptl);
 				__split_huge_pmd(vma, pmd, addr, false, NULL);
@@ -509,8 +525,15 @@ static int queue_pages_pte_range(pmd_t *
 				ret = split_huge_page(page);
 				unlock_page(page);
 				put_page(page);
-				if (ret)
+				if (ret) {
+					/*
+					 * If the split fails while in strict
+					 * mode, report the error
+					 */
+					if (flags & MPOL_MF_STRICT)
+						return -EIO;
 					return 0;
+				}
 			}
 		} else {
 			spin_unlock(ptl);
@@ -536,6 +559,12 @@ retry:
 		nid = page_to_nid(page);
 		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
 			continue;
+		/*
+		 * We cannot modify (split) a THP in pure strict mode; a
+		 * misplaced page has to be reported with -EIO
+		 */
+		if ((flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == MPOL_MF_STRICT)
+			break;
 		if (PageTransCompound(page)) {
 			get_page(page);
 			pte_unmap_unlock(pte, ptl);
@@ -543,8 +572,10 @@ retry:
 			ret = split_huge_page(page);
 			unlock_page(page);
 			put_page(page);
-			/* Failed to split -- skip. */
+			/* Failed to split -- skip, unless in strict mode */
 			if (ret) {
+				if (flags & MPOL_MF_STRICT)
+					return -EIO;
 				pte = pte_offset_map_lock(walk->mm, pmd,
 						addr, &ptl);
 				continue;
@@ -552,11 +583,16 @@ retry:
 			goto retry;
 		}
 
-		migrate_page_add(page, qp->pagelist, flags);
+		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+			if (!vma_migratable(vma))
+				break;
+			migrate_page_add(page, qp->pagelist, flags);
+		} else
+			break;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
-	return 0;
+	return addr != end ? -EIO : 0;
 }
 
 static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
@@ -628,7 +664,12 @@ static int queue_pages_test_walk(unsigne
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
-	if (!vma_migratable(vma))
+	/*
+	 * Need to check MPOL_MF_STRICT to return -EIO if possible,
+	 * regardless of vma_migratable
+	 */
+	if (!vma_migratable(vma) &&
+	    !(flags & MPOL_MF_STRICT))
 		return 1;
 
 	if (endvma > end)
@@ -655,7 +696,7 @@ static int queue_pages_test_walk(unsigne
 	}
 
 	/* queue pages from current vma */
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & MPOL_MF_VALID)
 		return 0;
 	return 1;
 }
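
For context, these callbacks are wired up in queue_pages_range(), which
this patch leaves untouched; a rough sketch of that caller (not part of
the diff) is shown below. The generic page walker skips a vma when
test_walk returns a positive value, walks it on 0, and aborts on a
negative value, propagating the error back -- which is how the new
-EIO reaches the mbind() caller:

	struct mm_walk queue_pages_walk = {
		.hugetlb_entry = queue_pages_hugetlb,
		.pmd_entry = queue_pages_pte_range,
		.test_walk = queue_pages_test_walk,
		.mm = mm,
		.private = &qp,
	};

	/* a negative return from any callback ends up here */
	return walk_page_range(start, end, &queue_pages_walk);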