Takashi Iwai 6592ef
From 48d8476b41eed63567dd2f0ad125c895b9ac648a Mon Sep 17 00:00:00 2001
Takashi Iwai 6592ef
From: Alex Williamson <alex.williamson@redhat.com>
Takashi Iwai 6592ef
Date: Fri, 11 May 2018 09:05:02 -0600
Takashi Iwai 6592ef
Subject: [PATCH] vfio/type1: Fix task tracking for QEMU vCPU hotplug
Takashi Iwai 6592ef
Git-commit: 48d8476b41eed63567dd2f0ad125c895b9ac648a
Takashi Iwai 6592ef
Patch-mainline: v4.18-rc1
Takashi Iwai 6592ef
References: bsc#1051510
Takashi Iwai 6592ef
Takashi Iwai 6592ef
MAP_DMA ioctls might be called from various threads within a process,
Takashi Iwai 6592ef
for example when using QEMU, the vCPU threads are often generating
Takashi Iwai 6592ef
these calls and we therefore take a reference to that vCPU task.
Takashi Iwai 6592ef
However, QEMU also supports vCPU hotplug on some machines and the task
Takashi Iwai 6592ef
that called MAP_DMA may have exited by the time UNMAP_DMA is called,
Takashi Iwai 6592ef
resulting in the mm_struct pointer being NULL and thus a failure to
Takashi Iwai 6592ef
match against the existing mapping.
Takashi Iwai 6592ef
Takashi Iwai 6592ef
To resolve this, we instead take a reference to the thread
Takashi Iwai 6592ef
group_leader, which has the same mm_struct and resource limits, but
Takashi Iwai 6592ef
is less likely exit, at least in the QEMU case.  A difficulty here is
Takashi Iwai 6592ef
guaranteeing that the capabilities of the group_leader match that of
Takashi Iwai 6592ef
the calling thread, which we resolve by tracking CAP_IPC_LOCK at the
Takashi Iwai 6592ef
time of calling rather than at an indeterminate time in the future.
Takashi Iwai 6592ef
Potentially this also results in better efficiency as this is now
Takashi Iwai 6592ef
recorded once per MAP_DMA ioctl.
Takashi Iwai 6592ef
Takashi Iwai 6592ef
Reported-by: Xu Yandong <xuyandong2@huawei.com>
Takashi Iwai 6592ef
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Takashi Iwai 6592ef
Acked-by: Takashi Iwai <tiwai@suse.de>
Takashi Iwai 6592ef
Takashi Iwai 6592ef
---
Takashi Iwai 6592ef
 drivers/vfio/vfio_iommu_type1.c |   73 +++++++++++++++++++++++++---------------
Takashi Iwai 6592ef
 1 file changed, 47 insertions(+), 26 deletions(-)
Takashi Iwai 6592ef
Takashi Iwai 6592ef
--- a/drivers/vfio/vfio_iommu_type1.c
Takashi Iwai 6592ef
+++ b/drivers/vfio/vfio_iommu_type1.c
Takashi Iwai 6592ef
@@ -83,6 +83,7 @@ struct vfio_dma {
Takashi Iwai 6592ef
 	size_t			size;		/* Map size (bytes) */
Takashi Iwai 6592ef
 	int			prot;		/* IOMMU_READ/WRITE */
Takashi Iwai 6592ef
 	bool			iommu_mapped;
Takashi Iwai 6592ef
+	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
Takashi Iwai 6592ef
 	struct task_struct	*task;
Takashi Iwai 6592ef
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
Takashi Iwai 6592ef
 };
Takashi Iwai 6592ef
@@ -246,29 +247,25 @@ static int vfio_iova_put_vfio_pfn(struct
Takashi Iwai 6592ef
 	return ret;
Takashi Iwai 6592ef
 }
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
-static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap)
Takashi Iwai 6592ef
+static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
Takashi Iwai 6592ef
 {
Takashi Iwai 6592ef
 	struct mm_struct *mm;
Takashi Iwai 6592ef
-	bool is_current;
Takashi Iwai 6592ef
 	int ret;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	if (!npage)
Takashi Iwai 6592ef
 		return 0;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
-	is_current = (task->mm == current->mm);
Takashi Iwai 6592ef
-
Takashi Iwai 6592ef
-	mm = is_current ? task->mm : get_task_mm(task);
Takashi Iwai 6592ef
+	mm = async ? get_task_mm(dma->task) : dma->task->mm;
Takashi Iwai 6592ef
 	if (!mm)
Takashi Iwai 6592ef
 		return -ESRCH; /* process exited */
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	ret = down_write_killable(&mm->mmap_sem);
Takashi Iwai 6592ef
 	if (!ret) {
Takashi Iwai 6592ef
 		if (npage > 0) {
Takashi Iwai 6592ef
-			if (lock_cap ? !*lock_cap :
Takashi Iwai 6592ef
-			    !has_capability(task, CAP_IPC_LOCK)) {
Takashi Iwai 6592ef
+			if (!dma->lock_cap) {
Takashi Iwai 6592ef
 				unsigned long limit;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
-				limit = task_rlimit(task,
Takashi Iwai 6592ef
+				limit = task_rlimit(dma->task,
Takashi Iwai 6592ef
 						RLIMIT_MEMLOCK) >> PAGE_SHIFT;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 				if (mm->locked_vm + npage > limit)
Takashi Iwai 6592ef
@@ -282,7 +279,7 @@ static int vfio_lock_acct(struct task_st
Takashi Iwai 6592ef
 		up_write(&mm->mmap_sem);
Takashi Iwai 6592ef
 	}
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
-	if (!is_current)
Takashi Iwai 6592ef
+	if (async)
Takashi Iwai 6592ef
 		mmput(mm);
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	return ret;
Takashi Iwai 6592ef
@@ -391,7 +388,7 @@ static int vaddr_get_pfn(struct mm_struc
Takashi Iwai 6592ef
  */
Takashi Iwai 6592ef
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
Takashi Iwai 6592ef
 				  long npage, unsigned long *pfn_base,
Takashi Iwai 6592ef
-				  bool lock_cap, unsigned long limit)
Takashi Iwai 6592ef
+				  unsigned long limit)
Takashi Iwai 6592ef
 {
Takashi Iwai 6592ef
 	unsigned long pfn = 0;
Takashi Iwai 6592ef
 	long ret, pinned = 0, lock_acct = 0;
Takashi Iwai 6592ef
@@ -414,7 +411,7 @@ static long vfio_pin_pages_remote(struct
Takashi Iwai 6592ef
 	 * pages are already counted against the user.
Takashi Iwai 6592ef
 	 */
Takashi Iwai 6592ef
 	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
Takashi Iwai 6592ef
-		if (!lock_cap && current->mm->locked_vm + 1 > limit) {
Takashi Iwai 6592ef
+		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
Takashi Iwai 6592ef
 			put_pfn(*pfn_base, dma->prot);
Takashi Iwai 6592ef
 			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
Takashi Iwai 6592ef
 					limit << PAGE_SHIFT);
Takashi Iwai 6592ef
@@ -440,7 +437,7 @@ static long vfio_pin_pages_remote(struct
Takashi Iwai 6592ef
 		}
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
Takashi Iwai 6592ef
-			if (!lock_cap &&
Takashi Iwai 6592ef
+			if (!dma->lock_cap &&
Takashi Iwai 6592ef
 			    current->mm->locked_vm + lock_acct + 1 > limit) {
Takashi Iwai 6592ef
 				put_pfn(pfn, dma->prot);
Takashi Iwai 6592ef
 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
Takashi Iwai 6592ef
@@ -453,7 +450,7 @@ static long vfio_pin_pages_remote(struct
Takashi Iwai 6592ef
 	}
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 out:
Takashi Iwai 6592ef
-	ret = vfio_lock_acct(current, lock_acct, &lock_cap);
Takashi Iwai 6592ef
+	ret = vfio_lock_acct(dma, lock_acct, false);
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 unpin_out:
Takashi Iwai 6592ef
 	if (ret) {
Takashi Iwai 6592ef
@@ -484,7 +481,7 @@ static long vfio_unpin_pages_remote(stru
Takashi Iwai 6592ef
 	}
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	if (do_accounting)
Takashi Iwai 6592ef
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
Takashi Iwai 6592ef
+		vfio_lock_acct(dma, locked - unlocked, true);
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	return unlocked;
Takashi Iwai 6592ef
 }
Takashi Iwai 6592ef
@@ -501,7 +498,7 @@ static int vfio_pin_page_external(struct
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
Takashi Iwai 6592ef
 	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
Takashi Iwai 6592ef
-		ret = vfio_lock_acct(dma->task, 1, NULL);
Takashi Iwai 6592ef
+		ret = vfio_lock_acct(dma, 1, true);
Takashi Iwai 6592ef
 		if (ret) {
Takashi Iwai 6592ef
 			put_pfn(*pfn_base, dma->prot);
Takashi Iwai 6592ef
 			if (ret == -ENOMEM)
Takashi Iwai 6592ef
@@ -528,7 +525,7 @@ static int vfio_unpin_page_external(stru
Takashi Iwai 6592ef
 	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	if (do_accounting)
Takashi Iwai 6592ef
-		vfio_lock_acct(dma->task, -unlocked, NULL);
Takashi Iwai 6592ef
+		vfio_lock_acct(dma, -unlocked, true);
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	return unlocked;
Takashi Iwai 6592ef
 }
Takashi Iwai 6592ef
@@ -723,7 +720,7 @@ static long vfio_unmap_unpin(struct vfio
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	dma->iommu_mapped = false;
Takashi Iwai 6592ef
 	if (do_accounting) {
Takashi Iwai 6592ef
-		vfio_lock_acct(dma->task, -unlocked, NULL);
Takashi Iwai 6592ef
+		vfio_lock_acct(dma, -unlocked, true);
Takashi Iwai 6592ef
 		return 0;
Takashi Iwai 6592ef
 	}
Takashi Iwai 6592ef
 	return unlocked;
Takashi Iwai 6592ef
@@ -935,14 +932,12 @@ static int vfio_pin_map_dma(struct vfio_
Takashi Iwai 6592ef
 	size_t size = map_size;
Takashi Iwai 6592ef
 	long npage;
Takashi Iwai 6592ef
 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
Takashi Iwai 6592ef
-	bool lock_cap = capable(CAP_IPC_LOCK);
Takashi Iwai 6592ef
 	int ret = 0;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	while (size) {
Takashi Iwai 6592ef
 		/* Pin a contiguous chunk of memory */
Takashi Iwai 6592ef
 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
Takashi Iwai 6592ef
-					      size >> PAGE_SHIFT, &pfn,
Takashi Iwai 6592ef
-					      lock_cap, limit);
Takashi Iwai 6592ef
+					      size >> PAGE_SHIFT, &pfn, limit);
Takashi Iwai 6592ef
 		if (npage <= 0) {
Takashi Iwai 6592ef
 			WARN_ON(!npage);
Takashi Iwai 6592ef
 			ret = (int)npage;
Takashi Iwai 6592ef
@@ -1017,8 +1012,36 @@ static int vfio_dma_do_map(struct vfio_i
Takashi Iwai 6592ef
 	dma->iova = iova;
Takashi Iwai 6592ef
 	dma->vaddr = vaddr;
Takashi Iwai 6592ef
 	dma->prot = prot;
Takashi Iwai 6592ef
-	get_task_struct(current);
Takashi Iwai 6592ef
-	dma->task = current;
Takashi Iwai 6592ef
+
Takashi Iwai 6592ef
+	/*
Takashi Iwai 6592ef
+	 * We need to be able to both add to a task's locked memory and test
Takashi Iwai 6592ef
+	 * against the locked memory limit and we need to be able to do both
Takashi Iwai 6592ef
+	 * outside of this call path as pinning can be asynchronous via the
Takashi Iwai 6592ef
+	 * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
Takashi Iwai 6592ef
+	 * task_struct and VM locked pages requires an mm_struct, however
Takashi Iwai 6592ef
+	 * holding an indefinite mm reference is not recommended, therefore we
Takashi Iwai 6592ef
+	 * only hold a reference to a task.  We could hold a reference to
Takashi Iwai 6592ef
+	 * current, however QEMU uses this call path through vCPU threads,
Takashi Iwai 6592ef
+	 * which can be killed resulting in a NULL mm and failure in the unmap
Takashi Iwai 6592ef
+	 * path when called via a different thread.  Avoid this problem by
Takashi Iwai 6592ef
+	 * using the group_leader as threads within the same group require
Takashi Iwai 6592ef
+	 * both CLONE_THREAD and CLONE_VM and will therefore use the same
Takashi Iwai 6592ef
+	 * mm_struct.
Takashi Iwai 6592ef
+	 *
Takashi Iwai 6592ef
+	 * Previously we also used the task for testing CAP_IPC_LOCK at the
Takashi Iwai 6592ef
+	 * time of pinning and accounting, however has_capability() makes use
Takashi Iwai 6592ef
+	 * of real_cred, a copy-on-write field, so we can't guarantee that it
Takashi Iwai 6592ef
+	 * matches group_leader, or in fact that it might not change by the
Takashi Iwai 6592ef
+	 * time it's evaluated.  If a process were to call MAP_DMA with
Takashi Iwai 6592ef
+	 * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
Takashi Iwai 6592ef
+	 * possibly see different results for an iommu_mapped vfio_dma vs
Takashi Iwai 6592ef
+	 * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
Takashi Iwai 6592ef
+	 * time of calling MAP_DMA.
Takashi Iwai 6592ef
+	 */
Takashi Iwai 6592ef
+	get_task_struct(current->group_leader);
Takashi Iwai 6592ef
+	dma->task = current->group_leader;
Takashi Iwai 6592ef
+	dma->lock_cap = capable(CAP_IPC_LOCK);
Takashi Iwai 6592ef
+
Takashi Iwai 6592ef
 	dma->pfn_list = RB_ROOT;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	/* Insert zero-sized and grow as we map chunks of it */
Takashi Iwai 6592ef
@@ -1053,7 +1076,6 @@ static int vfio_iommu_replay(struct vfio
Takashi Iwai 6592ef
 	struct vfio_domain *d;
Takashi Iwai 6592ef
 	struct rb_node *n;
Takashi Iwai 6592ef
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
Takashi Iwai 6592ef
-	bool lock_cap = capable(CAP_IPC_LOCK);
Takashi Iwai 6592ef
 	int ret;
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 	/* Arbitrarily pick the first domain in the list for lookups */
Takashi Iwai 6592ef
@@ -1100,8 +1122,7 @@ static int vfio_iommu_replay(struct vfio
Takashi Iwai 6592ef
 
Takashi Iwai 6592ef
 				npage = vfio_pin_pages_remote(dma, vaddr,
Takashi Iwai 6592ef
 							      n >> PAGE_SHIFT,
Takashi Iwai 6592ef
-							      &pfn, lock_cap,
Takashi Iwai 6592ef
-							      limit);
Takashi Iwai 6592ef
+							      &pfn, limit);
Takashi Iwai 6592ef
 				if (npage <= 0) {
Takashi Iwai 6592ef
 					WARN_ON(!npage);
Takashi Iwai 6592ef
 					ret = (int)npage;
Takashi Iwai 6592ef
@@ -1370,7 +1391,7 @@ static void vfio_iommu_unmap_unpin_reacc
Takashi Iwai 6592ef
 			if (!is_invalid_reserved_pfn(vpfn->pfn))
Takashi Iwai 6592ef
 				locked++;
Takashi Iwai 6592ef
 		}
Takashi Iwai 6592ef
-		vfio_lock_acct(dma->task, locked - unlocked, NULL);
Takashi Iwai 6592ef
+		vfio_lock_acct(dma, locked - unlocked, true);
Takashi Iwai 6592ef
 	}
Takashi Iwai 6592ef
 }
Takashi Iwai 6592ef