Blob Blame History Raw
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Thu, 15 Mar 2018 17:27:46 -0400
Subject: drm/amdkfd: Remove limit on number of GPUs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Git-commit: c7bcbfa4f8d1e0e1078adfe959d4b65542bccf66
Patch-mainline: v4.17-rc1
References: FATE#326289 FATE#326079 FATE#326049 FATE#322398 FATE#326166

Currently the number of GPUs is limited by aperture placement options
available on GFX7 and GFX8 hardware. This limitation is not necessary.
Scratch and LDS represent per-work-item and per-work-group storage
respectively. Different work-items and work-groups use the same virtual
address to access their own data. Work running on different GPUs is by
definition in different work-groups (different dispatches, in fact).
That means the same virtual addresses can be used for these apertures
on different GPUs.

Add a new AMDKFD_IOC_GET_PROCESS_APERTURES_NEW ioctl that removes the
artificial limitation on the number of GPUs that can be supported. The
new ioctl allows user mode to query the number of GPUs to allocate
enough memory for all GPUs to be reported.

This deprecates AMDKFD_IOC_GET_PROCESS_APERTURES.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Acked-by: Petr Tesarik <ptesarik@suse.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c     |   94 +++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c |   22 ++----
 include/uapi/linux/kfd_ioctl.h               |   27 ++++++-
 3 files changed, 128 insertions(+), 15 deletions(-)

--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -825,6 +825,97 @@ static int kfd_ioctl_get_process_apertur
 	return 0;
 }
 
+static int kfd_ioctl_get_process_apertures_new(struct file *filp,
+				struct kfd_process *p, void *data)
+{
+	struct kfd_ioctl_get_process_apertures_new_args *args = data;
+	struct kfd_process_device_apertures *pa;
+	struct kfd_process_device *pdd;
+	uint32_t nodes = 0;
+	int ret;
+
+	dev_dbg(kfd_device, "get apertures for PASID %d", p->pasid);
+
+	if (args->num_of_nodes == 0) {
+		/* Return number of nodes, so that user space can alloacate
+		 * sufficient memory
+		 */
+		mutex_lock(&p->mutex);
+
+		if (!kfd_has_process_device_data(p))
+			goto out_unlock;
+
+		/* Run over all pdd of the process */
+		pdd = kfd_get_first_process_device_data(p);
+		do {
+			args->num_of_nodes++;
+			pdd = kfd_get_next_process_device_data(p, pdd);
+		} while (pdd);
+
+		goto out_unlock;
+	}
+
+	/* Fill in process-aperture information for all available
+	 * nodes, but not more than args->num_of_nodes as that is
+	 * the amount of memory allocated by user
+	 */
+	pa = kzalloc((sizeof(struct kfd_process_device_apertures) *
+				args->num_of_nodes), GFP_KERNEL);
+	if (!pa)
+		return -ENOMEM;
+
+	mutex_lock(&p->mutex);
+
+	if (!kfd_has_process_device_data(p)) {
+		args->num_of_nodes = 0;
+		kfree(pa);
+		goto out_unlock;
+	}
+
+	/* Run over all pdd of the process */
+	pdd = kfd_get_first_process_device_data(p);
+	do {
+		pa[nodes].gpu_id = pdd->dev->id;
+		pa[nodes].lds_base = pdd->lds_base;
+		pa[nodes].lds_limit = pdd->lds_limit;
+		pa[nodes].gpuvm_base = pdd->gpuvm_base;
+		pa[nodes].gpuvm_limit = pdd->gpuvm_limit;
+		pa[nodes].scratch_base = pdd->scratch_base;
+		pa[nodes].scratch_limit = pdd->scratch_limit;
+
+		dev_dbg(kfd_device,
+			"gpu id %u\n", pdd->dev->id);
+		dev_dbg(kfd_device,
+			"lds_base %llX\n", pdd->lds_base);
+		dev_dbg(kfd_device,
+			"lds_limit %llX\n", pdd->lds_limit);
+		dev_dbg(kfd_device,
+			"gpuvm_base %llX\n", pdd->gpuvm_base);
+		dev_dbg(kfd_device,
+			"gpuvm_limit %llX\n", pdd->gpuvm_limit);
+		dev_dbg(kfd_device,
+			"scratch_base %llX\n", pdd->scratch_base);
+		dev_dbg(kfd_device,
+			"scratch_limit %llX\n", pdd->scratch_limit);
+		nodes++;
+
+		pdd = kfd_get_next_process_device_data(p, pdd);
+	} while (pdd && (nodes < args->num_of_nodes));
+	mutex_unlock(&p->mutex);
+
+	args->num_of_nodes = nodes;
+	ret = copy_to_user(
+			(void __user *)args->kfd_process_device_apertures_ptr,
+			pa,
+			(nodes * sizeof(struct kfd_process_device_apertures)));
+	kfree(pa);
+	return ret ? -EFAULT : 0;
+
+out_unlock:
+	mutex_unlock(&p->mutex);
+	return 0;
+}
+
 static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
 					void *data)
 {
@@ -1017,6 +1108,9 @@ static const struct amdkfd_ioctl_desc am
 
 	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_TRAP_HANDLER,
 			kfd_ioctl_set_trap_handler, 0),
+
+	AMDKFD_IOCTL_DEF(AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
+			kfd_ioctl_get_process_apertures_new, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -282,14 +282,14 @@
 	(((uint64_t)(base) & \
 		0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)
 
-#define MAKE_SCRATCH_APP_BASE(gpu_num) \
-	(((uint64_t)(gpu_num) << 61) + 0x100000000L)
+#define MAKE_SCRATCH_APP_BASE() \
+	(((uint64_t)(0x1UL) << 61) + 0x100000000L)
 
 #define MAKE_SCRATCH_APP_LIMIT(base) \
 	(((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
 
-#define MAKE_LDS_APP_BASE(gpu_num) \
-	(((uint64_t)(gpu_num) << 61) + 0x0)
+#define MAKE_LDS_APP_BASE() \
+	(((uint64_t)(0x1UL) << 61) + 0x0)
 #define MAKE_LDS_APP_LIMIT(base) \
 	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
 
@@ -314,7 +314,7 @@ int kfd_init_apertures(struct kfd_proces
 			return -1;
 		}
 		/*
-		 * For 64 bit process aperture will be statically reserved in
+		 * For 64 bit process apertures will be statically reserved in
 		 * the x86_64 non canonical process address space
 		 * amdkfd doesn't currently support apertures for 32 bit process
 		 */
@@ -323,12 +323,11 @@ int kfd_init_apertures(struct kfd_proces
 			pdd->gpuvm_base = pdd->gpuvm_limit = 0;
 			pdd->scratch_base = pdd->scratch_limit = 0;
 		} else {
-			/*
-			 * node id couldn't be 0 - the three MSB bits of
-			 * aperture shoudn't be 0
+			/* Same LDS and scratch apertures can be used
+			 * on all GPUs. This allows using more dGPUs
+			 * than placement options for apertures.
 			 */
-			pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
-
+			pdd->lds_base = MAKE_LDS_APP_BASE();
 			pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
 
 			pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
@@ -336,8 +335,7 @@ int kfd_init_apertures(struct kfd_proces
 			pdd->gpuvm_limit =
 					MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
 
-			pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
-
+			pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
 			pdd->scratch_limit =
 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
 		}
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -107,8 +107,6 @@ struct kfd_ioctl_get_clock_counters_args
 	__u32 pad;
 };
 
-#define NUM_OF_SUPPORTED_GPUS 7
-
 struct kfd_process_device_apertures {
 	__u64 lds_base;		/* from KFD */
 	__u64 lds_limit;		/* from KFD */
@@ -120,6 +118,12 @@ struct kfd_process_device_apertures {
 	__u32 pad;
 };
 
+/*
+ * AMDKFD_IOC_GET_PROCESS_APERTURES is deprecated. Use
+ * AMDKFD_IOC_GET_PROCESS_APERTURES_NEW instead, which supports an
+ * unlimited number of GPUs.
+ */
+#define NUM_OF_SUPPORTED_GPUS 7
 struct kfd_ioctl_get_process_apertures_args {
 	struct kfd_process_device_apertures
 			process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
@@ -129,6 +133,19 @@ struct kfd_ioctl_get_process_apertures_a
 	__u32 pad;
 };
 
+struct kfd_ioctl_get_process_apertures_new_args {
+	/* User allocated. Pointer to struct kfd_process_device_apertures
+	 * filled in by Kernel
+	 */
+	__u64 kfd_process_device_apertures_ptr;
+	/* to KFD - indicates amount of memory present in
+	 *  kfd_process_device_apertures_ptr
+	 * from KFD - Number of entries filled by KFD.
+	 */
+	__u32 num_of_nodes;
+	__u32 pad;
+};
+
 #define MAX_ALLOWED_NUM_POINTS    100
 #define MAX_ALLOWED_AW_BUFF_SIZE 4096
 #define MAX_ALLOWED_WAC_BUFF_SIZE  128
@@ -332,7 +349,11 @@ struct kfd_ioctl_set_trap_handler_args {
 #define AMDKFD_IOC_SET_TRAP_HANDLER		\
 		AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args)
 
+#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW	\
+		AMDKFD_IOWR(0x14,		\
+			struct kfd_ioctl_get_process_apertures_new_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x14
+#define AMDKFD_COMMAND_END		0x15
 
 #endif