Blob Blame History Raw
From a9c171527a3403cae6c1907744b1bc9ca301f912 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Tue, 20 Jul 2021 13:42:04 -0700
Subject: [PATCH] dmaengine: idxd: rotate portal address for better performance
Git-commit: a9c171527a3403cae6c1907744b1bc9ca301f912
Patch-mainline: v5.15-rc1
References: jsc#SLE-18899

The device submission portal is on a 4k page and any of those 64bit aligned
address on the page can be used for descriptor submission. By rotating the
offset through the 4k range and prevent successive writes to the same MMIO
address, performance improvement is observed through testing.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/162681372446.1968485.10634280461681015569.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
Acked-by: Takashi Iwai <tiwai@suse.de>

---
 drivers/dma/idxd/device.c |    1 +
 drivers/dma/idxd/idxd.h   |   20 ++++++++++++++++++++
 drivers/dma/idxd/submit.c |    2 +-
 3 files changed, 22 insertions(+), 1 deletion(-)

--- a/drivers/dma/idxd/device.c
+++ b/drivers/dma/idxd/device.c
@@ -320,6 +320,7 @@ void idxd_wq_unmap_portal(struct idxd_wq
 
 	devm_iounmap(dev, wq->portal);
 	wq->portal = NULL;
+	wq->portal_offset = 0;
 }
 
 void idxd_wqs_unmap_portal(struct idxd_device *idxd)
--- a/drivers/dma/idxd/idxd.h
+++ b/drivers/dma/idxd/idxd.h
@@ -11,6 +11,7 @@
 #include <linux/idr.h>
 #include <linux/pci.h>
 #include <linux/perf_event.h>
+#include <uapi/linux/idxd.h>
 #include "registers.h"
 
 #define IDXD_DRIVER_VERSION	"1.00"
@@ -162,6 +163,7 @@ struct idxd_dma_chan {
 
 struct idxd_wq {
 	void __iomem *portal;
+	u32 portal_offset;
 	struct percpu_ref wq_active;
 	struct completion wq_dead;
 	struct idxd_dev idxd_dev;
@@ -468,6 +470,24 @@ static inline int idxd_get_wq_portal_ful
 	return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot);
 }
 
+#define IDXD_PORTAL_MASK	(PAGE_SIZE - 1)
+
+/*
+ * Even though this function can be accessed by multiple threads, it is safe to use.
+ * At worst the address gets used more than once before it gets incremented. We don't
+ * hit a threshold until iops becomes many million times a second. So the occasional
+ * reuse of the same address is tolerable compare to using an atomic variable. This is
+ * safe on a system that has atomic load/store for 32bit integers. Given that this is an
+ * Intel iEP device, that should not be a problem.
+ */
+static inline void __iomem *idxd_wq_portal_addr(struct idxd_wq *wq)
+{
+	int ofs = wq->portal_offset;
+
+	wq->portal_offset = (ofs + sizeof(struct dsa_raw_desc)) & IDXD_PORTAL_MASK;
+	return wq->portal + ofs;
+}
+
 static inline void idxd_wq_get(struct idxd_wq *wq)
 {
 	wq->client_count++;
--- a/drivers/dma/idxd/submit.c
+++ b/drivers/dma/idxd/submit.c
@@ -146,7 +146,7 @@ int idxd_submit_desc(struct idxd_wq *wq,
 	if (!percpu_ref_tryget_live(&wq->wq_active))
 		return -ENXIO;
 
-	portal = wq->portal;
+	portal = idxd_wq_portal_addr(wq);
 
 	/*
 	 * The wmb() flushes writes to coherent DMA data before