Blob Blame History Raw
From: Mike Marciniszyn <mike.marciniszyn@intel.com>
Date: Mon, 24 Jul 2017 07:45:31 -0700
Subject: IB/hfi1: Fix bar0 mapping to use write combining
Patch-mainline: v4.14-rc1
Git-commit: cb51c5d2cda855302910ab352f3d391c1a00aba0
References: bsc#1060463 FATE#323043

When the debugpat kernel boot flag is turned on the following
traces are printed:

[ 1884.793168] x86/PAT: Overlap at 0x90000000-0x92000000
[ 1884.803510] x86/PAT: reserve_memtype added [mem 0x91200000-0x9127ffff],
track uncached-minus, req write-combining, ret uncached-minus
[ 1884.818167] hfi1 0000:05:00.0: hfi1_0: WC Remapped RcvArray:
ffffc9000a980000

The ioremap_wc() clearly is not returning a write combining mapping due
to an overlap where the RcvArray is mapped in an uncached mapping prior
to creating the proposed write combining mapping.

The patch replaces the single base register for uncached CSRs that
used to overlap the RcvArray with two mappings: one, kregbase1, from
bar0 up to the RcvArray, and another, kregbase2, from the end of the
RcvArray to the pio send buffer space. A new dd field, base2_start,
is used to convert the zero-based offset in the CSR routines to the
correct kregbase1/kregbase2 mapping. A single direct write of the
RcvArray CSRs is replaced with hfi1_put_tid() to ensure correct access
using the new disjoint mapping.

Additionally, the kregend field is deleted since it is only ever written.

debugpat now shows the RcvArray as write combining:
[   35.688990] x86/PAT: reserve_memtype added [mem 0x91200000-0x9127ffff],
track write-combining, req write-combining, ret write-combining

To insulate from any potential issues with write combining, all
writeq are now flushed in hfi1_put_tid() and rcv_array_wc_fill().

Reviewed-by: Mitko Haralanov <mitko.haralanov@intel.com>
Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/infiniband/hw/hfi1/chip.c         |   79 +++++++++++++++++++++++-------
 drivers/infiniband/hw/hfi1/chip.h         |    4 -
 drivers/infiniband/hw/hfi1/driver.c       |    4 -
 drivers/infiniband/hw/hfi1/exp_rcv.h      |    5 +
 drivers/infiniband/hw/hfi1/file_ops.c     |    2 
 drivers/infiniband/hw/hfi1/hfi.h          |   20 ++++---
 drivers/infiniband/hw/hfi1/pcie.c         |   58 +++++++++++++++-------
 drivers/infiniband/hw/hfi1/user_exp_rcv.c |    3 -
 8 files changed, 126 insertions(+), 49 deletions(-)

--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1297,25 +1297,71 @@ CNTR_ELEM(#name, \
 	  CNTR_SYNTH, \
 	  access_ibp_##cntr)
 
+/**
+ * hfi1_addr_from_offset - return addr for readq/writeq
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * This routine selects the appropriate base address
+ * based on the indicated offset.
+ */
+static inline void __iomem *hfi1_addr_from_offset(
+	const struct hfi1_devdata *dd,
+	u32 offset)
+{
+	if (offset >= dd->base2_start)
+		return dd->kregbase2 + (offset - dd->base2_start);
+	return dd->kregbase1 + offset;
+}
+
+/**
+ * read_csr - read CSR at the indicated offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * Return: the value read or all FF's if there
+ * is no mapping
+ */
 u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
 {
-	if (dd->flags & HFI1_PRESENT) {
-		return readq((void __iomem *)dd->kregbase + offset);
-	}
+	if (dd->flags & HFI1_PRESENT)
+		return readq(hfi1_addr_from_offset(dd, offset));
 	return -1;
 }
 
+/**
+ * write_csr - write CSR at the indicated offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ * @value - value to write
+ */
 void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
 {
-	if (dd->flags & HFI1_PRESENT)
-		writeq(value, (void __iomem *)dd->kregbase + offset);
+	if (dd->flags & HFI1_PRESENT) {
+		void __iomem *base = hfi1_addr_from_offset(dd, offset);
+
+		/* avoid write to RcvArray */
+		if (WARN_ON(offset >= RCV_ARRAY && offset < dd->base2_start))
+			return;
+		writeq(value, base);
+	}
 }
 
+/**
+ * get_csr_addr - return the iomem address for offset
+ * @dd - the dd device
+ * @offset - the offset of the CSR within bar0
+ *
+ * Return: The iomem address to use in subsequent
+ * writeq/readq operations.
+ */
 void __iomem *get_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	u32 offset)
 {
-	return (void __iomem *)dd->kregbase + offset;
+	if (dd->flags & HFI1_PRESENT)
+		return hfi1_addr_from_offset(dd, offset);
+	return NULL;
 }
 
 static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
@@ -9752,14 +9798,13 @@ void hfi1_put_tid(struct hfi1_devdata *d
 		  u32 type, unsigned long pa, u16 order)
 {
 	u64 reg;
-	void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
-			      (dd->kregbase + RCV_ARRAY));
 
 	if (!(dd->flags & HFI1_PRESENT))
 		goto done;
 
-	if (type == PT_INVALID) {
+	if (type == PT_INVALID || type == PT_INVALID_FLUSH) {
 		pa = 0;
+		order = 0;
 	} else if (type > PT_INVALID) {
 		dd_dev_err(dd,
 			   "unexpected receive array type %u for index %u, not handled\n",
@@ -9773,13 +9818,14 @@ void hfi1_put_tid(struct hfi1_devdata *d
 		| (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
 		| ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
 					<< RCV_ARRAY_RT_ADDR_SHIFT;
-	trace_hfi1_write_rcvarray(base + (index * 8), reg);
-	writeq(reg, base + (index * 8));
+	trace_hfi1_write_rcvarray(dd->rcvarray_wc + (index * 8), reg);
+	writeq(reg, dd->rcvarray_wc + (index * 8));
 
-	if (type == PT_EAGER)
+	if (type == PT_EAGER || type == PT_INVALID_FLUSH || (index & 3) == 3)
 		/*
-		 * Eager entries are written one-by-one so we have to push them
-		 * after we write the entry.
+		 * Eager entries are written and flushed
+		 *
+		 * Expected entries are flushed every 4 writes
 		 */
 		flush_wc();
 done:
@@ -13411,8 +13457,7 @@ static void write_uninitialized_csrs_and
 
 	/* RcvArray */
 	for (i = 0; i < dd->chip_rcv_array_count; i++)
-		write_csr(dd, RCV_ARRAY + (8 * i),
-			  RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+		hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0);
 
 	/* RcvQPMapTable */
 	for (i = 0; i < 32; i++)
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -605,11 +605,11 @@ int read_lcb_csr(struct hfi1_devdata *dd
 int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data);
 
 void __iomem *get_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	u32 offset);
 
 static inline void __iomem *get_kctxt_csr_addr(
-	struct hfi1_devdata *dd,
+	const struct hfi1_devdata *dd,
 	int ctxt,
 	u32 offset0)
 {
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -195,7 +195,7 @@ int hfi1_count_active_units(void)
 
 	spin_lock_irqsave(&hfi1_devs_lock, flags);
 	list_for_each_entry(dd, &hfi1_dev_list, list) {
-		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase)
+		if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
 			continue;
 		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
 			ppd = dd->pport + pidx;
@@ -1282,7 +1282,7 @@ int hfi1_reset_device(int unit)
 
 	dd_dev_info(dd, "Reset on unit %u requested\n", unit);
 
-	if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) {
+	if (!dd->kregbase1 || !(dd->flags & HFI1_PRESENT)) {
 		dd_dev_info(dd,
 			    "Invalid unit number %u or not initialized or not present\n",
 			    unit);
--- a/drivers/infiniband/hw/hfi1/exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.h
@@ -137,8 +137,11 @@ static inline void rcv_array_wc_fill(str
 	 * Doing the WC fill writes only makes sense if the device is
 	 * present and the RcvArray has been mapped as WC memory.
 	 */
-	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
+	if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) {
 		writeq(0, dd->rcvarray_wc + (index * 8));
+		if ((index & 3) == 3)
+			flush_wc();
+	}
 }
 
 static inline void tid_group_add_tail(struct tid_group *grp,
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -181,7 +181,7 @@ static int hfi1_file_open(struct inode *
 					       struct hfi1_devdata,
 					       user_cdev);
 
-	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase))
+	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1))
 		return -EINVAL;
 
 	if (!atomic_inc_not_zero(&dd->user_refcount))
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -867,12 +867,15 @@ struct hfi1_devdata {
 	struct device *diag_device;
 	struct device *ui_device;
 
-	/* mem-mapped pointer to base of chip regs */
-	u8 __iomem *kregbase;
-	/* end of mem-mapped chip space excluding sendbuf and user regs */
-	u8 __iomem *kregend;
-	/* physical address of chip for io_remap, etc. */
+	/* first mapping up to RcvArray */
+	u8 __iomem *kregbase1;
 	resource_size_t physaddr;
+
+	/* second uncached mapping from end of RcvArray to pio send buffers */
+	u8 __iomem *kregbase2;
+	/* for detecting offset above kregbase2 address */
+	u32 base2_start;
+
 	/* Per VL data. Enough for all VLs but not all elements are set/used. */
 	struct per_vl_data vld[PER_VL_SEND_CONTEXTS];
 	/* send context data */
@@ -1236,9 +1239,10 @@ static inline bool hfi1_vnic_is_rsm_full
 #define dc8051_ver_patch(a) ((a) & 0x0000ff)
 
 /* f_put_tid types */
-#define PT_EXPECTED 0
-#define PT_EAGER    1
-#define PT_INVALID  2
+#define PT_EXPECTED       0
+#define PT_EAGER          1
+#define PT_INVALID_FLUSH  2
+#define PT_INVALID        3
 
 struct tid_rb_node;
 struct mmu_rb_node;
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -180,31 +180,47 @@ int hfi1_pcie_ddinit(struct hfi1_devdata
 		return -EINVAL;
 	}
 
-	dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND);
-	if (!dd->kregbase)
+	dd->kregbase1 = ioremap_nocache(addr, RCV_ARRAY);
+	if (!dd->kregbase1) {
+		dd_dev_err(dd, "UC mapping of kregbase1 failed\n");
 		return -ENOMEM;
+	}
+	dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY);
+	dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT);
+	dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count);
+	dd->base2_start  = RCV_ARRAY + dd->chip_rcv_array_count * 8;
+
+	dd->kregbase2 = ioremap_nocache(
+		addr + dd->base2_start,
+		TXE_PIO_SEND - dd->base2_start);
+	if (!dd->kregbase2) {
+		dd_dev_err(dd, "UC mapping of kregbase2 failed\n");
+		goto nomem;
+	}
+	dd_dev_info(dd, "UC base2: %p for %x\n", dd->kregbase2,
+		    TXE_PIO_SEND - dd->base2_start);
 
 	dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE);
 	if (!dd->piobase) {
-		iounmap(dd->kregbase);
-		return -ENOMEM;
+		dd_dev_err(dd, "WC mapping of send buffers failed\n");
+		goto nomem;
 	}
+	dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE);
 
-	dd->flags |= HFI1_PRESENT;	/* now register routines work */
-
-	dd->kregend = dd->kregbase + TXE_PIO_SEND;
 	dd->physaddr = addr;        /* used for io_remap, etc. */
 
 	/*
-	 * Re-map the chip's RcvArray as write-combining to allow us
+	 * Map the chip's RcvArray as write-combining to allow us
 	 * to write an entire cacheline worth of entries in one shot.
-	 * If this re-map fails, just continue - the RcvArray programming
-	 * function will handle both cases.
 	 */
-	dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT);
 	dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY,
 				     dd->chip_rcv_array_count * 8);
-	dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc);
+	if (!dd->rcvarray_wc) {
+		dd_dev_err(dd, "WC mapping of receive array failed\n");
+		goto nomem;
+	}
+	dd_dev_info(dd, "WC RcvArray: %p for %x\n",
+		    dd->rcvarray_wc, dd->chip_rcv_array_count * 8);
 	/*
 	 * Save BARs and command to rewrite after device reset.
 	 */
@@ -253,10 +269,16 @@ int hfi1_pcie_ddinit(struct hfi1_devdata
 	if (ret)
 		goto read_error;
 
+	dd->flags |= HFI1_PRESENT;	/* chip.c CSR routines now work */
 	return 0;
 
 read_error:
 	dd_dev_err(dd, "Unable to read from PCI config\n");
+	goto bail_error;
+nomem:
+	ret = -ENOMEM;
+bail_error:
+	hfi1_pcie_ddcleanup(dd);
 	return ret;
 }
 
@@ -267,15 +289,19 @@ read_error:
  */
 void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd)
 {
-	u64 __iomem *base = (void __iomem *)dd->kregbase;
-
 	dd->flags &= ~HFI1_PRESENT;
-	dd->kregbase = NULL;
-	iounmap(base);
+	if (dd->kregbase1)
+		iounmap(dd->kregbase1);
+	dd->kregbase1 = NULL;
+	if (dd->kregbase2)
+		iounmap(dd->kregbase2);
+	dd->kregbase2 = NULL;
 	if (dd->rcvarray_wc)
 		iounmap(dd->rcvarray_wc);
+	dd->rcvarray_wc = NULL;
 	if (dd->piobase)
 		iounmap(dd->piobase);
+	dd->piobase = NULL;
 }
 
 /* return the PCIe link speed from the given link status */
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -814,12 +814,11 @@ static void clear_tid_node(struct hfi1_f
 				 node->npages, node->mmu.addr, node->phys,
 				 node->dma_addr);
 
-	hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
 	/*
 	 * Make sure device has seen the write before we unpin the
 	 * pages.
 	 */
-	flush_wc();
+	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 
 	pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
 			 PCI_DMA_FROMDEVICE);