Blob Blame History Raw
From: James Smart <jsmart2021@gmail.com>
Date: Fri, 10 Sep 2021 16:31:58 -0700
Subject: scsi: lpfc: Improve PBDE checks during SGL processing
Patch-mainline: v5.16-rc1
Git-commit: 315b3fd13521fb44d97572eb24b7c8670ec7fd97
References: bsc#1190576

The PBDE feature, setting payload buffer address explicitly in the WQE so
it doesn't have to be fetched from the SGL, only makes sense when there is
a single buffer for the I/O. When there are multiple buffers it actually
hurts performance as the SGL subsequently has to be fetched.

Rework the SGL logic to only use PBDE when a single buffer.

Link: https://lore.kernel.org/r/20210910233159.115896-14-jsmart2021@gmail.com
Co-developed-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: Justin Tee <justin.tee@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Daniel Wagner <dwagner@suse.de>
---
 drivers/scsi/lpfc/lpfc_nvme.c  |   12 +++++------
 drivers/scsi/lpfc/lpfc_nvmet.c |   44 +++++++++++++++++++----------------------
 drivers/scsi/lpfc/lpfc_scsi.c  |   26 ++++++++++++------------
 3 files changed, 40 insertions(+), 42 deletions(-)

--- a/drivers/scsi/lpfc/lpfc_nvme.c
+++ b/drivers/scsi/lpfc/lpfc_nvme.c
@@ -1149,7 +1149,6 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport
 	struct sli4_sge *first_data_sgl;
 	struct ulp_bde64 *bde;
 	dma_addr_t physaddr = 0;
-	uint32_t num_bde = 0;
 	uint32_t dma_len = 0;
 	uint32_t dma_offset = 0;
 	int nseg, i, j;
@@ -1203,7 +1202,7 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport
 			}
 
 			sgl->word2 = 0;
-			if ((num_bde + 1) == nseg) {
+			if (nseg == 1) {
 				bf_set(lpfc_sli4_sge_last, sgl, 1);
 				bf_set(lpfc_sli4_sge_type, sgl,
 				       LPFC_SGE_TYPE_DATA);
@@ -1272,8 +1271,9 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport
 
 			j++;
 		}
-		if (phba->cfg_enable_pbde) {
-			/* Use PBDE support for first SGL only, offset == 0 */
+
+		/* PBDE support for first data SGE only */
+		if (nseg == 1 && phba->cfg_enable_pbde) {
 			/* Words 13-15 */
 			bde = (struct ulp_bde64 *)
 				&wqe->words[13];
@@ -1284,11 +1284,11 @@ lpfc_nvme_prep_io_dma(struct lpfc_vport
 			bde->tus.f.bdeFlags = BUFF_TYPE_BDE_64;
 			bde->tus.w = cpu_to_le32(bde->tus.w);
 
-			/* Word 11 */
+			/* Word 11 - set PBDE bit */
 			bf_set(wqe_pbde, &wqe->generic.wqe_com, 1);
 		} else {
 			memset(&wqe->words[13], 0, (sizeof(uint32_t) * 3));
-			bf_set(wqe_pbde, &wqe->generic.wqe_com, 0);
+			/* Word 11 - PBDE bit disabled by default template */
 		}
 
 	} else {
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -2567,7 +2567,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba
 	struct ulp_bde64 *bde;
 	dma_addr_t physaddr;
 	int i, cnt, nsegs;
-	int do_pbde;
+	bool use_pbde = false;
 	int xc = 1;
 
 	if (!lpfc_is_link_up(phba)) {
@@ -2675,9 +2675,6 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba
 		if (!xc)
 			bf_set(wqe_xc, &wqe->fcp_tsend.wqe_com, 0);
 
-		/* Word 11 - set sup, irsp, irsplen later */
-		do_pbde = 0;
-
 		/* Word 12 */
 		wqe->fcp_tsend.fcp_data_len = rsp->transfer_length;
 
@@ -2755,12 +2752,13 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba
 		if (!xc)
 			bf_set(wqe_xc, &wqe->fcp_treceive.wqe_com, 0);
 
-		/* Word 11 - set pbde later */
-		if (phba->cfg_enable_pbde) {
-			do_pbde = 1;
+		/* Word 11 - check for pbde */
+		if (nsegs == 1 && phba->cfg_enable_pbde) {
+			use_pbde = true;
+			/* Word 11 - PBDE bit already preset by template */
 		} else {
+			/* Overwrite default template setting */
 			bf_set(wqe_pbde, &wqe->fcp_treceive.wqe_com, 0);
-			do_pbde = 0;
 		}
 
 		/* Word 12 */
@@ -2831,7 +2829,6 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba
 			       ((rsp->rsplen >> 2) - 1));
 			memcpy(&wqe->words[16], rsp->rspaddr, rsp->rsplen);
 		}
-		do_pbde = 0;
 
 		/* Word 12 */
 		wqe->fcp_trsp.rsvd_12_15[0] = 0;
@@ -2866,23 +2863,24 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba
 			bf_set(lpfc_sli4_sge_last, sgl, 1);
 		sgl->word2 = cpu_to_le32(sgl->word2);
 		sgl->sge_len = cpu_to_le32(cnt);
-		if (i == 0) {
-			bde = (struct ulp_bde64 *)&wqe->words[13];
-			if (do_pbde) {
-				/* Words 13-15  (PBDE) */
-				bde->addrLow = sgl->addr_lo;
-				bde->addrHigh = sgl->addr_hi;
-				bde->tus.f.bdeSize =
-					le32_to_cpu(sgl->sge_len);
-				bde->tus.f.bdeFlags = BUFF_TYPE_BDE_64;
-				bde->tus.w = cpu_to_le32(bde->tus.w);
-			} else {
-				memset(bde, 0, sizeof(struct ulp_bde64));
-			}
-		}
 		sgl++;
 		ctxp->offset += cnt;
 	}
+
+	bde = (struct ulp_bde64 *)&wqe->words[13];
+	if (use_pbde) {
+		/* decrement sgl ptr backwards once to first data sge */
+		sgl--;
+
+		/* Words 13-15 (PBDE) */
+		bde->addrLow = sgl->addr_lo;
+		bde->addrHigh = sgl->addr_hi;
+		bde->tus.f.bdeSize = le32_to_cpu(sgl->sge_len);
+		bde->tus.f.bdeFlags = BUFF_TYPE_BDE_64;
+		bde->tus.w = cpu_to_le32(bde->tus.w);
+	} else {
+		memset(bde, 0, sizeof(struct ulp_bde64));
+	}
 	ctxp->state = LPFC_NVMET_STE_DATA;
 	ctxp->entry_cnt++;
 	return nvmewqe;
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -3243,7 +3243,6 @@ lpfc_scsi_prep_dma_buf_s4(struct lpfc_hb
 	struct lpfc_vport *vport = phba->pport;
 	union lpfc_wqe128 *wqe = &pwqeq->wqe;
 	dma_addr_t physaddr;
-	uint32_t num_bde = 0;
 	uint32_t dma_len;
 	uint32_t dma_offset = 0;
 	int nseg, i, j;
@@ -3305,7 +3304,7 @@ lpfc_scsi_prep_dma_buf_s4(struct lpfc_hb
 		j = 2;
 		for (i = 0; i < nseg; i++) {
 			sgl->word2 = 0;
-			if ((num_bde + 1) == nseg) {
+			if (nseg == 1) {
 				bf_set(lpfc_sli4_sge_last, sgl, 1);
 				bf_set(lpfc_sli4_sge_type, sgl,
 				       LPFC_SGE_TYPE_DATA);
@@ -3374,13 +3373,15 @@ lpfc_scsi_prep_dma_buf_s4(struct lpfc_hb
 
 			j++;
 		}
-		/*
-		 * Setup the first Payload BDE. For FCoE we just key off
-		 * Performance Hints, for FC we use lpfc_enable_pbde.
-		 * We populate words 13-15 of IOCB/WQE.
+
+		/* PBDE support for first data SGE only.
+		 * For FCoE, we key off Performance Hints.
+		 * For FC, we key off lpfc_enable_pbde.
 		 */
-		if ((phba->sli3_options & LPFC_SLI4_PERFH_ENABLED) ||
-		    phba->cfg_enable_pbde) {
+		if (nseg == 1 &&
+		    ((phba->sli3_options & LPFC_SLI4_PERFH_ENABLED) ||
+		     phba->cfg_enable_pbde)) {
+			/* Words 13-15 */
 			bde = (struct ulp_bde64 *)
 				&wqe->words[13];
 			bde->addrLow = first_data_sgl->addr_lo;
@@ -3390,12 +3391,15 @@ lpfc_scsi_prep_dma_buf_s4(struct lpfc_hb
 			bde->tus.f.bdeFlags = BUFF_TYPE_BDE_64;
 			bde->tus.w = cpu_to_le32(bde->tus.w);
 
+			/* Word 11 - set PBDE bit */
+			bf_set(wqe_pbde, &wqe->generic.wqe_com, 1);
 		} else {
 			memset(&wqe->words[13], 0, (sizeof(uint32_t) * 3));
+			/* Word 11 - PBDE bit disabled by default template */
 		}
 	} else {
 		sgl += 1;
-		/* clear the last flag in the fcp_rsp map entry */
+		/* set the last flag in the fcp_rsp map entry */
 		sgl->word2 = le32_to_cpu(sgl->word2);
 		bf_set(lpfc_sli4_sge_last, sgl, 1);
 		sgl->word2 = cpu_to_le32(sgl->word2);
@@ -3408,10 +3412,6 @@ lpfc_scsi_prep_dma_buf_s4(struct lpfc_hb
 		}
 	}
 
-	/* Word 11 */
-	if (phba->cfg_enable_pbde)
-		bf_set(wqe_pbde, &wqe->generic.wqe_com, 1);
-
 	/*
 	 * Finish initializing those IOCB fields that are dependent on the
 	 * scsi_cmnd request_buffer.  Note that for SLI-2 the bdeSize is