Blob Blame History Raw
From: Shyam Sundar <ssundar@marvell.com>
Date: Tue, 26 Mar 2019 00:38:55 -0700
Subject: scsi: qedf: Cleanup rrq_work after QEDF_CMD_OUTSTANDING is cleared
Git-commit: faea5719d373f38d785961ee724de1b392e869e8
Patch-mainline: v5.2-rc1
References: bsc#1136467 jsc#SLE-4694

Here are the relevant logs for the problem we are solving:

qedf_flush_active_ios:1707]:3: Flush active i/o's num=0x17 fcport=0xffff948168fbcc80 port_id=0x550200 scsi_id=0.
qedf_flush_active_ios:1708]:3: Locking flush mutex.
qedf_flush_active_ios:1758]:3: Not outstanding, xid=0xaaf, cmd_type=3 refcount=1.
qedf_flush_active_ios:1896]:3: Flushed 0x16 I/Os, active=0x1.
qedf_flush_active_ios:1901]:3: Flushed 0x16 I/Os, active=0x1 cnt=60.
qedf_send_rrq:295]:3: Sending RRQ orig io = ffffb48b8f7d7158, orig_xid = 0xaaf
qedf_initiate_els:37]:3: Sending ELS
qedf_initiate_els:68]:3: initiate_els els_req = 0xffffb48b8f6d3098 cb_arg = ffff948fd5e4de80 xid = 4c6
qedf_init_mp_req:2172]:3: Entered.
qedf_init_mp_task:727]:3: Initializing MP task for cmd_type=4
qedf_initiate_els:134]:3: Ringing doorbell for ELS req
qedf_flush_active_ios:1901]:3: Flushed 0x16 I/Os, active=0x2 cnt=20.
qedf_cmd_timeout:96]:3: ELS timeout, xid=0x4c6.
qedf_rrq_compl:186]:3: Entered.
qedf_rrq_compl:204]:3: rrq_compl: orig io = ffffb48b8f7d7158, orig xid = 0xaaf, rrq_xid = 0x4c6, refcount=1
qedf_flush_active_ios:1935]:3: Unlocking flush mutex.
qedf_upload_connection:1579]:3: Uploading connection port_id=550200.

We found an ABTS command for which CMD_OUTSTANDING was cleared (line 3 of
the log above). For this command, a delayed send_rrq was queued, but it
would take 10 secs to execute. Add the capability to detect that (based on
io_req->state, which is being introduced), and attempt to cancel rrq_work.
If we succeed, we drop the reference and free the io_req. If we cannot,
then the ELS will get sent out and we will wait 10 secs for it to complete.

Signed-off-by: Shyam Sundar <ssundar@marvell.com>
Signed-off-by: Saurav Kashyap <skashyap@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Lee Duncan <lduncan@suse.com>
---
 drivers/scsi/qedf/qedf.h    | 17 +++++++++++++++
 drivers/scsi/qedf/qedf_io.c | 53 +++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 16 deletions(-)

diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h
index 1e0d1429ed48..f647817fb79a 100644
--- a/drivers/scsi/qedf/qedf.h
+++ b/drivers/scsi/qedf/qedf.h
@@ -119,9 +119,26 @@ struct qedf_ioreq {
 	u8 io_req_flags;
 	uint8_t tm_flags;
 	struct qedf_rport *fcport;
+#define	QEDF_CMD_ST_INACTIVE		0
+#define	QEDFC_CMD_ST_IO_ACTIVE		1
+#define	QEDFC_CMD_ST_ABORT_ACTIVE	2
+#define	QEDFC_CMD_ST_ABORT_ACTIVE_EH	3
+#define	QEDFC_CMD_ST_CLEANUP_ACTIVE	4
+#define	QEDFC_CMD_ST_CLEANUP_ACTIVE_EH	5
+#define	QEDFC_CMD_ST_RRQ_ACTIVE		6
+#define	QEDFC_CMD_ST_RRQ_WAIT		7
+#define	QEDFC_CMD_ST_OXID_RETIRE_WAIT	8
+#define	QEDFC_CMD_ST_TMF_ACTIVE		9
+#define	QEDFC_CMD_ST_DRAIN_ACTIVE	10
+#define	QEDFC_CMD_ST_CLEANED		11
+#define	QEDFC_CMD_ST_ELS_ACTIVE		12
+	atomic_t state;
 	unsigned long flags;
 	enum qedf_ioreq_event event;
 	size_t data_xfer_len;
+	/* ID: 001: Alloc cmd (qedf_alloc_cmd) */
+	/* ID: 002: Initiate ABTS (qedf_initiate_abts) */
+	/* ID: 003: For RRQ (qedf_process_abts_compl) */
 	struct kref refcount;
 	struct qedf_cmd_mgr *cmd_mgr;
 	struct io_bdt *bd_tbl;
diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c
index ca7ca8c093c5..db5d49e72652 100644
--- a/drivers/scsi/qedf/qedf_io.c
+++ b/drivers/scsi/qedf/qedf_io.c
@@ -170,6 +170,7 @@ static void qedf_handle_rrq(struct work_struct *work)
 	struct qedf_ioreq *io_req =
 	    container_of(work, struct qedf_ioreq, rrq_work.work);
 
+	atomic_set(&io_req->state, QEDFC_CMD_ST_RRQ_ACTIVE);
 	qedf_send_rrq(io_req);
 
 }
@@ -374,7 +375,8 @@ struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type)
 	io_req->lun = -1;
 
 	/* Hold the io_req against deletion */
-	kref_init(&io_req->refcount);
+	kref_init(&io_req->refcount);	/* ID: 001 */
+	atomic_set(&io_req->state, QEDFC_CMD_ST_IO_ACTIVE);
 
 	/* Bind io_bdt for this io_req */
 	/* Have a static link between io_req and io_bdt_pool */
@@ -447,6 +449,7 @@ void qedf_release_cmd(struct kref *ref)
 
 	atomic_inc(&cmd_mgr->free_list_cnt);
 	atomic_dec(&fcport->num_active_ios);
+	atomic_set(&io_req->state, QEDF_CMD_ST_INACTIVE);
 	if (atomic_read(&fcport->num_active_ios) < 0)
 		QEDF_WARN(&(fcport->qedf->dbg_ctx), "active_ios < 0.\n");
 
@@ -1627,6 +1630,21 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun)
 			QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO,
 				  "Not outstanding, xid=0x%x, cmd_type=%d refcount=%d.\n",
 				  io_req->xid, io_req->cmd_type, refcount);
+			/* If RRQ work has been queued, try to cancel it and
+			 * free the io_req
+			 */
+			if (atomic_read(&io_req->state) ==
+			    QEDFC_CMD_ST_RRQ_WAIT) {
+				if (cancel_delayed_work_sync
+				    (&io_req->rrq_work)) {
+					QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO,
+						  "Putting reference for pending RRQ work xid=0x%x.\n",
+						  io_req->xid);
+					/* ID: 003 */
+					kref_put(&io_req->refcount,
+						 qedf_release_cmd);
+				}
+			}
 			continue;
 		}
 
@@ -1650,6 +1668,7 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun)
 		}
 
 		if (io_req->cmd_type == QEDF_ABTS) {
+			/* ID: 004 */
 			rc = kref_get_unless_zero(&io_req->refcount);
 			if (!rc) {
 				QEDF_ERR(&(qedf->dbg_ctx),
@@ -1665,24 +1684,25 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun)
 
 			if (cancel_delayed_work_sync(&io_req->rrq_work)) {
 				QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO,
-					  "Putting reference for pending RRQ work xid=0x%x.\n",
+					  "Putting ref for cancelled RRQ work xid=0x%x.\n",
 					  io_req->xid);
 				kref_put(&io_req->refcount, qedf_release_cmd);
 			}
 
-			/* Cancel any timeout work */
-			cancel_delayed_work_sync(&io_req->timeout_work);
-
-			if (!test_bit(QEDF_CMD_IN_ABORT, &io_req->flags))
-				goto free_cmd;
-
-			qedf_initiate_cleanup(io_req, true);
+			if (cancel_delayed_work_sync(&io_req->timeout_work)) {
+				QEDF_INFO(&qedf->dbg_ctx, QEDF_LOG_IO,
+					  "Putting ref for cancelled tmo work xid=0x%x.\n",
+					  io_req->xid);
+				qedf_initiate_cleanup(io_req, true);
+				/* Notify eh_abort handler that ABTS is
+				 * complete
+				 */
+				complete(&io_req->abts_done);
+				clear_bit(QEDF_CMD_IN_ABORT, &io_req->flags);
+				/* ID: 002 */
+				kref_put(&io_req->refcount, qedf_release_cmd);
+			}
 			flush_cnt++;
-
-			/* Notify eh_abort handler that ABTS is complete */
-			kref_put(&io_req->refcount, qedf_release_cmd);
-			complete(&io_req->abts_done);
-
 			goto free_cmd;
 		}
 
@@ -1722,7 +1742,7 @@ void qedf_flush_active_ios(struct qedf_rport *fcport, int lun)
 		qedf_initiate_cleanup(io_req, true);
 
 free_cmd:
-		kref_put(&io_req->refcount, qedf_release_cmd);
+		kref_put(&io_req->refcount, qedf_release_cmd);	/* ID: 004 */
 	}
 
 	wait_cnt = 60;
@@ -1988,6 +1988,7 @@ void qedf_process_abts_compl(struct qedf
 		 */
 		queue_delayed_work(qedf->dpc_wq, &io_req->rrq_work,
 		    msecs_to_jiffies(qedf->lport->r_a_tov));
+		atomic_set(&io_req->state, QEDFC_CMD_ST_RRQ_WAIT);
 		break;
 	/* For error cases let the cleanup return the command */
 	case FC_RCTL_BA_RJT: