Blob Blame History Raw
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 17 Apr 2019 14:44:35 -0700
Subject: scsi: qla2xxx: Fix race conditions in the code for aborting SCSI
 commands
Patch-mainline: v5.2-rc1
Git-commit: 219d27d7147e07fe899a781bd72f9180b78c3852
References: bsc#1082635 bsc#1123034 bsc#1131304 bsc#1127988 bsc#1141340 bsc#1143706

In the *_done() functions, instead of returning early if sp->ref_count >=
2, only decrement sp->ref_count. In qla2xxx_eh_abort(), instead of deciding
what to do based on the value of sp->ref_count, decide which action to take
depending on the completion status of the firmware abort. Remove srb.cwaitq
and use srb.comp instead. In qla2x00_abort_srb(), call
isp_ops->abort_command() directly instead of calling qla2xxx_eh_abort().

Cc: Himanshu Madhani <hmadhani@marvell.com>
Cc: Giridhar Malavali <gmalavali@marvell.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: Himanshu Madhani <hmadhani@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Daniel Wagner <dwagner@suse.de>
---
 drivers/scsi/qla2xxx/qla_def.h  |    1 
 drivers/scsi/qla2xxx/qla_nvme.c |   34 --------
 drivers/scsi/qla2xxx/qla_nvme.h |    1 
 drivers/scsi/qla2xxx/qla_os.c   |  161 +++++++++++++---------------------------
 4 files changed, 56 insertions(+), 141 deletions(-)

--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -546,7 +546,6 @@ typedef struct srb {
 	int rc;
 	int retry_count;
 	struct completion *comp;
-	wait_queue_head_t *cwaitq;
 	union {
 		struct srb_iocb iocb_cmd;
 		struct bsg_job *bsg_job;
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -137,8 +137,7 @@ static void qla_nvme_sp_ls_done(void *pt
 		return;
 	}
 
-	if (!atomic_dec_and_test(&sp->ref_count))
-		return;
+	atomic_dec(&sp->ref_count);
 
 	if (res)
 		res = -EINVAL;
@@ -161,8 +160,7 @@ static void qla_nvme_sp_done(void *ptr,
 	nvme = &sp->u.iocb_cmd;
 	fd = nvme->u.nvme.desc;
 
-	if (!atomic_dec_and_test(&sp->ref_count))
-		return;
+	atomic_dec(&sp->ref_count);
 
 	if (res == QLA_SUCCESS) {
 		fd->rcv_rsplen = nvme->u.nvme.rsp_pyld_len;
@@ -611,34 +609,6 @@ static struct nvme_fc_port_template qla_
 	.fcprqst_priv_sz = sizeof(struct nvme_private),
 };
 
-#define NVME_ABORT_POLLING_PERIOD    2
-static int qla_nvme_wait_on_command(srb_t *sp)
-{
-	int ret = QLA_SUCCESS;
-
-	wait_event_timeout(sp->nvme_ls_waitq, (atomic_read(&sp->ref_count) > 1),
-	    NVME_ABORT_POLLING_PERIOD*HZ);
-
-	if (atomic_read(&sp->ref_count) > 1)
-		ret = QLA_FUNCTION_FAILED;
-
-	return ret;
-}
-
-void qla_nvme_abort(struct qla_hw_data *ha, struct srb *sp, int res)
-{
-	int rval;
-
-	if (ha->flags.fw_started) {
-		rval = ha->isp_ops->abort_command(sp);
-		if (!rval && !qla_nvme_wait_on_command(sp))
-			ql_log(ql_log_warn, NULL, 0x2112,
-			    "timed out waiting on sp=%p\n", sp);
-	} else {
-		sp->done(sp, res);
-	}
-}
-
 static void qla_nvme_unregister_remote_port(struct work_struct *work)
 {
 	struct fc_port *fcport = container_of(work, struct fc_port,
--- a/drivers/scsi/qla2xxx/qla_nvme.h
+++ b/drivers/scsi/qla2xxx/qla_nvme.h
@@ -145,7 +145,6 @@ struct pt_ls4_rx_unsol {
 int qla_nvme_register_hba(struct scsi_qla_host *);
 int  qla_nvme_register_remote(struct scsi_qla_host *, struct fc_port *);
 void qla_nvme_delete(struct scsi_qla_host *);
-void qla_nvme_abort(struct qla_hw_data *, struct srb *sp, int res);
 void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *, struct pt_ls4_request *,
     struct req_que *);
 void qla24xx_async_gffid_sp_done(void *, int);
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -23,23 +23,6 @@
 
 #include "qla_target.h"
 
-#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state) \
-	___wait_event(wq_head, ___wait_cond_timeout(condition),			\
-		      state, 0, timeout,					\
-		      spin_unlock_irq(&lock);					\
-		      __ret = schedule_timeout(__ret);				\
-		      spin_lock_irq(&lock));
-
-#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)	\
-({										\
-	long __ret = timeout;							\
-	if (!___wait_cond_timeout(condition))					\
-		__ret = __wait_event_lock_irq_timeout(				\
-					wq_head, condition, lock, timeout,	\
-					TASK_UNINTERRUPTIBLE);			\
-	__ret;									\
-})
-
 /*
  * Driver version
  */
@@ -733,7 +716,7 @@ qla2x00_sp_compl(void *ptr, int res)
 {
 	srb_t *sp = ptr;
 	struct scsi_cmnd *cmd = GET_CMD_SP(sp);
-	wait_queue_head_t *cwaitq = sp->cwaitq;
+	struct completion *comp = sp->comp;
 
 	if (atomic_read(&sp->ref_count) == 0) {
 		ql_dbg(ql_dbg_io, sp->vha, 0x3015,
@@ -743,15 +726,15 @@ qla2x00_sp_compl(void *ptr, int res)
 			WARN_ON(atomic_read(&sp->ref_count) == 0);
 		return;
 	}
-	if (!atomic_dec_and_test(&sp->ref_count))
-		return;
+
+	atomic_dec(&sp->ref_count);
 
 	sp->free(sp);
 	cmd->result = res;
 	CMD_SP(cmd) = NULL;
 	cmd->scsi_done(cmd);
-	if (cwaitq)
-		wake_up(cwaitq);
+	if (comp)
+		complete(comp);
 	qla2x00_rel_sp(sp);
 }
 
@@ -844,7 +827,7 @@ qla2xxx_qpair_sp_compl(void *ptr, int re
 {
 	srb_t *sp = ptr;
 	struct scsi_cmnd *cmd = GET_CMD_SP(sp);
-	wait_queue_head_t *cwaitq = sp->cwaitq;
+	struct completion *comp = sp->comp;
 
 	if (atomic_read(&sp->ref_count) == 0) {
 		ql_dbg(ql_dbg_io, sp->fcport->vha, 0x3079,
@@ -854,15 +837,15 @@ qla2xxx_qpair_sp_compl(void *ptr, int re
 			WARN_ON(atomic_read(&sp->ref_count) == 0);
 		return;
 	}
-	if (!atomic_dec_and_test(&sp->ref_count))
-		return;
+
+	atomic_dec(&sp->ref_count);
 
 	sp->free(sp);
 	cmd->result = res;
 	CMD_SP(cmd) = NULL;
 	cmd->scsi_done(cmd);
-	if (cwaitq)
-		wake_up(cwaitq);
+	if (comp)
+		complete(comp);
 	qla2xxx_rel_qpair_sp(sp->qpair, sp);
 }
 
@@ -1309,7 +1292,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
 	unsigned int id;
 	uint64_t lun;
 	unsigned long flags;
-	int rval, wait = 0;
+	int rval;
 	struct qla_hw_data *ha = vha->hw;
 	struct qla_qpair *qpair;
 
@@ -1322,7 +1305,6 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
 	ret = fc_block_scsi_eh(cmd);
 	if (ret != 0)
 		return ret;
-	ret = SUCCESS;
 
 	sp = (srb_t *) CMD_SP(cmd);
 	if (!sp)
@@ -1333,7 +1315,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
 		return SUCCESS;
 
 	spin_lock_irqsave(qpair->qp_lock_ptr, flags);
-	if (!CMD_SP(cmd)) {
+	if (sp->type != SRB_SCSI_CMD || GET_CMD_SP(sp) != cmd) {
 		/* there's a chance an interrupt could clear
 		   the ptr as part of done & free */
 		spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
@@ -1354,66 +1336,31 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
 	    "Aborting from RISC nexus=%ld:%d:%llu sp=%p cmd=%p handle=%x\n",
 	    vha->host_no, id, lun, sp, cmd, sp->handle);
 
-	/* Get a reference to the sp and drop the lock.*/
 	rval = ha->isp_ops->abort_command(sp);
-	if (rval) {
-		if (rval == QLA_FUNCTION_PARAMETER_ERROR)
-			ret = SUCCESS;
-		else
-			ret = FAILED;
-
-		ql_dbg(ql_dbg_taskm, vha, 0x8003,
-		    "Abort command mbx failed cmd=%p, rval=%x.\n", cmd, rval);
-	} else {
-		ql_dbg(ql_dbg_taskm, vha, 0x8004,
-		    "Abort command mbx success cmd=%p.\n", cmd);
-		wait = 1;
-	}
+	ql_dbg(ql_dbg_taskm, vha, 0x8003,
+	       "Abort command mbx cmd=%p, rval=%x.\n", cmd, rval);
 
-	spin_lock_irqsave(qpair->qp_lock_ptr, flags);
-
-	/*
-	 * Releasing of the SRB and associated command resources
-	 * is managed through ref_count.
-	 * Whether we need to wait for the abort completion or complete
-	 * the abort handler should be based on the ref_count.
-	 */
-	if (atomic_read(&sp->ref_count) > 1) {
+	switch (rval) {
+	case QLA_SUCCESS:
 		/*
-		 * The command is not yet completed. We need to wait for either
-		 * command completion or abort completion.
+		 * The command has been aborted. That means that the firmware
+		 * won't report a completion.
 		 */
-		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(eh_waitq);
-		uint32_t ratov = ha->r_a_tov/10;
-
-		/* Go ahead and release the extra ref_count obtained earlier */
-		sp->done(sp, DID_RESET << 16);
-		sp->cwaitq = &eh_waitq;
-
-		if (!wait_event_lock_irq_timeout(eh_waitq,
-		    CMD_SP(cmd) == NULL, *qpair->qp_lock_ptr,
-		    msecs_to_jiffies(4 * ratov * 1000))) {
-			/*
-			 * The abort got dropped, LOGO will be sent and the
-			 * original command will be completed with CS_TIMEOUT
-			 * completion
-			 */
-			ql_dbg(ql_dbg_taskm, vha, 0xffff,
-			    "%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
-			    __func__, ha->r_a_tov);
-			sp->cwaitq = NULL;
-			ret = FAILED;
-			goto end;
-		}
-	} else {
-		/* Command completed while processing the abort */
-		sp->done(sp, DID_RESET << 16);
+		sp->done(sp, DID_ABORT << 16);
+		ret = SUCCESS;
+		break;
+	default:
+		/*
+		 * Either abort failed or abort and completion raced. Let
+		 * the SCSI core retry the abort in the former case.
+		 */
+		ret = FAILED;
+		break;
 	}
-end:
-	spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
+
 	ql_log(ql_log_info, vha, 0x801c,
-	    "Abort command issued nexus=%ld:%d:%llu --  %d %x.\n",
-	    vha->host_no, id, lun, wait, ret);
+	    "Abort command issued nexus=%ld:%d:%llu -- %x.\n",
+	    vha->host_no, id, lun, ret);
 
 	return ret;
 }
@@ -1789,34 +1736,34 @@ static void qla2x00_abort_srb(struct qla
 	__releases(qp->qp_lock_ptr)
 	__acquires(qp->qp_lock_ptr)
 {
+	DECLARE_COMPLETION_ONSTACK(comp);
 	scsi_qla_host_t *vha = qp->vha;
 	struct qla_hw_data *ha = vha->hw;
+	int rval;
 
-	if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS) {
-		if (!sp_get(sp)) {
-			/* got sp */
-			spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
-			qla_nvme_abort(ha, sp, res);
-			spin_lock_irqsave(qp->qp_lock_ptr, *flags);
-		}
-	} else if (GET_CMD_SP(sp) && !ha->flags.eeh_busy &&
-		   !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
-		   !qla2x00_isp_reg_stat(ha) && sp->type == SRB_SCSI_CMD) {
-		/*
-		 * Don't abort commands in adapter during EEH recovery as it's
-		 * not accessible/responding.
-		 *
-		 * Get a reference to the sp and drop the lock. The reference
-		 * ensures this sp->done() call and not the call in
-		 * qla2xxx_eh_abort() ends the SCSI cmd (with result 'res').
-		 */
-		if (!sp_get(sp)) {
-			spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
-			qla2xxx_eh_abort(GET_CMD_SP(sp));
-			spin_lock_irqsave(qp->qp_lock_ptr, *flags);
+	if (sp_get(sp))
+		return;
+
+	if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS ||
+	    (sp->type == SRB_SCSI_CMD && !ha->flags.eeh_busy &&
+	     !test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
+	     !qla2x00_isp_reg_stat(ha))) {
+		sp->comp = &comp;
+		rval = ha->isp_ops->abort_command(sp);
+		spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
+
+		switch (rval) {
+		case QLA_SUCCESS:
+			sp->done(sp, res);
+			break;
+		case QLA_FUNCTION_PARAMETER_ERROR:
+			wait_for_completion(&comp);
+			break;
 		}
+
+		spin_lock_irqsave(qp->qp_lock_ptr, *flags);
+		sp->comp = NULL;
 	}
-	sp->done(sp, res);
 }
 
 static void