Daniel Wagner 3120dd
From: Quinn Tran <qutran@marvell.com>
Daniel Wagner 3120dd
Date: Wed, 15 Jun 2022 22:35:00 -0700
Daniel Wagner 3120dd
Subject: scsi: qla2xxx: Wind down adapter after PCIe error
Denis Kirjanov 718367
Patch-mainline: v5.20-rc1
Daniel Wagner 3120dd
Git-commit: d3117c83ba316b3200d9f2fe900f2b9a5525a25c
Daniel Wagner 3120dd
References: bsc#1201958
Daniel Wagner 3120dd
Daniel Wagner 3120dd
Put the adapter into a wind-down state if the OS does not make any attempt to
Daniel Wagner 3120dd
recover the adapter after a PCIe error.
Daniel Wagner 3120dd
Daniel Wagner 3120dd
Link: https://lore.kernel.org/r/20220616053508.27186-4-njavali@marvell.com
Daniel Wagner 3120dd
Cc: stable@vger.kernel.org
Daniel Wagner 3120dd
Signed-off-by: Quinn Tran <qutran@marvell.com>
Daniel Wagner 3120dd
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Daniel Wagner 3120dd
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Daniel Wagner 3120dd
Acked-by: Daniel Wagner <dwagner@suse.de>
Daniel Wagner 3120dd
---
Daniel Wagner 3120dd
 drivers/scsi/qla2xxx/qla_bsg.c  |   10 +++++++-
Daniel Wagner 3120dd
 drivers/scsi/qla2xxx/qla_def.h  |    4 +++
Daniel Wagner 3120dd
 drivers/scsi/qla2xxx/qla_init.c |   20 ++++++++++++++++
Daniel Wagner 3120dd
 drivers/scsi/qla2xxx/qla_os.c   |   48 ++++++++++++++++++++++++++++++++++++++++
Daniel Wagner 3120dd
 4 files changed, 81 insertions(+), 1 deletion(-)
Daniel Wagner 3120dd
Daniel Wagner 3120dd
--- a/drivers/scsi/qla2xxx/qla_bsg.c
Daniel Wagner 3120dd
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
Daniel Wagner 3120dd
@@ -3062,6 +3062,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. bsg ptr %p.\n",
Daniel Wagner 3120dd
 	    __func__, bsg_job);
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+	if (qla2x00_isp_reg_stat(ha)) {
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x9007,
Daniel Wagner 3120dd
+		    "PCI/Register disconnect.\n");
Daniel Wagner 3120dd
+		qla_pci_set_eeh_busy(vha);
Daniel Wagner 3120dd
+	}
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 	/* find the bsg job from the active list of commands */
Daniel Wagner 3120dd
 	spin_lock_irqsave(&ha->hardware_lock, flags);
Daniel Wagner 3120dd
 	for (que = 0; que < ha->max_req_queues; que++) {
Daniel Wagner 3120dd
@@ -3079,7 +3086,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_
Daniel Wagner 3120dd
 			    sp->u.bsg_job == bsg_job) {
Daniel Wagner 3120dd
 				req->outstanding_cmds[cnt] = NULL;
Daniel Wagner 3120dd
 				spin_unlock_irqrestore(&ha->hardware_lock, flags);
Daniel Wagner 3120dd
-				if (ha->isp_ops->abort_command(sp)) {
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+				if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) {
Daniel Wagner 3120dd
 					ql_log(ql_log_warn, vha, 0x7089,
Daniel Wagner 3120dd
 					    "mbx abort_command failed.\n");
Daniel Wagner 3120dd
 					bsg_reply->result = -EIO;
Daniel Wagner 3120dd
--- a/drivers/scsi/qla2xxx/qla_def.h
Daniel Wagner 3120dd
+++ b/drivers/scsi/qla2xxx/qla_def.h
Daniel Wagner 3120dd
@@ -4054,6 +4054,9 @@ struct qla_hw_data {
Daniel Wagner 3120dd
 		uint32_t	n2n_fw_acc_sec:1;
Daniel Wagner 3120dd
 		uint32_t	plogi_template_valid:1;
Daniel Wagner 3120dd
 		uint32_t	port_isolated:1;
Daniel Wagner 3120dd
+		uint32_t	eeh_flush:2;
Daniel Wagner 3120dd
+#define EEH_FLUSH_RDY  1
Daniel Wagner 3120dd
+#define EEH_FLUSH_DONE 2
Daniel Wagner 3120dd
 	} flags;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	uint16_t max_exchg;
Daniel Wagner 3120dd
@@ -4088,6 +4091,7 @@ struct qla_hw_data {
Daniel Wagner 3120dd
 	uint32_t		rsp_que_len;
Daniel Wagner 3120dd
 	uint32_t		req_que_off;
Daniel Wagner 3120dd
 	uint32_t		rsp_que_off;
Daniel Wagner 3120dd
+	unsigned long		eeh_jif;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	/* Multi queue data structs */
Daniel Wagner 3120dd
 	device_reg_t *mqiobase;
Daniel Wagner 3120dd
--- a/drivers/scsi/qla2xxx/qla_init.c
Daniel Wagner 3120dd
+++ b/drivers/scsi/qla2xxx/qla_init.c
Daniel Wagner 3120dd
@@ -48,6 +48,7 @@ qla2x00_sp_timeout(unsigned long __data)
Daniel Wagner 3120dd
 {
Daniel Wagner 3120dd
 	srb_t *sp = (srb_t *)__data;
Daniel Wagner 3120dd
 	struct srb_iocb *iocb;
Daniel Wagner 3120dd
+	scsi_qla_host_t *vha = sp->vha;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	WARN_ON(irqs_disabled());
Daniel Wagner 3120dd
 	iocb = &sp->u.iocb_cmd;
Daniel Wagner 3120dd
@@ -55,6 +56,12 @@ qla2x00_sp_timeout(unsigned long __data)
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	/* ref: TMR */
Daniel Wagner 3120dd
 	kref_put(&sp->cmd_kref, qla2x00_sp_release);
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+	if (vha && qla2x00_isp_reg_stat(vha->hw)) {
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x9008,
Daniel Wagner 3120dd
+		    "PCI/Register disconnect.\n");
Daniel Wagner 3120dd
+		qla_pci_set_eeh_busy(vha);
Daniel Wagner 3120dd
+	}
Daniel Wagner 3120dd
 }
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 void qla2x00_sp_free(srb_t *sp)
Daniel Wagner 3120dd
@@ -9671,6 +9678,12 @@ int qla2xxx_disable_port(struct Scsi_Hos
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	vha->hw->flags.port_isolated = 1;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
+	if (qla2x00_isp_reg_stat(vha->hw)) {
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x9006,
Daniel Wagner 3120dd
+		    "PCI/Register disconnect, exiting.\n");
Daniel Wagner 3120dd
+		qla_pci_set_eeh_busy(vha);
Daniel Wagner 3120dd
+		return FAILED;
Daniel Wagner 3120dd
+	}
Daniel Wagner 3120dd
 	if (qla2x00_chip_is_down(vha))
Daniel Wagner 3120dd
 		return 0;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
@@ -9686,6 +9699,13 @@ int qla2xxx_enable_port(struct Scsi_Host
Daniel Wagner 3120dd
 {
Daniel Wagner 3120dd
 	scsi_qla_host_t *vha = shost_priv(host);
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
+	if (qla2x00_isp_reg_stat(vha->hw)) {
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x9001,
Daniel Wagner 3120dd
+		    "PCI/Register disconnect, exiting.\n");
Daniel Wagner 3120dd
+		qla_pci_set_eeh_busy(vha);
Daniel Wagner 3120dd
+		return FAILED;
Daniel Wagner 3120dd
+	}
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 	vha->hw->flags.port_isolated = 0;
Daniel Wagner 3120dd
 	/* Set the flag to 1, so that isp_abort can proceed */
Daniel Wagner 3120dd
 	vha->flags.online = 1;
Daniel Wagner 3120dd
--- a/drivers/scsi/qla2xxx/qla_os.c
Daniel Wagner 3120dd
+++ b/drivers/scsi/qla2xxx/qla_os.c
Daniel Wagner 3120dd
@@ -340,6 +340,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme,
Daniel Wagner 3120dd
 		 "To wait for ABTS response on I/O timeouts for NVMe. (default: 1)");
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
+u32 ql2xdelay_before_pci_error_handling = 5;
Daniel Wagner 3120dd
+module_param(ql2xdelay_before_pci_error_handling, uint, 0644);
Daniel Wagner 3120dd
+MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling,
Daniel Wagner 3120dd
+	"Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n");
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 static void qla2x00_clear_drv_active(struct qla_hw_data *);
Daniel Wagner 3120dd
 static void qla2x00_free_device(scsi_qla_host_t *);
Daniel Wagner 3120dd
 static int qla2xxx_map_queues(struct Scsi_Host *shost);
Daniel Wagner 3120dd
@@ -7275,6 +7280,44 @@ static void qla_heart_beat(struct scsi_q
Daniel Wagner 3120dd
 	}
Daniel Wagner 3120dd
 }
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
+static void qla_wind_down_chip(scsi_qla_host_t *vha)
Daniel Wagner 3120dd
+{
Daniel Wagner 3120dd
+	struct qla_hw_data *ha = vha->hw;
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+	if (!ha->flags.eeh_busy)
Daniel Wagner 3120dd
+		return;
Daniel Wagner 3120dd
+	if (ha->pci_error_state)
Daniel Wagner 3120dd
+		/* system is trying to recover */
Daniel Wagner 3120dd
+		return;
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+	/*
Daniel Wagner 3120dd
+	 * Current system is not handling PCIE error.  At this point, this is
Daniel Wagner 3120dd
+	 * best effort to wind down the adapter.
Daniel Wagner 3120dd
+	 */
Daniel Wagner 3120dd
+	if (time_after_eq(jiffies, ha->eeh_jif + ql2xdelay_before_pci_error_handling * HZ) &&
Daniel Wagner 3120dd
+	    !ha->flags.eeh_flush) {
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x9009,
Daniel Wagner 3120dd
+		    "PCI Error detected, attempting to reset hardware.\n");
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+		ha->isp_ops->reset_chip(vha);
Daniel Wagner 3120dd
+		ha->isp_ops->disable_intrs(ha);
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+		ha->flags.eeh_flush = EEH_FLUSH_RDY;
Daniel Wagner 3120dd
+		ha->eeh_jif = jiffies;
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+	} else if (ha->flags.eeh_flush == EEH_FLUSH_RDY &&
Daniel Wagner 3120dd
+	    time_after_eq(jiffies, ha->eeh_jif +  5 * HZ)) {
Daniel Wagner 3120dd
+		pci_clear_master(ha->pdev);
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+		/* flush all command */
Daniel Wagner 3120dd
+		qla2x00_abort_isp_cleanup(vha);
Daniel Wagner 3120dd
+		ha->flags.eeh_flush = EEH_FLUSH_DONE;
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
+		ql_log(ql_log_info, vha, 0x900a,
Daniel Wagner 3120dd
+		    "PCI Error handling complete, all IOs aborted.\n");
Daniel Wagner 3120dd
+	}
Daniel Wagner 3120dd
+}
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 /**************************************************************************
Daniel Wagner 3120dd
 *   qla2x00_timer
Daniel Wagner 3120dd
 *
Daniel Wagner 3120dd
@@ -7297,6 +7340,8 @@ qla2x00_timer(scsi_qla_host_t *vha)
Daniel Wagner 3120dd
 	fc_port_t *fcport = NULL;
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	if (ha->flags.eeh_busy) {
Daniel Wagner 3120dd
+		qla_wind_down_chip(vha);
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 		ql_dbg(ql_dbg_timer, vha, 0x6000,
Daniel Wagner 3120dd
 		    "EEH = %d, restarting timer.\n",
Daniel Wagner 3120dd
 		    ha->flags.eeh_busy);
Daniel Wagner 3120dd
@@ -7877,6 +7922,9 @@ void qla_pci_set_eeh_busy(struct scsi_ql
Daniel Wagner 3120dd
 
Daniel Wagner 3120dd
 	spin_lock_irqsave(&base_vha->work_lock, flags);
Daniel Wagner 3120dd
 	if (!ha->flags.eeh_busy) {
Daniel Wagner 3120dd
+		ha->eeh_jif = jiffies;
Daniel Wagner 3120dd
+		ha->flags.eeh_flush = 0;
Daniel Wagner 3120dd
+
Daniel Wagner 3120dd
 		ha->flags.eeh_busy = 1;
Daniel Wagner 3120dd
 		do_cleanup = true;
Daniel Wagner 3120dd
 	}