Blob Blame History Raw
From: James Smart <jsmart2021@gmail.com>
Date: Wed, 14 Aug 2019 16:56:53 -0700
Subject: scsi: lpfc: Fix crash due to port reset racing vs adapter error
Git-commit: 8c24a4f643edbcc7c8281b1f7527568f565dfbf8
Patch-mainline: v5.4-rc1
References: bsc#1146215
 handling

If the adapter encounters a condition which causes the adapter to fail
(driver must detect the failure) simultaneously to a request to the driver
to reset the adapter (such as a host_reset), the reset path will be racing
with the asynchronously-detect adapter failure path.  In the failing
situation, one path has started to tear down the adapter data structures
(io_wq's) while the other path has initiated a repeat of the teardown and
is in the lpfc_sli_flush_xxx_rings path and attempting to access the
just-freed data structures.

Fix by the following:

 - In cases where an adapter failure is detected, rather than explicitly
   calling offline_eratt() to start the teardown, change the adapter state
   and let the later calls of posted work to the slowpath thread invoke the
   adapter recovery.  In essence, this means all requests to reset are
   serialized on the slowpath thread.

 - Clean up the routine that restarts the adapter. If there is a failure
   from brdreset, don't immediately error and leave things in a partial
   state. Instead, ensure the adapter state is set and finish the teardown
   of structures before returning.

 - If in the scsi host reset handler and the board fails to reset and
   restart (which can be due to parallel reset/recovery paths), instead of
   hard failing and explicitly calling offline_eratt() (which gets into the
   redundant path), just fail out and let the asynchronous path resolve the
   adapter state.

Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Hannes Reinecke <hare@suse.com>
---
 drivers/scsi/lpfc/lpfc_init.c |  7 +++----
 drivers/scsi/lpfc/lpfc_scsi.c | 16 +++++++++-------
 drivers/scsi/lpfc/lpfc_sli.c  |  6 ++++--
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 8429de65d490..c549212a2b49 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1926,7 +1926,7 @@ lpfc_handle_eratt_s4(struct lpfc_hba *phba)
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
 				"7624 Firmware not ready: Failing UE recovery,"
 				" waited %dSec", i);
-		lpfc_sli4_offline_eratt(phba);
+		phba->link_state = LPFC_HBA_ERROR;
 		break;
 
 	case LPFC_SLI_INTF_IF_TYPE_2:
@@ -2000,9 +2000,8 @@ lpfc_handle_eratt_s4(struct lpfc_hba *phba)
 		}
 		/* fall through for not able to recover */
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
-				"3152 Unrecoverable error, bring the port "
-				"offline\n");
-		lpfc_sli4_offline_eratt(phba);
+				"3152 Unrecoverable error\n");
+		phba->link_state = LPFC_HBA_ERROR;
 		break;
 	case LPFC_SLI_INTF_IF_TYPE_1:
 	default:
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index f9df800e7067..720a98266986 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -5295,18 +5295,20 @@ lpfc_host_reset_handler(struct scsi_cmnd *cmnd)
 	lpfc_offline(phba);
 	rc = lpfc_sli_brdrestart(phba);
 	if (rc)
-		ret = FAILED;
+		goto error;
+
 	rc = lpfc_online(phba);
 	if (rc)
-		ret = FAILED;
+		goto error;
+
 	lpfc_unblock_mgmt_io(phba);
 
-	if (ret == FAILED) {
-		lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP,
-				 "3323 Failed host reset, bring it offline\n");
-		lpfc_sli4_offline_eratt(phba);
-	}
 	return ret;
+error:
+	lpfc_printf_vlog(vport, KERN_ERR, LOG_FCP,
+			 "3323 Failed host reset\n");
+	lpfc_unblock_mgmt_io(phba);
+	return FAILED;
 }
 
 /**
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 8449f7f9cb64..be89a86f9649 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -4509,7 +4509,7 @@ lpfc_sli_brdreset(struct lpfc_hba *phba)
  * checking during resets the device. The caller is not required to hold
  * any locks.
  *
- * This function returns 0 always.
+ * This function returns 0 on success else returns negative error code.
  **/
 int
 lpfc_sli4_brdreset(struct lpfc_hba *phba)
@@ -4667,7 +4667,7 @@ lpfc_sli_brdrestart_s4(struct lpfc_hba *phba)
 
 	rc = lpfc_sli4_brdreset(phba);
 	if (rc)
-		return rc;
+		goto error;
 
 	spin_lock_irq(&phba->hbalock);
 	phba->pport->stopped = 0;
@@ -4682,6 +4682,8 @@ lpfc_sli_brdrestart_s4(struct lpfc_hba *phba)
 	if (hba_aer_enabled)
 		pci_disable_pcie_error_reporting(phba->pcidev);
 
+error:
+	phba->link_state = LPFC_HBA_ERROR;
 	lpfc_hba_down_post(phba);
 	lpfc_sli4_queue_destroy(phba);
 
-- 
2.16.4