36cc9f
From: Martin Wilck <mwilck@suse.com>
36cc9f
Date: Fri, 14 May 2021 17:32:14 +0200
36cc9f
Subject: scsi: scsi_dh_alua: Retry RTPG on a different path after failure
Michal Kubecek 9a3a83
Patch-mainline: v5.14-rc1
36cc9f
Git-commit: ee8868c5c78f16fb726775741aeab8a233373332
36cc9f
References: bsc#1174978 bsc#1185701
36cc9f
36cc9f
If an RTPG fails, we can't infer anything about the state of the ports in
36cc9f
the port group except that we were unable to reach the one port on which
36cc9f
the RTPG had failed. "offline" is just a secondary port state, which means
36cc9f
that we can't infer the state of any port in the PG from the failure (in
36cc9f
fact, even the failed port might still be in "active/optimized" primary
36cc9f
port access state).
36cc9f
36cc9f
Therefore, when we encounter an RTPG failure, we should retry the RTPG on a
36cc9f
different port. This avoids falsely setting port states to offline for
36cc9f
unreachable ports. To do this, ports on which an RTPG has failed are
36cc9f
temporarily set to "disabled" to avoid repeating the failed I/O on the same
36cc9f
target port. Once the RTPG has either succeeded on one port or failed on
36cc9f
all ports of the PG, the ports are enabled again.
36cc9f
36cc9f
Link: https://lore.kernel.org/r/20210514153214.5626-1-mwilck@suse.com
36cc9f
Signed-off-by: Martin Wilck <mwilck@suse.com>
36cc9f
Signed-off-by: Hannes Reinecke <hare@suse.de>
36cc9f
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
36cc9f
---
36cc9f
 drivers/scsi/device_handler/scsi_dh_alua.c | 70 +++++++++++++++++++++-
36cc9f
 1 file changed, 67 insertions(+), 3 deletions(-)
36cc9f
36cc9f
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
36cc9f
index efa8c0381476..03b7f255644f 100644
36cc9f
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
36cc9f
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
36cc9f
@@ -88,6 +88,7 @@ struct alua_dh_data {
36cc9f
 	struct scsi_device	*sdev;
36cc9f
 	int			init_error;
36cc9f
 	struct mutex		init_mutex;
36cc9f
+	bool			disabled;
36cc9f
 };
36cc9f
 
36cc9f
 struct alua_queue_data {
36cc9f
@@ -569,6 +570,8 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
36cc9f
 			kfree(buff);
36cc9f
 			if (driver_byte(retval) == DRIVER_ERROR)
36cc9f
 				return SCSI_DH_DEV_TEMP_BUSY;
36cc9f
+			if (host_byte(retval) == DID_NO_CONNECT)
36cc9f
+				return SCSI_DH_RES_TEMP_UNAVAIL;
36cc9f
 			return SCSI_DH_IO;
36cc9f
 		}
36cc9f
 
36cc9f
@@ -807,6 +810,51 @@ static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
36cc9f
 	return SCSI_DH_RETRY;
36cc9f
 }
36cc9f
 
36cc9f
+static bool alua_rtpg_select_sdev(struct alua_port_group *pg)
36cc9f
+{
36cc9f
+	struct alua_dh_data *h;
36cc9f
+	struct scsi_device *sdev = NULL;
36cc9f
+
36cc9f
+	lockdep_assert_held(&pg->lock);
36cc9f
+	if (WARN_ON(!pg->rtpg_sdev))
36cc9f
+		return false;
36cc9f
+
36cc9f
+	/*
36cc9f
+	 * RCU protection isn't necessary for dh_list here
36cc9f
+	 * as we hold pg->lock, but it is needed for access to h->pg.
36cc9f
+	 */
36cc9f
+	rcu_read_lock();
36cc9f
+	list_for_each_entry_rcu(h, &pg->dh_list, node) {
36cc9f
+		if (!h->sdev)
36cc9f
+			continue;
36cc9f
+		if (h->sdev == pg->rtpg_sdev) {
36cc9f
+			h->disabled = true;
36cc9f
+			continue;
36cc9f
+		}
36cc9f
+		if (rcu_dereference(h->pg) == pg &&
36cc9f
+		    !h->disabled &&
36cc9f
+		    !scsi_device_get(h->sdev)) {
36cc9f
+			sdev = h->sdev;
36cc9f
+			break;
36cc9f
+		}
36cc9f
+	}
36cc9f
+	rcu_read_unlock();
36cc9f
+
36cc9f
+	if (!sdev) {
36cc9f
+		pr_warn("%s: no device found for rtpg\n",
36cc9f
+			(pg->device_id_len ?
36cc9f
+			 (char *)pg->device_id_str : "(nameless PG)"));
36cc9f
+		return false;
36cc9f
+	}
36cc9f
+
36cc9f
+	sdev_printk(KERN_INFO, sdev, "rtpg retry on different device\n");
36cc9f
+
36cc9f
+	scsi_device_put(pg->rtpg_sdev);
36cc9f
+	pg->rtpg_sdev = sdev;
36cc9f
+
36cc9f
+	return true;
36cc9f
+}
36cc9f
+
36cc9f
 static void alua_rtpg_work(struct work_struct *work)
36cc9f
 {
36cc9f
 	struct alua_port_group *pg =
36cc9f
@@ -815,6 +863,7 @@ static void alua_rtpg_work(struct work_struct *work)
36cc9f
 	LIST_HEAD(qdata_list);
36cc9f
 	int err = SCSI_DH_OK;
36cc9f
 	struct alua_queue_data *qdata, *tmp;
36cc9f
+	struct alua_dh_data *h;
36cc9f
 	unsigned long flags;
36cc9f
 
36cc9f
 	spin_lock_irqsave(&pg->lock, flags);
36cc9f
@@ -848,9 +897,18 @@ static void alua_rtpg_work(struct work_struct *work)
36cc9f
 		}
36cc9f
 		err = alua_rtpg(sdev, pg);
36cc9f
 		spin_lock_irqsave(&pg->lock, flags);
36cc9f
-		if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
36cc9f
+
36cc9f
+		/* If RTPG failed on the current device, try using another */
36cc9f
+		if (err == SCSI_DH_RES_TEMP_UNAVAIL &&
36cc9f
+		    alua_rtpg_select_sdev(pg))
36cc9f
+			err = SCSI_DH_IMM_RETRY;
36cc9f
+
36cc9f
+		if (err == SCSI_DH_RETRY || err == SCSI_DH_IMM_RETRY ||
36cc9f
+		    pg->flags & ALUA_PG_RUN_RTPG) {
36cc9f
 			pg->flags &= ~ALUA_PG_RUNNING;
36cc9f
-			if (!pg->interval && !(pg->flags & ALUA_PG_RUN_RTPG))
36cc9f
+			if (err == SCSI_DH_IMM_RETRY)
36cc9f
+				pg->interval = 0;
36cc9f
+			else if (!pg->interval && !(pg->flags & ALUA_PG_RUN_RTPG))
36cc9f
 				pg->interval = ALUA_RTPG_RETRY_DELAY;
36cc9f
 			pg->flags |= ALUA_PG_RUN_RTPG;
36cc9f
 			spin_unlock_irqrestore(&pg->lock, flags);
36cc9f
@@ -878,6 +936,12 @@ static void alua_rtpg_work(struct work_struct *work)
36cc9f
 	}
36cc9f
 
36cc9f
 	list_splice_init(&pg->rtpg_list, &qdata_list);
36cc9f
+	/*
36cc9f
+	 * We went through an RTPG, for good or bad.
36cc9f
+	 * Re-enable all devices for the next attempt.
36cc9f
+	 */
36cc9f
+	list_for_each_entry(h, &pg->dh_list, node)
36cc9f
+		h->disabled = false;
36cc9f
 	pg->rtpg_sdev = NULL;
36cc9f
 	spin_unlock_irqrestore(&pg->lock, flags);
36cc9f
 
36cc9f
@@ -962,6 +1026,7 @@ static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
36cc9f
 	int err = SCSI_DH_DEV_UNSUPP, tpgs;
36cc9f
 
36cc9f
 	mutex_lock(&h->init_mutex);
36cc9f
+	h->disabled = false;
36cc9f
 	tpgs = alua_check_tpgs(sdev);
36cc9f
 	if (tpgs != TPGS_MODE_NONE)
36cc9f
 		err = alua_check_vpd(sdev, h, tpgs);
36cc9f
@@ -1080,7 +1145,6 @@ static void alua_check(struct scsi_device *sdev, bool force)
36cc9f
 		return;
36cc9f
 	}
36cc9f
 	rcu_read_unlock();
36cc9f
-
36cc9f
 	alua_rtpg_queue(pg, sdev, NULL, force);
36cc9f
 	kref_put(&pg->kref, release_port_group);
36cc9f
 }
36cc9f
-- 
36cc9f
2.31.1
36cc9f