|
|
36cc9f |
From: Martin Wilck <mwilck@suse.com>
|
|
|
36cc9f |
Date: Fri, 14 May 2021 17:32:14 +0200
|
|
|
36cc9f |
Subject: scsi: scsi_dh_alua: Retry RTPG on a different path after failure
|
|
Michal Kubecek |
9a3a83 |
Patch-mainline: v5.14-rc1
|
|
|
36cc9f |
Git-commit: ee8868c5c78f16fb726775741aeab8a233373332
|
|
|
36cc9f |
References: bsc#1174978 bsc#1185701
|
|
|
36cc9f |
|
|
|
36cc9f |
If an RTPG fails, we can't infer anything about the state of the ports in
|
|
|
36cc9f |
the port group except that we were unable to reach the one port on which
|
|
|
36cc9f |
the RTPG had failed. "offline" is just a secondary port state, which means
|
|
|
36cc9f |
that we can't infer the state of any port in the PG from the failure (in
|
|
|
36cc9f |
fact, even the failed port might still be in "active/optimized" primary
|
|
|
36cc9f |
port access state).
|
|
|
36cc9f |
|
|
|
36cc9f |
Therefore, when we encounter an RTPG failure, we should retry the RTPG on a
|
|
|
36cc9f |
different port. This avoids falsely setting port states to offline for
|
|
|
36cc9f |
unreachable ports. To do this, ports on which an RTPG has failed are
|
|
|
36cc9f |
temporarily set to "disabled" to avoid repeating the failed I/O on the same
|
|
|
36cc9f |
target port. Once the RTPG has either succeeded on one port or failed on
|
|
|
36cc9f |
all ports of the PG, the ports are enabled again.
|
|
|
36cc9f |
|
|
|
36cc9f |
Link: https://lore.kernel.org/r/20210514153214.5626-1-mwilck@suse.com
|
|
|
36cc9f |
Signed-off-by: Martin Wilck <mwilck@suse.com>
|
|
|
36cc9f |
Signed-off-by: Hannes Reinecke <hare@suse.de>
|
|
|
36cc9f |
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
|
|
|
36cc9f |
---
|
|
|
36cc9f |
drivers/scsi/device_handler/scsi_dh_alua.c | 70 +++++++++++++++++++++-
|
|
|
36cc9f |
1 file changed, 67 insertions(+), 3 deletions(-)
|
|
|
36cc9f |
|
|
|
36cc9f |
diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
|
|
|
36cc9f |
index efa8c0381476..03b7f255644f 100644
|
|
|
36cc9f |
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
|
|
|
36cc9f |
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
|
|
|
36cc9f |
@@ -88,6 +88,7 @@ struct alua_dh_data {
|
|
|
36cc9f |
struct scsi_device *sdev;
|
|
|
36cc9f |
int init_error;
|
|
|
36cc9f |
struct mutex init_mutex;
|
|
|
36cc9f |
+ bool disabled;
|
|
|
36cc9f |
};
|
|
|
36cc9f |
|
|
|
36cc9f |
struct alua_queue_data {
|
|
|
36cc9f |
@@ -569,6 +570,8 @@ static int alua_rtpg(struct scsi_device *sdev, struct alua_port_group *pg)
|
|
|
36cc9f |
kfree(buff);
|
|
|
36cc9f |
if (driver_byte(retval) == DRIVER_ERROR)
|
|
|
36cc9f |
return SCSI_DH_DEV_TEMP_BUSY;
|
|
|
36cc9f |
+ if (host_byte(retval) == DID_NO_CONNECT)
|
|
|
36cc9f |
+ return SCSI_DH_RES_TEMP_UNAVAIL;
|
|
|
36cc9f |
return SCSI_DH_IO;
|
|
|
36cc9f |
}
|
|
|
36cc9f |
|
|
|
36cc9f |
@@ -807,6 +810,51 @@ static unsigned alua_stpg(struct scsi_device *sdev, struct alua_port_group *pg)
|
|
|
36cc9f |
return SCSI_DH_RETRY;
|
|
|
36cc9f |
}
|
|
|
36cc9f |
|
|
|
36cc9f |
+static bool alua_rtpg_select_sdev(struct alua_port_group *pg)
|
|
|
36cc9f |
+{
|
|
|
36cc9f |
+ struct alua_dh_data *h;
|
|
|
36cc9f |
+ struct scsi_device *sdev = NULL;
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ lockdep_assert_held(&pg->lock);
|
|
|
36cc9f |
+ if (WARN_ON(!pg->rtpg_sdev))
|
|
|
36cc9f |
+ return false;
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ /*
|
|
|
36cc9f |
+ * RCU protection isn't necessary for dh_list here
|
|
|
36cc9f |
+ * as we hold pg->lock, but it is needed for access to h->pg.
|
|
|
36cc9f |
+ */
|
|
|
36cc9f |
+ rcu_read_lock();
|
|
|
36cc9f |
+ list_for_each_entry_rcu(h, &pg->dh_list, node) {
|
|
|
36cc9f |
+ if (!h->sdev)
|
|
|
36cc9f |
+ continue;
|
|
|
36cc9f |
+ if (h->sdev == pg->rtpg_sdev) {
|
|
|
36cc9f |
+ h->disabled = true;
|
|
|
36cc9f |
+ continue;
|
|
|
36cc9f |
+ }
|
|
|
36cc9f |
+ if (rcu_dereference(h->pg) == pg &&
|
|
|
36cc9f |
+ !h->disabled &&
|
|
|
36cc9f |
+ !scsi_device_get(h->sdev)) {
|
|
|
36cc9f |
+ sdev = h->sdev;
|
|
|
36cc9f |
+ break;
|
|
|
36cc9f |
+ }
|
|
|
36cc9f |
+ }
|
|
|
36cc9f |
+ rcu_read_unlock();
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ if (!sdev) {
|
|
|
36cc9f |
+ pr_warn("%s: no device found for rtpg\n",
|
|
|
36cc9f |
+ (pg->device_id_len ?
|
|
|
36cc9f |
+ (char *)pg->device_id_str : "(nameless PG)"));
|
|
|
36cc9f |
+ return false;
|
|
|
36cc9f |
+ }
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ sdev_printk(KERN_INFO, sdev, "rtpg retry on different device\n");
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ scsi_device_put(pg->rtpg_sdev);
|
|
|
36cc9f |
+ pg->rtpg_sdev = sdev;
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ return true;
|
|
|
36cc9f |
+}
|
|
|
36cc9f |
+
|
|
|
36cc9f |
static void alua_rtpg_work(struct work_struct *work)
|
|
|
36cc9f |
{
|
|
|
36cc9f |
struct alua_port_group *pg =
|
|
|
36cc9f |
@@ -815,6 +863,7 @@ static void alua_rtpg_work(struct work_struct *work)
|
|
|
36cc9f |
LIST_HEAD(qdata_list);
|
|
|
36cc9f |
int err = SCSI_DH_OK;
|
|
|
36cc9f |
struct alua_queue_data *qdata, *tmp;
|
|
|
36cc9f |
+ struct alua_dh_data *h;
|
|
|
36cc9f |
unsigned long flags;
|
|
|
36cc9f |
|
|
|
36cc9f |
spin_lock_irqsave(&pg->lock, flags);
|
|
|
36cc9f |
@@ -848,9 +897,18 @@ static void alua_rtpg_work(struct work_struct *work)
|
|
|
36cc9f |
}
|
|
|
36cc9f |
err = alua_rtpg(sdev, pg);
|
|
|
36cc9f |
spin_lock_irqsave(&pg->lock, flags);
|
|
|
36cc9f |
- if (err == SCSI_DH_RETRY || pg->flags & ALUA_PG_RUN_RTPG) {
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ /* If RTPG failed on the current device, try using another */
|
|
|
36cc9f |
+ if (err == SCSI_DH_RES_TEMP_UNAVAIL &&
|
|
|
36cc9f |
+ alua_rtpg_select_sdev(pg))
|
|
|
36cc9f |
+ err = SCSI_DH_IMM_RETRY;
|
|
|
36cc9f |
+
|
|
|
36cc9f |
+ if (err == SCSI_DH_RETRY || err == SCSI_DH_IMM_RETRY ||
|
|
|
36cc9f |
+ pg->flags & ALUA_PG_RUN_RTPG) {
|
|
|
36cc9f |
pg->flags &= ~ALUA_PG_RUNNING;
|
|
|
36cc9f |
- if (!pg->interval && !(pg->flags & ALUA_PG_RUN_RTPG))
|
|
|
36cc9f |
+ if (err == SCSI_DH_IMM_RETRY)
|
|
|
36cc9f |
+ pg->interval = 0;
|
|
|
36cc9f |
+ else if (!pg->interval && !(pg->flags & ALUA_PG_RUN_RTPG))
|
|
|
36cc9f |
pg->interval = ALUA_RTPG_RETRY_DELAY;
|
|
|
36cc9f |
pg->flags |= ALUA_PG_RUN_RTPG;
|
|
|
36cc9f |
spin_unlock_irqrestore(&pg->lock, flags);
|
|
|
36cc9f |
@@ -878,6 +936,12 @@ static void alua_rtpg_work(struct work_struct *work)
|
|
|
36cc9f |
}
|
|
|
36cc9f |
|
|
|
36cc9f |
list_splice_init(&pg->rtpg_list, &qdata_list);
|
|
|
36cc9f |
+ /*
|
|
|
36cc9f |
+ * We went through an RTPG, for good or bad.
|
|
|
36cc9f |
+ * Re-enable all devices for the next attempt.
|
|
|
36cc9f |
+ */
|
|
|
36cc9f |
+ list_for_each_entry(h, &pg->dh_list, node)
|
|
|
36cc9f |
+ h->disabled = false;
|
|
|
36cc9f |
pg->rtpg_sdev = NULL;
|
|
|
36cc9f |
spin_unlock_irqrestore(&pg->lock, flags);
|
|
|
36cc9f |
|
|
|
36cc9f |
@@ -962,6 +1026,7 @@ static int alua_initialize(struct scsi_device *sdev, struct alua_dh_data *h)
|
|
|
36cc9f |
int err = SCSI_DH_DEV_UNSUPP, tpgs;
|
|
|
36cc9f |
|
|
|
36cc9f |
mutex_lock(&h->init_mutex);
|
|
|
36cc9f |
+ h->disabled = false;
|
|
|
36cc9f |
tpgs = alua_check_tpgs(sdev);
|
|
|
36cc9f |
if (tpgs != TPGS_MODE_NONE)
|
|
|
36cc9f |
err = alua_check_vpd(sdev, h, tpgs);
|
|
|
36cc9f |
@@ -1080,7 +1145,6 @@ static void alua_check(struct scsi_device *sdev, bool force)
|
|
|
36cc9f |
return;
|
|
|
36cc9f |
}
|
|
|
36cc9f |
rcu_read_unlock();
|
|
|
36cc9f |
-
|
|
|
36cc9f |
alua_rtpg_queue(pg, sdev, NULL, force);
|
|
|
36cc9f |
kref_put(&pg->kref, release_port_group);
|
|
|
36cc9f |
}
|
|
|
36cc9f |
--
|
|
|
36cc9f |
2.31.1
|
|
|
36cc9f |
|