Blob Blame History Raw
From 667d04e0b56f1610d6cfcf7cbf419a4dbb6ebfc1 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Mon, 29 Jan 2018 17:53:09 +0100
Subject: [PATCH v2] scsi: handle special return codes for ABORTED COMMAND
References: bsc#1069138, bnc#745640, bnc#825227
Patch-mainline: submitted to linux-scsi, 20180130

Introduce a new blist flag that indicates the device may return certain
sense code/ASC/ASCQ combinations that indicate different treatment than
normal. In particular, some devices need unconditional retry (aka
ADD_TO_MLQUEUE) under certain conditions; otherwise path failures may be
falsely detected in the "maybe_retry" case.

Because different devices use different sense codes to indicate this
condition, a single bit is not enough. But we also can't use a bit for
every possible status code. Therefore the new flag BLIST_ABORTED_CMD_QUIRK
just indicates that this is a device for which the return code should be
checked. An extra "blacklist" in scsi_aborted_cmd_quirk() then checks for
known ASC/ASCQ combinations, and handles them.

When such a combination is encountered for the first time, a message is
printed. In systems that have several different peripherals using this
flag, that might lead to a wrong match without a warning. This small risk
is a compromise between exactness and the excessive resources that would be
required to check for matching device vendor and model name every time.
Also, if there were different devices (identified by vendor/model) using
the same ASC/ASCQ, the code might print stray warnings. But the additional
effort to required to handle this 100% correctly is hardly justified with
the current minimal "blacklist".

I chose to recycle devinfo flag (1<<11) (former BLIST_INQUIRY_58, free
since 496c91bbc910) for this purpose rather than extending blist_flags_t to
64 bit. This could be easily changed of course.

This patch replaces the previously submitted patches
 "scsi: handle ABORTED_COMMAND on Fujitsu ETERNUS" and
 "scsi: Always retry internal target error" (Hannes Reinecke)

Changes in v2:
 - use ARRAY_SIZE (Bart van Assche)
 - make blist array static const and use separate bitmask for warned flag
   (Bart van Assche)
 - Fix string comparison for SCSI vendor and model
 - Print warning at scsi_logging_level 0, and improve message formatting

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 drivers/scsi/scsi_devinfo.c |    4 +
 drivers/scsi/scsi_error.c   |  111 ++++++++++++++++++++++++++++++++++++++++++++
 include/scsi/scsi_devinfo.h |    3 +
 3 files changed, 117 insertions(+), 1 deletion(-)

--- a/drivers/scsi/scsi_devinfo.c
+++ b/drivers/scsi/scsi_devinfo.c
@@ -160,12 +160,14 @@ static struct {
 	{"DGC", "RAID", NULL, BLIST_SPARSELUN},	/* EMC CLARiiON, storage on LUN 0 */
 	{"DGC", "DISK", NULL, BLIST_SPARSELUN},	/* EMC CLARiiON, no storage on LUN 0 */
 	{"EMC",  "Invista", "*", BLIST_SPARSELUN | BLIST_LARGELUN},
-	{"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN | BLIST_REPORTLUN2},
+	{"EMC", "SYMMETRIX", NULL, BLIST_SPARSELUN | BLIST_LARGELUN
+	 | BLIST_REPORTLUN2 | BLIST_ABORTED_CMD_QUIRK},
 	{"EMULEX", "MD21/S2     ESDI", NULL, BLIST_SINGLELUN},
 	{"easyRAID", "16P", NULL, BLIST_NOREPORTLUN},
 	{"easyRAID", "X6P", NULL, BLIST_NOREPORTLUN},
 	{"easyRAID", "F8", NULL, BLIST_NOREPORTLUN},
 	{"FSC", "CentricStor", "*", BLIST_SPARSELUN | BLIST_LARGELUN},
+	{"FUJITSU", "ETERNUS_DXM", "*", BLIST_ABORTED_CMD_QUIRK},
 	{"Generic", "USB SD Reader", "1.00", BLIST_FORCELUN | BLIST_INQUIRY_36},
 	{"Generic", "USB Storage-SMC", "0180", BLIST_FORCELUN | BLIST_INQUIRY_36},
 	{"Generic", "USB Storage-SMC", "0207", BLIST_FORCELUN | BLIST_INQUIRY_36},
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -26,6 +26,7 @@
 #include <linux/blkdev.h>
 #include <linux/delay.h>
 #include <linux/jiffies.h>
+#include <linux/bitmap.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -39,6 +40,7 @@
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi_dh.h>
 #include <scsi/sg.h>
+#include <scsi/scsi_devinfo.h>
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
@@ -432,6 +434,112 @@ static void scsi_report_sense(struct scs
 	}
 }
 
+struct aborted_cmd_blist {
+	u8 asc;
+	u8 ascq;
+	int retval;
+	const char *vendor;
+	const char *model;
+};
+
+/**
+ * scsi_strcmp - Compare space-padded string with reference string
+ * @device_str:	vendor or model field of struct scsi_device,
+ *		possibly space-padded
+ * @ref_str:	reference string to compare with
+ * @len:	max size of device_str: 8 for vendor, 16 for model
+ *
+ * Return value:
+ *	-1, 0, or 1, like strcmp().
+ */
+static int scsi_strcmp(const char *device_str, const char *ref_str, int len)
+{
+	int ref_len = strlen(ref_str);
+	int r, i;
+
+	WARN_ON(ref_len > len);
+	r = strncmp(device_str, ref_str, min(ref_len, len));
+	if (r != 0)
+		return r;
+
+	for (i = ref_len; i < strnlen(device_str, len); i++)
+		if (device_str[i] != ' ')
+			return 1;
+	return 0;
+}
+
+/**
+ * scsi_aborted_cmd_quirk - Handle special return codes for ABORTED COMMAND
+ * @sdev:	SCSI device that returned ABORTED COMMAND.
+ * @sshdr:	Sense data
+ *
+ * Return value:
+ *	SUCCESS or FAILED or NEEDS_RETRY or ADD_TO_MLQUEUE
+ *
+ * Notes:
+ *	This is only called for devices that have the blist flag
+ *      BLIST_ABORTED_CMD_QUIRK set.
+ */
+static int scsi_aborted_cmd_quirk(const struct scsi_device *sdev,
+				  const struct scsi_sense_hdr *sshdr)
+{
+	static const struct aborted_cmd_blist blist[] = {
+		/*
+		 * 44/00: SYMMETRIX uses this code for a variety of internal
+		 * issues, all of which can be recovered by retry
+		 */
+		{ 0x44, 0x00, ADD_TO_MLQUEUE, "EMC", "SYMMETRIX" },
+		/*
+		 * c1/01: This is used by ETERNUS to indicate the
+		 * command should be retried unconditionally
+		 */
+		{ 0xc1, 0x01, ADD_TO_MLQUEUE, "FUJITSU", "ETERNUS_DXM" }
+	};
+	const struct aborted_cmd_blist *found;
+	int ret = NEEDS_RETRY, i;
+	static DECLARE_BITMAP(warned, ARRAY_SIZE(blist));
+
+	for (i = 0; i < ARRAY_SIZE(blist); i++) {
+		if (sshdr->asc == blist[i].asc &&
+		    sshdr->ascq == blist[i].ascq)
+			break;
+	}
+
+	if (i >= ARRAY_SIZE(blist))
+		return ret;
+
+	found = &blist[i];
+	ret = found->retval;
+	if (test_and_set_bit(BIT(i), warned))
+		return ret;
+
+	/*
+	 * When we encounter a known ASC/ASCQ combination, it may or may not
+	 * match the device for which this combination is known.
+	 * Warn only once for each known ASC/ASCQ combination.
+	 * We can't afford making a string comparison every time in the
+	 * SCSI command return path, and a wrong match here is expected to be
+	 * non-fatal.
+	 */
+	if (!scsi_strcmp(sdev->vendor, found->vendor, 8) &&
+	    !scsi_strcmp(sdev->model, found->model, 16)) {
+		SCSI_LOG_ERROR_RECOVERY(2,
+			sdev_printk(KERN_INFO, sdev,
+				    "special retcode %s for ABORTED COMMAND %02x/%02x (expected)",
+				    scsi_mlreturn_string(ret),
+				    sshdr->asc, sshdr->ascq));
+	} else {
+		sdev_printk(KERN_WARNING, sdev,
+			    "special retcode %s for ABORTED COMMAND %02x/%02x\n",
+			    scsi_mlreturn_string(ret),
+			    sshdr->asc, sshdr->ascq);
+		sdev_printk(KERN_WARNING, sdev,
+			    "(UNEXPECTED from  \"%.8s:%.16s\", please inform linux-scsi@vger.kernel.org)\n",
+			    sdev->vendor, sdev->model);
+	}
+	return ret;
+}
+
 /**
  * scsi_check_sense - Examine scsi cmd sense
  * @scmd:	Cmd to have sense checked.
@@ -503,6 +611,9 @@ int scsi_check_sense(struct scsi_cmnd *s
 		if (sshdr.asc == 0x10) /* DIF */
 			return SUCCESS;
 
+		if (sdev->sdev_bflags & BLIST_ABORTED_CMD_QUIRK)
+			return scsi_aborted_cmd_quirk(sdev, &sshdr);
+
 		return NEEDS_RETRY;
 	case NOT_READY:
 	case UNIT_ATTENTION:
--- a/include/scsi/scsi_devinfo.h
+++ b/include/scsi/scsi_devinfo.h
@@ -15,6 +15,9 @@
 #define BLIST_ISROM     	0x100	/* Treat as (removable) CD-ROM */
 #define BLIST_LARGELUN		0x200	/* LUNs past 7 on a SCSI-2 device */
 #define BLIST_INQUIRY_36	0x400	/* override additional length field */
+#define BLIST_ABORTED_CMD_QUIRK 0x800   /* Device uses special return codes for
+					   ABORTED COMMAND,
+					   see scsi_aborted_cmd_quirk() */
 #define BLIST_NOSTARTONADD	0x1000	/* do not do automatic start on add */
 #define BLIST_REPORTLUN2	0x20000	/* try REPORT_LUNS even for SCSI-2 devs
  					   (if HBA supports more than 8 LUNs) */