Blob Blame History Raw
From: Don Brace <don.brace@microchip.com>
Date: Tue, 28 Sep 2021 18:54:36 -0500
Subject: scsi: smartpqi: Add TEST UNIT READY check for SANITIZE operation
Patch-mainline: v5.16-rc1
Git-commit: be76f90668d8406dccb4c05d6edde89f48ad04d8
References: jsc#SLE-19728

Send a TEST UNIT READY to HBA disks and do not present them to the OS if
0x02/0x04/0x1b (SANITIZE IN PROGRESS) is returned.

During boot-up, some OSes appear to hang when there are one or more disks
undergoing a sanitize operation.

According to SCSI SBC4 specification section 4.11.2 "Commands allowed
during SANITIZE", some SCSI commands are permitted, but read/write
operations are not.

When the OS attempts to read the disk partition table a CHECK CONDITION ASC
0x04 ASCQ 0x1b is returned which causes the OS to retry the read until
SANITIZE has completed. This can take hours.

According to document HPE Smart Storage Administrator User Guide, during
the sanitize erase operation, the drive is unusable. I.e. the expected
behavior for SANITIZE is the that disk remains offline even after SANITIZE
has completed. The customer is expected to re-enable the disk using the
management utility.

Link: https://lore.kernel.org/r/20210928235442.201875-6-don.brace@microchip.com
Reviewed-by: Scott Benesh <scott.benesh@microchip.com>
Reviewed-by: Scott Teel <scott.teel@microchip.com>
Reviewed-by: Mike McGowen <mike.mcgowen@microchip.com>
Acked-by: John Donnelly <john.p.donnelly@oracle.com>
Signed-off-by: Don Brace <don.brace@microchip.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Acked-by: Martin Wilck <mwilck@suse.com>
---
 drivers/scsi/smartpqi/smartpqi_init.c |   87 ++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -555,6 +555,10 @@ static int pqi_build_raid_path_request(s
 	cdb = request->cdb;
 
 	switch (cmd) {
+	case TEST_UNIT_READY:
+		request->data_direction = SOP_READ_FLAG;
+		cdb[0] = TEST_UNIT_READY;
+		break;
 	case INQUIRY:
 		request->data_direction = SOP_READ_FLAG;
 		cdb[0] = INQUIRY;
@@ -1575,6 +1579,85 @@ out:
 	return rc;
 }
 
+/*
+ * Prevent adding drive to OS for some corner cases such as a drive
+ * undergoing a sanitize operation. Some OSes will continue to poll
+ * the drive until the sanitize completes, which can take hours,
+ * resulting in long bootup delays. Commands such as TUR, READ_CAP
+ * are allowed, but READ/WRITE cause check condition. So the OS
+ * cannot check/read the partition table.
+ * Note: devices that have completed sanitize must be re-enabled
+ *       using the management utility.
+ */
+static bool pqi_keep_device_offline(struct pqi_ctrl_info *ctrl_info,
+	struct pqi_scsi_dev *device)
+{
+	u8 scsi_status;
+	int rc;
+	enum dma_data_direction dir;
+	char *buffer;
+	int buffer_length = 64;
+	size_t sense_data_length;
+	struct scsi_sense_hdr sshdr;
+	struct pqi_raid_path_request request;
+	struct pqi_raid_error_info error_info;
+	bool offline = false; /* Assume keep online */
+
+	/* Do not check controllers. */
+	if (pqi_is_hba_lunid(device->scsi3addr))
+		return false;
+
+	/* Do not check LVs. */
+	if (pqi_is_logical_device(device))
+		return false;
+
+	buffer = kmalloc(buffer_length, GFP_KERNEL);
+	if (!buffer)
+		return false; /* Assume not offline */
+
+	/* Check for SANITIZE in progress using TUR */
+	rc = pqi_build_raid_path_request(ctrl_info, &request,
+		TEST_UNIT_READY, RAID_CTLR_LUNID, buffer,
+		buffer_length, 0, &dir);
+	if (rc)
+		goto out; /* Assume not offline */
+
+	memcpy(request.lun_number, device->scsi3addr, sizeof(request.lun_number));
+
+	rc = pqi_submit_raid_request_synchronous(ctrl_info, &request.header, 0, &error_info);
+
+	if (rc)
+		goto out; /* Assume not offline */
+
+	scsi_status = error_info.status;
+	sense_data_length = get_unaligned_le16(&error_info.sense_data_length);
+	if (sense_data_length == 0)
+		sense_data_length =
+			get_unaligned_le16(&error_info.response_data_length);
+	if (sense_data_length) {
+		if (sense_data_length > sizeof(error_info.data))
+			sense_data_length = sizeof(error_info.data);
+
+		/*
+		 * Check for sanitize in progress: asc:0x04, ascq: 0x1b
+		 */
+		if (scsi_status == SAM_STAT_CHECK_CONDITION &&
+			scsi_normalize_sense(error_info.data,
+				sense_data_length, &sshdr) &&
+				sshdr.sense_key == NOT_READY &&
+				sshdr.asc == 0x04 &&
+				sshdr.ascq == 0x1b) {
+			device->device_offline = true;
+			offline = true;
+			goto out; /* Keep device offline */
+		}
+	}
+
+out:
+	kfree(buffer);
+	return offline;
+}
+
 static int pqi_get_device_info(struct pqi_ctrl_info *ctrl_info,
 	struct pqi_scsi_dev *device,
 	struct bmic_identify_physical_device *id_phys)
@@ -2296,6 +2379,10 @@ static int pqi_update_scsi_devices(struc
 		if (!pqi_is_supported_device(device))
 			continue;
 
+		/* Do not present disks that the OS cannot fully probe */
+		if (pqi_keep_device_offline(ctrl_info, device))
+			continue;
+
 		/* Gather information about the device. */
 		rc = pqi_get_device_info(ctrl_info, device, id_phys);
 		if (rc == -ENOMEM) {