Enzo Matsumiya a68e30
From 8c4dfea97f15b80097b3f882ca428fb2751ec30c Mon Sep 17 00:00:00 2001
Enzo Matsumiya a68e30
From: Victor Gladkov <Victor.Gladkov@kioxia.com>
Enzo Matsumiya a68e30
Date: Tue, 24 Nov 2020 18:34:59 +0000
Enzo Matsumiya a68e30
Patch-mainline: v5.11-rc1
Enzo Matsumiya a68e30
Git-commit: 8c4dfea97f15b80097b3f882ca428fb2751ec30c
Enzo Matsumiya a68e30
Subject: [PATCH] nvme-fabrics: reject I/O to offline device
Enzo Matsumiya a68e30
References: bsc#1181161
Enzo Matsumiya a68e30
Enzo Matsumiya a68e30
Commands get stuck while the host NVMe-oF controller is in reconnect state.
Enzo Matsumiya a68e30
The controller enters into reconnect state when it loses connection with
Enzo Matsumiya a68e30
the target.  It tries to reconnect every 10 seconds (default) until
Enzo Matsumiya a68e30
a successful reconnect or until the reconnect timeout is reached.
Enzo Matsumiya a68e30
The default reconnect timeout is 10 minutes.
Enzo Matsumiya a68e30
Enzo Matsumiya a68e30
Applications are expecting commands to complete with success or error
Enzo Matsumiya a68e30
within a certain timeout (30 seconds by default).  The NVMe host is
Enzo Matsumiya a68e30
enforcing that timeout while it is connected, but during reconnect the
Enzo Matsumiya a68e30
timeout is not enforced and commands may get stuck for a long period or
Enzo Matsumiya a68e30
even forever.
Enzo Matsumiya a68e30
Enzo Matsumiya a68e30
To fix this long delay due to the default timeout, introduce a new
Enzo Matsumiya a68e30
"fast_io_fail_tmo" session parameter.  The timeout is measured in seconds
Enzo Matsumiya a68e30
from the controller reconnect and any command beyond that timeout is
Enzo Matsumiya a68e30
rejected.  The new parameter value may be passed during 'connect'.
Enzo Matsumiya a68e30
The default value of -1 means no timeout (similar to current behavior).
Enzo Matsumiya a68e30
Enzo Matsumiya a68e30
Signed-off-by: Victor Gladkov <victor.gladkov@kioxia.com>
Enzo Matsumiya a68e30
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Enzo Matsumiya a68e30
Reviewed-by: Hannes Reinecke <hare@suse.de>
Enzo Matsumiya a68e30
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Enzo Matsumiya a68e30
Reviewed-by: Chao Leng <lengchao@huawei.com>
Enzo Matsumiya a68e30
Signed-off-by: Christoph Hellwig <hch@lst.de>
Enzo Matsumiya a68e30
Acked-by: Enzo Matsumiya <ematsumiya@suse.de>
Enzo Matsumiya a68e30
---
Enzo Matsumiya a68e30
 drivers/nvme/host/core.c      | 46 ++++++++++++++++++++++++++++++++++-
Enzo Matsumiya a68e30
 drivers/nvme/host/fabrics.c   | 25 ++++++++++++++++---
Enzo Matsumiya a68e30
 drivers/nvme/host/fabrics.h   |  5 ++++
Enzo Matsumiya a68e30
 drivers/nvme/host/multipath.c |  2 ++
Enzo Matsumiya a68e30
 drivers/nvme/host/nvme.h      |  3 +++
Enzo Matsumiya a68e30
 5 files changed, 77 insertions(+), 4 deletions(-)
Enzo Matsumiya a68e30
Enzo Matsumiya a68e30
--- a/drivers/nvme/host/core.c
Enzo Matsumiya a68e30
+++ b/drivers/nvme/host/core.c
Enzo Matsumiya a68e30
@@ -136,6 +136,38 @@ int nvme_try_sched_reset(struct nvme_ctr
Enzo Matsumiya a68e30
 }
Enzo Matsumiya a68e30
 EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
+static void nvme_failfast_work(struct work_struct *work)
Enzo Matsumiya a68e30
+{
Enzo Matsumiya a68e30
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
Enzo Matsumiya a68e30
+			struct nvme_ctrl, failfast_work);
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+	if (ctrl->state != NVME_CTRL_CONNECTING)
Enzo Matsumiya a68e30
+		return;
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+	set_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
Enzo Matsumiya a68e30
+	dev_info(ctrl->device, "failfast expired\n");
Enzo Matsumiya a68e30
+	nvme_kick_requeue_lists(ctrl);
Enzo Matsumiya a68e30
+}
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+static inline void nvme_start_failfast_work(struct nvme_ctrl *ctrl)
Enzo Matsumiya a68e30
+{
Enzo Matsumiya a68e30
+	if (!ctrl->opts || ctrl->opts->fast_io_fail_tmo == -1)
Enzo Matsumiya a68e30
+		return;
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+	schedule_delayed_work(&ctrl->failfast_work,
Enzo Matsumiya a68e30
+			      ctrl->opts->fast_io_fail_tmo * HZ);
Enzo Matsumiya a68e30
+}
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+static inline void nvme_stop_failfast_work(struct nvme_ctrl *ctrl)
Enzo Matsumiya a68e30
+{
Enzo Matsumiya a68e30
+	if (!ctrl->opts)
Enzo Matsumiya a68e30
+		return;
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+	cancel_delayed_work_sync(&ctrl->failfast_work);
Enzo Matsumiya a68e30
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
Enzo Matsumiya a68e30
+}
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
Enzo Matsumiya a68e30
 {
Enzo Matsumiya a68e30
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
Enzo Matsumiya a68e30
@@ -386,8 +418,17 @@ bool nvme_change_ctrl_state(struct nvme_
Enzo Matsumiya a68e30
 	}
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	spin_unlock_irqrestore(&ctrl->lock, flags);
Enzo Matsumiya a68e30
-	if (changed && ctrl->state == NVME_CTRL_LIVE)
Enzo Matsumiya a68e30
+	if (!changed)
Enzo Matsumiya a68e30
+		return false;
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+	if (ctrl->state == NVME_CTRL_LIVE) {
Enzo Matsumiya a68e30
+		if (old_state == NVME_CTRL_CONNECTING)
Enzo Matsumiya a68e30
+			nvme_stop_failfast_work(ctrl);
Enzo Matsumiya a68e30
 		nvme_kick_requeue_lists(ctrl);
Enzo Matsumiya a68e30
+	} else if (ctrl->state == NVME_CTRL_CONNECTING &&
Enzo Matsumiya a68e30
+		old_state == NVME_CTRL_RESETTING) {
Enzo Matsumiya a68e30
+		nvme_start_failfast_work(ctrl);
Enzo Matsumiya a68e30
+	}
Enzo Matsumiya a68e30
 	return changed;
Enzo Matsumiya a68e30
 }
Enzo Matsumiya a68e30
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
Enzo Matsumiya a68e30
@@ -3998,6 +4039,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ct
Enzo Matsumiya a68e30
 {
Enzo Matsumiya a68e30
 	nvme_mpath_stop(ctrl);
Enzo Matsumiya a68e30
 	nvme_stop_keep_alive(ctrl);
Enzo Matsumiya a68e30
+	nvme_stop_failfast_work(ctrl);
Enzo Matsumiya a68e30
 	flush_work(&ctrl->async_event_work);
Enzo Matsumiya a68e30
 	cancel_work_sync(&ctrl->fw_act_work);
Enzo Matsumiya a68e30
 }
Enzo Matsumiya a68e30
@@ -4063,6 +4105,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctr
Enzo Matsumiya a68e30
 	int ret;
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	ctrl->state = NVME_CTRL_NEW;
Enzo Matsumiya a68e30
+	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
Enzo Matsumiya a68e30
 	spin_lock_init(&ctrl->lock);
Enzo Matsumiya a68e30
 	mutex_init(&ctrl->scan_lock);
Enzo Matsumiya a68e30
 	INIT_LIST_HEAD(&ctrl->namespaces);
Enzo Matsumiya a68e30
@@ -4077,6 +4120,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctr
Enzo Matsumiya a68e30
 	init_waitqueue_head(&ctrl->state_wq);
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
Enzo Matsumiya a68e30
+	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
Enzo Matsumiya a68e30
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
Enzo Matsumiya a68e30
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
--- a/drivers/nvme/host/fabrics.c
Enzo Matsumiya a68e30
+++ b/drivers/nvme/host/fabrics.c
Enzo Matsumiya a68e30
@@ -549,6 +549,7 @@ blk_status_t nvmf_fail_nonready_command(
Enzo Matsumiya a68e30
 {
Hannes Reinecke 4805fd
 	if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
Enzo Matsumiya a68e30
 	    ctrl->state != NVME_CTRL_DEAD &&
Enzo Matsumiya a68e30
+	    !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
Enzo Matsumiya a68e30
 	    !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
Enzo Matsumiya a68e30
 		return BLK_STS_RESOURCE;
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
@@ -615,6 +616,7 @@ static const match_table_t opt_tokens =
Enzo Matsumiya a68e30
 	{ NVMF_OPT_NR_WRITE_QUEUES,	"nr_write_queues=%d"	},
Enzo Matsumiya a68e30
 	{ NVMF_OPT_NR_POLL_QUEUES,	"nr_poll_queues=%d"	},
Enzo Matsumiya a68e30
 	{ NVMF_OPT_TOS,			"tos=%d"		},
Enzo Matsumiya a68e30
+	{ NVMF_OPT_FAIL_FAST_TMO,	"fast_io_fail_tmo=%d"	},
Enzo Matsumiya a68e30
 	{ NVMF_OPT_ERR,			NULL			}
Enzo Matsumiya a68e30
 };
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
@@ -634,6 +636,7 @@ static int nvmf_parse_options(struct nvm
Enzo Matsumiya a68e30
 	opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
Enzo Matsumiya a68e30
 	opts->kato = NVME_DEFAULT_KATO;
Enzo Matsumiya a68e30
 	opts->duplicate_connect = false;
Enzo Matsumiya a68e30
+	opts->fast_io_fail_tmo = NVMF_DEF_FAIL_FAST_TMO;
Enzo Matsumiya a68e30
 	opts->hdr_digest = false;
Enzo Matsumiya a68e30
 	opts->data_digest = false;
Enzo Matsumiya a68e30
 	opts->tos = -1; /* < 0 == use transport default */
Enzo Matsumiya a68e30
@@ -754,6 +757,17 @@ static int nvmf_parse_options(struct nvm
Enzo Matsumiya a68e30
 				pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
Enzo Matsumiya a68e30
 			ctrl_loss_tmo = token;
Enzo Matsumiya a68e30
 			break;
Enzo Matsumiya a68e30
+		case NVMF_OPT_FAIL_FAST_TMO:
Enzo Matsumiya a68e30
+			if (match_int(args, &token)) {
Enzo Matsumiya a68e30
+				ret = -EINVAL;
Enzo Matsumiya a68e30
+				goto out;
Enzo Matsumiya a68e30
+			}
Enzo Matsumiya a68e30
+
Enzo Matsumiya a68e30
+			if (token >= 0)
Enzo Matsumiya a68e30
+				pr_warn("I/O fail on reconnect controller after %d sec\n",
Enzo Matsumiya a68e30
+					token);
Enzo Matsumiya a68e30
+			opts->fast_io_fail_tmo = token;
Enzo Matsumiya a68e30
+			break;
Enzo Matsumiya a68e30
 		case NVMF_OPT_HOSTNQN:
Enzo Matsumiya a68e30
 			if (opts->host) {
Enzo Matsumiya a68e30
 				pr_err("hostnqn already user-assigned: %s\n",
Enzo Matsumiya a68e30
@@ -884,11 +898,15 @@ static int nvmf_parse_options(struct nvm
Enzo Matsumiya a68e30
 		opts->nr_poll_queues = 0;
Enzo Matsumiya a68e30
 		opts->duplicate_connect = true;
Enzo Matsumiya a68e30
 	}
Enzo Matsumiya a68e30
-	if (ctrl_loss_tmo < 0)
Enzo Matsumiya a68e30
+	if (ctrl_loss_tmo < 0) {
Enzo Matsumiya a68e30
 		opts->max_reconnects = -1;
Enzo Matsumiya a68e30
-	else
Enzo Matsumiya a68e30
+	} else {
Enzo Matsumiya a68e30
 		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
Enzo Matsumiya a68e30
 						opts->reconnect_delay);
Enzo Matsumiya a68e30
+		if (ctrl_loss_tmo < opts->fast_io_fail_tmo)
Enzo Matsumiya a68e30
+			pr_warn("failfast tmo (%d) larger than controller loss tmo (%d)\n",
Enzo Matsumiya a68e30
+				opts->fast_io_fail_tmo, ctrl_loss_tmo);
Enzo Matsumiya a68e30
+	}
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	if (!opts->host) {
Enzo Matsumiya a68e30
 		kref_get(&nvmf_default_host->ref);
Enzo Matsumiya a68e30
@@ -988,7 +1006,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
Enzo Matsumiya a68e30
 #define NVMF_ALLOWED_OPTS	(NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
Enzo Matsumiya a68e30
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
Enzo Matsumiya a68e30
 				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
Enzo Matsumiya a68e30
-				 NVMF_OPT_DISABLE_SQFLOW)
Enzo Matsumiya a68e30
+				 NVMF_OPT_DISABLE_SQFLOW |\
Enzo Matsumiya a68e30
+				 NVMF_OPT_FAIL_FAST_TMO)
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 static struct nvme_ctrl *
Enzo Matsumiya a68e30
 nvmf_create_ctrl(struct device *dev, const char *buf)
Enzo Matsumiya a68e30
--- a/drivers/nvme/host/fabrics.h
Enzo Matsumiya a68e30
+++ b/drivers/nvme/host/fabrics.h
Enzo Matsumiya a68e30
@@ -15,6 +15,8 @@
Enzo Matsumiya a68e30
 #define NVMF_DEF_RECONNECT_DELAY	10
Enzo Matsumiya a68e30
 /* default to 600 seconds of reconnect attempts before giving up */
Enzo Matsumiya a68e30
 #define NVMF_DEF_CTRL_LOSS_TMO		600
Enzo Matsumiya a68e30
+/* default is -1: the fail fast mechanism is disabled  */
Enzo Matsumiya a68e30
+#define NVMF_DEF_FAIL_FAST_TMO		-1
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 /*
Enzo Matsumiya a68e30
  * Define a host as seen by the target.  We allocate one at boot, but also
Enzo Matsumiya a68e30
@@ -56,6 +58,7 @@ enum {
Enzo Matsumiya a68e30
 	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
Enzo Matsumiya a68e30
 	NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
Enzo Matsumiya a68e30
 	NVMF_OPT_TOS		= 1 << 19,
Enzo Matsumiya a68e30
+	NVMF_OPT_FAIL_FAST_TMO	= 1 << 20,
Enzo Matsumiya a68e30
 };
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 /**
Enzo Matsumiya a68e30
@@ -89,6 +92,7 @@ enum {
Enzo Matsumiya a68e30
  * @nr_write_queues: number of queues for write I/O
Enzo Matsumiya a68e30
  * @nr_poll_queues: number of queues for polling I/O
Enzo Matsumiya a68e30
  * @tos: type of service
Enzo Matsumiya a68e30
+ * @fast_io_fail_tmo: Fast I/O fail timeout in seconds
Enzo Matsumiya a68e30
  */
Enzo Matsumiya a68e30
 struct nvmf_ctrl_options {
Enzo Matsumiya a68e30
 	unsigned		mask;
Enzo Matsumiya a68e30
@@ -111,6 +115,7 @@ struct nvmf_ctrl_options {
Enzo Matsumiya a68e30
 	unsigned int		nr_write_queues;
Enzo Matsumiya a68e30
 	unsigned int		nr_poll_queues;
Enzo Matsumiya a68e30
 	int			tos;
Enzo Matsumiya a68e30
+	int			fast_io_fail_tmo;
Enzo Matsumiya a68e30
 };
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 /*
Enzo Matsumiya a68e30
--- a/drivers/nvme/host/multipath.c
Enzo Matsumiya a68e30
+++ b/drivers/nvme/host/multipath.c
Enzo Matsumiya a68e30
@@ -291,6 +291,8 @@ static bool nvme_available_path(struct n
Enzo Matsumiya a68e30
 	struct nvme_ns *ns;
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
Enzo Matsumiya a68e30
+		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
Enzo Matsumiya a68e30
+			continue;
Enzo Matsumiya a68e30
 		switch (ns->ctrl->state) {
Enzo Matsumiya a68e30
 		case NVME_CTRL_LIVE:
Enzo Matsumiya a68e30
 		case NVME_CTRL_RESETTING:
Enzo Matsumiya a68e30
--- a/drivers/nvme/host/nvme.h
Enzo Matsumiya a68e30
+++ b/drivers/nvme/host/nvme.h
Enzo Matsumiya a68e30
@@ -263,6 +263,7 @@ struct nvme_ctrl {
Enzo Matsumiya a68e30
 	struct work_struct scan_work;
Enzo Matsumiya a68e30
 	struct work_struct async_event_work;
Enzo Matsumiya a68e30
 	struct delayed_work ka_work;
Enzo Matsumiya a68e30
+	struct delayed_work failfast_work;
Enzo Matsumiya a68e30
 	struct nvme_command ka_cmd;
Enzo Matsumiya a68e30
 	struct work_struct fw_act_work;
Enzo Matsumiya a68e30
 	unsigned long events;
Enzo Matsumiya a68e30
@@ -296,6 +297,8 @@ struct nvme_ctrl {
Enzo Matsumiya a68e30
 	u16 icdoff;
Enzo Matsumiya a68e30
 	u16 maxcmd;
Enzo Matsumiya a68e30
 	int nr_reconnects;
Enzo Matsumiya a68e30
+	unsigned long flags;
Enzo Matsumiya a68e30
+#define NVME_CTRL_FAILFAST_EXPIRED	0
Enzo Matsumiya a68e30
 	struct nvmf_ctrl_options *opts;
Enzo Matsumiya a68e30
 
Enzo Matsumiya a68e30
 	struct page *discard_page;