Blob Blame History Raw
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 15 Nov 2018 12:31:17 +0100
Subject: [PATCH] nvme-multipath: round-robin I/O policy
Patch-Mainline: never, reworked from patch sent to upstream
References: bsc#1110705

Implement a simple round-robin I/O policy for multipathing.
Path selection is done in two rounds: the first iterates across all
optimized paths and, if that doesn't yield a valid path, the second
iterates over all optimized and non-optimized paths.
If no path is found we fall back to the existing algorithm.
This patch also implements a sysfs attribute 'iopolicy' to switch
between the current, NUMA-aware I/O policy and the 'round-robin'
I/O policy.
The original NUMA-aware I/O policy is kept as the default.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/core.c      |  6 +++
 drivers/nvme/host/multipath.c | 95 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/nvme.h      | 12 ++++++
 3 files changed, 113 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0d56736e8fca..6c888a9322cd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2150,6 +2150,9 @@ static struct attribute *nvme_subsys_attrs[] = {
 	&subsys_attr_serial.attr,
 	&subsys_attr_firmware_rev.attr,
 	&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&subsys_attr_iopolicy.attr,
+#endif
 	NULL,
 };
 
@@ -2202,6 +2205,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
 	subsys->vendor_id = le16_to_cpu(id->vid);
 	subsys->cmic = id->cmic;
+#ifdef CONFIG_NVME_MULTIPATH
+	subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
 
 	subsys->dev.class = nvme_subsys_class;
 	subsys->dev.release = nvme_release_subsystem;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 4b12d8fad44e..25438c558f37 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -151,6 +151,54 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
 	return fallback;
 }
 
+static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head,
+					   struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+	bool try_nonoptimized = false;	/* pass 1 takes optimized paths only */
+
+	if (!old)	/* no current path to advance from */
+		return NULL;
+retry:
+	ns = old;	/* every pass starts over from the current path */
+	do {
+		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
+					   struct nvme_ns, siblings);
+		if (!ns) {	/* past the last sibling: wrap to the first */
+			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
+						    siblings);
+			if (!ns)
+				return NULL;
+
+			if (ns == old)
+				/*
+				 * Wrapped back to @old, the first (maybe only)
+				 * entry: return it as is, with no second pass.
+				 */
+				return old;
+		}
+		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
+			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+				found = ns;
+				break;
+			}
+			if (try_nonoptimized &&
+			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
+				found = ns;
+				break;
+			}
+		}
+	} while (ns != old);
+
+	if (found)	/* remember the pick as the head's current path */
+		rcu_assign_pointer(head->current_path, found);
+	else if (!try_nonoptimized) {
+		try_nonoptimized = true;	/* pass 2 adds non-optimized */
+		goto retry;
+	}
+	return found;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -161,6 +209,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
 
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		ns = __nvme_rr_next_path(head, ns);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head);
 	return ns;
@@ -462,6 +512,51 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(attr_name, attr_mode, show_fn, store_fn)		\
+	struct device_attribute subsys_attr_##attr_name =		\
+		__ATTR(attr_name, attr_mode, show_fn, store_fn)
+
+static const char * const nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_UNKNOWN] = "unknown",	/* display-only fallback */
+	[NVME_IOPOLICY_NUMA] = "numa",
+	[NVME_IOPOLICY_RR] = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	unsigned int iopolicy = READ_ONCE(subsys->iopolicy); /* one snapshot */
+
+	if (iopolicy >= ARRAY_SIZE(nvme_iopolicy_names))
+		iopolicy = NVME_IOPOLICY_UNKNOWN; /* stay within the table */
+	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	/* Exact match modulo one trailing newline; prefixes are not enough. */
+	if (sysfs_streq(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA]))
+		iopolicy = NVME_IOPOLICY_NUMA;
+	else if (sysfs_streq(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR]))
+		iopolicy = NVME_IOPOLICY_RR;
+
+	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
+		return -EINVAL;
+
+	/* Pairs with the READ_ONCE() readers of subsys->iopolicy. */
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+	return count;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fb56593bbb76..d1740a139ada 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -238,6 +238,14 @@ struct nvme_ctrl {
 	struct nvmf_ctrl_options *opts;
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_iopolicy {
+	NVME_IOPOLICY_UNKNOWN,	/* "unknown": rejected by the sysfs store */
+	NVME_IOPOLICY_NUMA,	/* "numa": default set at subsystem init */
+	NVME_IOPOLICY_RR,	/* "round-robin": rotate over the paths */
+};
+#endif
+
 struct nvme_subsystem {
 	int			instance;
 	struct device		dev;
@@ -257,6 +265,9 @@ struct nvme_subsystem {
 	u8			cmic;
 	u16			vendor_id;
 	struct ida		ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_iopolicy	iopolicy;
+#endif
 };
 
 /*
@@ -486,6 +497,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
 
 #else
 static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
-- 
2.16.4