Blob Blame History Raw
From: Hannes Reinecke <hare@suse.de>
Date: Thu, 15 Nov 2018 12:31:17 +0100
Subject: [PATCH] nvme-multipath: round-robin I/O policy
References: bsc#1110705
Patch-Mainline: submitted linux-nvme 2019/01/25

Implement a simple round-robin I/O policy for multipathing.
Path selection is done in two rounds, first iterating across all
optimized paths, and, if that doesn't return any valid paths,
iterate over all optimized and non-optimized paths.
If no paths are found we're using the existing algorithm.
This patch also implements a sysfs attribute 'iopolicy' to switch
between the current, NUMA-aware I/O policy and the 'round-robin'
I/O policy.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/core.c      |   6 +++
 drivers/nvme/host/multipath.c | 102 +++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  12 +++++
 3 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6b2617e213cc..7f595086f6c6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2232,6 +2232,9 @@ static struct attribute *nvme_subsys_attrs[] = {
 	&subsys_attr_serial.attr,
 	&subsys_attr_firmware_rev.attr,
 	&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&subsys_attr_iopolicy.attr,
+#endif
 	NULL,
 };
 
@@ -2284,6 +2287,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
 	subsys->vendor_id = le16_to_cpu(id->vid);
 	subsys->cmic = id->cmic;
+#ifdef CONFIG_NVME_MULTIPATH
+	subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
 
 	subsys->dev.class = nvme_subsys_class;
 	subsys->dev.release = nvme_release_subsystem;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8b58a6ca5bf4..6122ff5aa438 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, ns->ctrl->numa_node);
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -168,6 +171,56 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 	return found;
 }
 
+static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
+					   struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+	bool try_nonoptimized = false;
+
+	if (!old)
+		return NULL;
+retry:
+	ns = old;
+	do {
+		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
+					   struct nvme_ns, siblings);
+		if (!ns) {
+			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
+						    siblings);
+			if (!ns)
+				return NULL;
+
+			if (ns == old)
+				/*
+				 * The list consists of just one entry.
+				 * Sorry for the noise :-)
+				 */
+				return old;
+		}
+		if (!ns->disk || ns->ctrl->state != NVME_CTRL_LIVE ||
+		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+			continue;
+
+		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+			found = ns;
+			break;
+		}
+		if (try_nonoptimized &&
+		    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
+			found = ns;
+			break;
+		}
+	} while (ns != old);
+
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	else if (!try_nonoptimized) {
+		try_nonoptimized = true;
+		goto retry;
+	}
+	return found;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -180,6 +233,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		ns = __nvme_rr_next_path(head, node, ns);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head, node);
 	return ns;
@@ -487,6 +542,51 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
+	struct device_attribute subsys_attr_##_name =	\
+		__ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_UNKNOWN] = "unknown",
+	[NVME_IOPOLICY_NUMA] = "numa",
+	[NVME_IOPOLICY_RR] = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	int iopolicy = NVME_IOPOLICY_UNKNOWN;
+
+	if (iopolicy < ARRAY_SIZE(nvme_iopolicy_names))
+		iopolicy = READ_ONCE(subsys->iopolicy);
+	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_NUMA])))
+		iopolicy = NVME_IOPOLICY_NUMA;
+	else if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_RR])))
+		iopolicy = NVME_IOPOLICY_RR;
+
+	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
+		return -EINVAL;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+	return count;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b84be08bbbe0..7967837b6a8c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -243,6 +243,14 @@ struct nvme_ctrl {
 	struct nvmf_ctrl_options *opts;
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_iopolicy {
+	NVME_IOPOLICY_UNKNOWN,
+	NVME_IOPOLICY_NUMA,
+	NVME_IOPOLICY_RR,
+};
+#endif
+
 struct nvme_subsystem {
 	int			instance;
 	struct device		dev;
@@ -262,6 +270,9 @@ struct nvme_subsystem {
 	u8			cmic;
 	u16			vendor_id;
 	struct ida		ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_iopolicy	iopolicy;
+#endif
 };
 
 /*
@@ -482,6 +493,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
 
 #else
 static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
-- 
2.16.4