Johannes Thumshirn 98c37e
From: Israel Rukshin <israelr@mellanox.com>
Johannes Thumshirn 98c37e
Date: Sun, 26 Nov 2017 10:40:55 +0000
Johannes Thumshirn 98c37e
Subject: nvme-rdma: Use mr pool
Johannes Thumshirn 98c37e
Patch-mainline: v4.15-rc2
Johannes Thumshirn 98c37e
Git-commit: f41725bbe16b0773302c0cc7dc2e89f54828712d
Johannes Thumshirn 98c37e
References: FATE#323952, FATE#322506
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
Currently, blk_mq_tagset_iter() iterates over initial hctx tags only.  If
Johannes Thumshirn 98c37e
an I/O scheduler is used, it doesn't iterate the hctx scheduler tags and
Johannes Thumshirn 98c37e
the static requests aren't updated. For example, while using NVMe
Johannes Thumshirn 98c37e
over Fabrics RDMA host, this causes us not to reinit the scheduler
Johannes Thumshirn 98c37e
requests and thus not re-register all the memory regions during the
Johannes Thumshirn 98c37e
tagset re-initialization in the reconnect flow.
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
This may lead to a memory registration error:
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
  "MEMREG for CQE 0xffff88044c14dce8 failed with status memory management operation error (6)"
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
With this commit we don't need to reinit the requests, and thus fix this
Johannes Thumshirn 98c37e
failure.
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Johannes Thumshirn 98c37e
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Johannes Thumshirn 98c37e
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Johannes Thumshirn 98c37e
Signed-off-by: Christoph Hellwig <hch@lst.de>
Johannes Thumshirn 98c37e
Acked-by: Johannes Thumshirn <jthumshirn@suse.de>
Johannes Thumshirn 98c37e
---
Johannes Thumshirn 98c37e
 drivers/nvme/host/rdma.c |   95 ++++++++++++++++++-----------------------------
Johannes Thumshirn 98c37e
 1 file changed, 37 insertions(+), 58 deletions(-)
Johannes Thumshirn 98c37e
Johannes Thumshirn 98c37e
--- a/drivers/nvme/host/rdma.c
Johannes Thumshirn 98c37e
+++ b/drivers/nvme/host/rdma.c
Johannes Thumshirn 98c37e
@@ -15,6 +15,7 @@
Johannes Thumshirn 98c37e
 #include <linux/module.h>
Johannes Thumshirn 98c37e
 #include <linux/init.h>
Johannes Thumshirn 98c37e
 #include <linux/slab.h>
Johannes Thumshirn 98c37e
+#include <rdma/mr_pool.h>
Johannes Thumshirn 98c37e
 #include <linux/err.h>
Johannes Thumshirn 98c37e
 #include <linux/string.h>
Johannes Thumshirn 98c37e
 #include <linux/atomic.h>
Johannes Thumshirn 98c37e
@@ -260,32 +261,6 @@ static int nvme_rdma_create_qp(struct nv
Johannes Thumshirn 98c37e
 	return ret;
Johannes Thumshirn 98c37e
 }
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
-static int nvme_rdma_reinit_request(void *data, struct request *rq)
Johannes Thumshirn 98c37e
-{
Johannes Thumshirn 98c37e
-	struct nvme_rdma_ctrl *ctrl = data;
Johannes Thumshirn 98c37e
-	struct nvme_rdma_device *dev = ctrl->device;
Johannes Thumshirn 98c37e
-	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
Johannes Thumshirn 98c37e
-	int ret = 0;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-	if (WARN_ON_ONCE(!req->mr))
Johannes Thumshirn 98c37e
-		return 0;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-	ib_dereg_mr(req->mr);
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
Johannes Thumshirn 98c37e
-			ctrl->max_fr_pages);
Johannes Thumshirn 98c37e
-	if (IS_ERR(req->mr)) {
Johannes Thumshirn 98c37e
-		ret = PTR_ERR(req->mr);
Johannes Thumshirn 98c37e
-		req->mr = NULL;
Johannes Thumshirn 98c37e
-		goto out;
Johannes Thumshirn 98c37e
-	}
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-	req->mr->need_inval = false;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-out:
Johannes Thumshirn 98c37e
-	return ret;
Johannes Thumshirn 98c37e
-}
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
 static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
Johannes Thumshirn 98c37e
 		struct request *rq, unsigned int hctx_idx)
Johannes Thumshirn 98c37e
 {
Johannes Thumshirn 98c37e
@@ -295,9 +270,6 @@ static void nvme_rdma_exit_request(struc
Johannes Thumshirn 98c37e
 	struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
Johannes Thumshirn 98c37e
 	struct nvme_rdma_device *dev = queue->device;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
-	if (req->mr)
Johannes Thumshirn 98c37e
-		ib_dereg_mr(req->mr);
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
 	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
Johannes Thumshirn 98c37e
 			DMA_TO_DEVICE);
Johannes Thumshirn 98c37e
 }
Johannes Thumshirn 98c37e
@@ -319,21 +291,9 @@ static int nvme_rdma_init_request(struct
Johannes Thumshirn 98c37e
 	if (ret)
Johannes Thumshirn 98c37e
 		return ret;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
-	req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
Johannes Thumshirn 98c37e
-			ctrl->max_fr_pages);
Johannes Thumshirn 98c37e
-	if (IS_ERR(req->mr)) {
Johannes Thumshirn 98c37e
-		ret = PTR_ERR(req->mr);
Johannes Thumshirn 98c37e
-		goto out_free_qe;
Johannes Thumshirn 98c37e
-	}
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
 	req->queue = queue;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	return 0;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
-out_free_qe:
Johannes Thumshirn 98c37e
-	nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
Johannes Thumshirn 98c37e
-			DMA_TO_DEVICE);
Johannes Thumshirn 98c37e
-	return -ENOMEM;
Johannes Thumshirn 98c37e
 }
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
Johannes Thumshirn 98c37e
@@ -433,6 +393,8 @@ static void nvme_rdma_destroy_queue_ib(s
Johannes Thumshirn 98c37e
 	struct nvme_rdma_device *dev = queue->device;
Johannes Thumshirn 98c37e
 	struct ib_device *ibdev = dev->dev;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+	ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
Johannes Thumshirn 98c37e
+
Johannes Thumshirn 98c37e
 	rdma_destroy_qp(queue->cm_id);
Johannes Thumshirn 98c37e
 	ib_free_cq(queue->ib_cq);
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
@@ -442,6 +404,12 @@ static void nvme_rdma_destroy_queue_ib(s
Johannes Thumshirn 98c37e
 	nvme_rdma_dev_put(dev);
Johannes Thumshirn 98c37e
 }
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
Johannes Thumshirn 98c37e
+{
Johannes Thumshirn 98c37e
+	return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
Johannes Thumshirn 98c37e
+		     ibdev->attrs.max_fast_reg_page_list_len);
Johannes Thumshirn 98c37e
+}
Johannes Thumshirn 98c37e
+
Johannes Thumshirn 98c37e
 static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
Johannes Thumshirn 98c37e
 {
Johannes Thumshirn 98c37e
 	struct ib_device *ibdev;
Johannes Thumshirn 98c37e
@@ -484,8 +452,22 @@ static int nvme_rdma_create_queue_ib(str
Johannes Thumshirn 98c37e
 		goto out_destroy_qp;
Johannes Thumshirn 98c37e
 	}
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+	ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
Johannes Thumshirn 98c37e
+			      queue->queue_size,
Johannes Thumshirn 98c37e
+			      IB_MR_TYPE_MEM_REG,
Johannes Thumshirn 98c37e
+			      nvme_rdma_get_max_fr_pages(ibdev));
Johannes Thumshirn 98c37e
+	if (ret) {
Johannes Thumshirn 98c37e
+		dev_err(queue->ctrl->ctrl.device,
Johannes Thumshirn 98c37e
+			"failed to initialize MR pool sized %d for QID %d\n",
Johannes Thumshirn 98c37e
+			queue->queue_size, idx);
Johannes Thumshirn 98c37e
+		goto out_destroy_ring;
Johannes Thumshirn 98c37e
+	}
Johannes Thumshirn 98c37e
+
Johannes Thumshirn 98c37e
 	return 0;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+out_destroy_ring:
Johannes Thumshirn 98c37e
+	nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
Johannes Thumshirn 98c37e
+			    sizeof(struct nvme_completion), DMA_FROM_DEVICE);
Johannes Thumshirn 98c37e
 out_destroy_qp:
Johannes Thumshirn 98c37e
 	rdma_destroy_qp(queue->cm_id);
Johannes Thumshirn 98c37e
 out_destroy_ib_cq:
Johannes Thumshirn 98c37e
@@ -757,8 +739,7 @@ static int nvme_rdma_configure_admin_que
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	ctrl->device = ctrl->queues[0].device;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
-	ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
Johannes Thumshirn 98c37e
-		ctrl->device->dev->attrs.max_fast_reg_page_list_len);
Johannes Thumshirn 98c37e
+	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	if (new) {
Johannes Thumshirn 98c37e
 		ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
Johannes Thumshirn 98c37e
@@ -772,10 +753,6 @@ static int nvme_rdma_configure_admin_que
Johannes Thumshirn 98c37e
 			error = PTR_ERR(ctrl->ctrl.admin_q);
Johannes Thumshirn 98c37e
 			goto out_free_tagset;
Johannes Thumshirn 98c37e
 		}
Johannes Thumshirn 98c37e
-	} else {
Johannes Thumshirn 98c37e
-		error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
Johannes Thumshirn 98c37e
-		if (error)
Johannes Thumshirn 98c37e
-			goto out_free_queue;
Johannes Thumshirn 98c37e
 	}
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	error = nvme_rdma_start_queue(ctrl, 0);
Johannes Thumshirn 98c37e
@@ -855,10 +832,6 @@ static int nvme_rdma_configure_io_queues
Johannes Thumshirn 98c37e
 			goto out_free_tag_set;
Johannes Thumshirn 98c37e
 		}
Johannes Thumshirn 98c37e
 	} else {
Johannes Thumshirn 98c37e
-		ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
Johannes Thumshirn 98c37e
-		if (ret)
Johannes Thumshirn 98c37e
-			goto out_free_io_queues;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
 		blk_mq_update_nr_hw_queues(&ctrl->tag_set,
Johannes Thumshirn 98c37e
 			ctrl->ctrl.queue_count - 1);
Johannes Thumshirn 98c37e
 	}
Johannes Thumshirn 98c37e
@@ -1061,6 +1034,11 @@ static void nvme_rdma_unmap_data(struct
Johannes Thumshirn 98c37e
 	if (!blk_rq_bytes(rq))
Johannes Thumshirn 98c37e
 		return;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+	if (req->mr) {
Johannes Thumshirn 98c37e
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
Johannes Thumshirn 98c37e
+		req->mr = NULL;
Johannes Thumshirn 98c37e
+	}
Johannes Thumshirn 98c37e
+
Johannes Thumshirn 98c37e
 	ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
Johannes Thumshirn 98c37e
 			req->nents, rq_data_dir(rq) ==
Johannes Thumshirn 98c37e
 				    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
Johannes Thumshirn 98c37e
@@ -1117,12 +1095,18 @@ static int nvme_rdma_map_sg_fr(struct nv
Johannes Thumshirn 98c37e
 	struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
Johannes Thumshirn 98c37e
 	int nr;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
+	req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
Johannes Thumshirn 98c37e
+	if (WARN_ON_ONCE(!req->mr))
Johannes Thumshirn 98c37e
+		return -EAGAIN;
Johannes Thumshirn 98c37e
+
Johannes Thumshirn 98c37e
 	/*
Johannes Thumshirn 98c37e
 	 * Align the MR to a 4K page size to match the ctrl page size and
Johannes Thumshirn 98c37e
 	 * the block virtual boundary.
Johannes Thumshirn 98c37e
 	 */
Johannes Thumshirn 98c37e
 	nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
Johannes Thumshirn 98c37e
 	if (unlikely(nr < count)) {
Johannes Thumshirn 98c37e
+		ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
Johannes Thumshirn 98c37e
+		req->mr = NULL;
Johannes Thumshirn 98c37e
 		if (nr < 0)
Johannes Thumshirn 98c37e
 			return nr;
Johannes Thumshirn 98c37e
 		return -EINVAL;
Johannes Thumshirn 98c37e
@@ -1141,8 +1125,6 @@ static int nvme_rdma_map_sg_fr(struct nv
Johannes Thumshirn 98c37e
 			     IB_ACCESS_REMOTE_READ |
Johannes Thumshirn 98c37e
 			     IB_ACCESS_REMOTE_WRITE;
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
-	req->mr->need_inval = true;
Johannes Thumshirn 98c37e
-
Johannes Thumshirn 98c37e
 	sg->addr = cpu_to_le64(req->mr->iova);
Johannes Thumshirn 98c37e
 	put_unaligned_le24(req->mr->length, sg->length);
Johannes Thumshirn 98c37e
 	put_unaligned_le32(req->mr->rkey, sg->key);
Johannes Thumshirn 98c37e
@@ -1162,7 +1144,6 @@ static int nvme_rdma_map_data(struct nvm
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	req->num_sge = 1;
Johannes Thumshirn 98c37e
 	req->inline_data = false;
Johannes Thumshirn 98c37e
-	req->mr->need_inval = false;
Johannes Thumshirn 98c37e
 	refcount_set(&req->ref, 2); /* send and recv completions */
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	c->common.flags |= NVME_CMD_SGL_METABUF;
Johannes Thumshirn 98c37e
@@ -1341,8 +1322,7 @@ static int nvme_rdma_process_nvme_rsp(st
Johannes Thumshirn 98c37e
 				req->mr->rkey);
Johannes Thumshirn 98c37e
 			nvme_rdma_error_recovery(queue->ctrl);
Johannes Thumshirn 98c37e
 		}
Johannes Thumshirn 98c37e
-		req->mr->need_inval = false;
Johannes Thumshirn 98c37e
-	} else if (req->mr->need_inval) {
Johannes Thumshirn 98c37e
+	} else if (req->mr) {
Johannes Thumshirn 98c37e
 		ret = nvme_rdma_inv_rkey(queue, req);
Johannes Thumshirn 98c37e
 		if (unlikely(ret < 0)) {
Johannes Thumshirn 98c37e
 			dev_err(queue->ctrl->ctrl.device,
Johannes Thumshirn 98c37e
@@ -1650,7 +1630,7 @@ static blk_status_t nvme_rdma_queue_rq(s
Johannes Thumshirn 98c37e
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
Johannes Thumshirn 98c37e
-			req->mr->need_inval ? &req->reg_wr.wr : NULL);
Johannes Thumshirn 98c37e
+			req->mr ? &req->reg_wr.wr : NULL);
Johannes Thumshirn 98c37e
 	if (unlikely(err)) {
Johannes Thumshirn 98c37e
 		nvme_rdma_unmap_data(queue, rq);
Johannes Thumshirn 98c37e
 		goto err;
Johannes Thumshirn 98c37e
@@ -1798,7 +1778,6 @@ static const struct nvme_ctrl_ops nvme_r
Johannes Thumshirn 98c37e
 	.submit_async_event	= nvme_rdma_submit_async_event,
Johannes Thumshirn 98c37e
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
Johannes Thumshirn 98c37e
 	.get_address		= nvmf_get_address,
Johannes Thumshirn 98c37e
-	.reinit_request		= nvme_rdma_reinit_request,
Johannes Thumshirn 98c37e
 };
Johannes Thumshirn 98c37e
 
Johannes Thumshirn 98c37e
 static inline bool