From: Israel Rukshin <israelr@mellanox.com>
Date: Sun, 24 Nov 2019 18:38:30 +0200
Subject: [PATCH] nvme-rdma: Avoid preallocating big SGL for data
Git-commit: 38e1800275d3af607e4df92ff49dc2cf442586a4
Patch-mainline: v5.5-rc2
References: bsc#1169045

nvme_rdma_alloc_tagset() preallocates a big buffer for the IO SGL based
on SG_CHUNK_SIZE.

Modern DMA engines are often capable of dealing with very big segments so
the SG_CHUNK_SIZE is often too big. SG_CHUNK_SIZE results in a static 4KB
SGL allocation per command.

If a controller has lots of deep queues, preallocation for the sg list can
consume substantial amounts of memory. For nvme-rdma, nr_hw_queues can be
128 and each queue's depth 128. This means the resulting preallocation
for the data SGL is 128*128*4K = 64MB per controller.

Switch to runtime allocation for SGL for lists longer than 2 entries. This
is the approach used by NVMe PCI so it should be reasonable for NVMeOF as
well. Runtime SGL allocation has always been the case for the legacy I/O
path so this is nothing new.

The preallocated small SGL depends on SG_CHAIN so if the ARCH doesn't
support SG_CHAIN, use only runtime allocation for the SGL.

We didn't notice of a performance degradation, since for small IOs we'll
use the inline SG and for the bigger IOs the allocation of a bigger SGL
from slab is fast enough.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Acked-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/nvme.h |  6 ++++++
 drivers/nvme/host/rdma.c | 10 +++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 3b9cbe0668fa..1024fec7914c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -28,6 +28,12 @@ extern unsigned int admin_timeout;
 #define NVME_DEFAULT_KATO	5
 #define NVME_KATO_GRACE		10
 
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define  NVME_INLINE_SG_CNT  0
+#else
+#define  NVME_INLINE_SG_CNT  2
+#endif
+
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index dce59459ed41..2a47c6c5007e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -731,7 +731,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->reserved_tags = 2; /* connect + keep-alive */
 		set->numa_node = nctrl->numa_node;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
-			SG_CHUNK_SIZE * sizeof(struct scatterlist);
+			NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
 		set->driver_data = ctrl;
 		set->nr_hw_queues = 1;
 		set->timeout = ADMIN_TIMEOUT;
@@ -745,7 +745,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->numa_node = nctrl->numa_node;
 		set->flags = BLK_MQ_F_SHOULD_MERGE;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
-			SG_CHUNK_SIZE * sizeof(struct scatterlist);
+			NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
@@ -1160,7 +1160,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 	}
 
 	ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
-	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
+	sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
 }
 
 static int nvme_rdma_set_sg_null(struct nvme_command *c)
@@ -1276,7 +1276,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	req->sg_table.sgl = req->first_sgl;
 	ret = sg_alloc_table_chained(&req->sg_table,
 			blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
-			SG_CHUNK_SIZE);
+			NVME_INLINE_SG_CNT);
 	if (ret)
 		return -ENOMEM;
 
@@ -1314,7 +1314,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 out_unmap_sg:
 	ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
 out_free_table:
-	sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
+	sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
 	return ret;
 }
 
-- 
2.16.4