Blob Blame History Raw
From: Tariq Toukan <tariqt@mellanox.com>
Date: Sun, 29 Jan 2017 17:42:26 +0200
Subject: net/mlx5e: Introduce RX Page-Reuse
Patch-mainline: v4.13-rc1
Git-commit: accd58833237d4ad835f7f176303ac2b582704e3
References: bsc#1046303 FATE#322944

Introduce a Page-Reuse mechanism in non-Striding RQ RX datapath.

A WQE (RX descriptor) buffer is a page, that in most cases was fully
wasted on a packet that is much smaller, requiring a new page for
the next round.

In this patch, we implement a page-reuse mechanism, that resembles a
`SW Striding RQ`.
We allow the WQE to reuse its allocated page as much as it could,
until the page is fully consumed.  In each round, the WQE is capable
of receiving packet of maximal size (MTU). Yet, upon the reception of
a packet, the WQE knows the actual packet size, and consumes the exact
amount of memory needed to build a linear SKB. Then, it updates the
buffer pointer within the page accordingly, for the next round.

Feature is mutually exclusive with XDP (packet-per-page)
and LRO (session size is a power of two, needs unused page).

Performance tests:
iperf tcp tests show huge gain:

--------------------------------------------
num streams | BW before | BW after | ratio |
          1 |      22.2 |     30.9 | 1.39x |
          8 |      64.2 |     93.6 | 1.46x |
         64 |      56.7 |     91.4 | 1.61x |
--------------------------------------------

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   12 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   28 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  119 ++++++++++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |    4 
 4 files changed, 126 insertions(+), 37 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -456,6 +456,11 @@ struct mlx5e_dma_info {
 	dma_addr_t	addr;
 };
 
+struct mlx5e_wqe_frag_info {
+	struct mlx5e_dma_info di;
+	u32 offset;
+};
+
 struct mlx5e_umr_dma_info {
 	__be64                *mtt;
 	dma_addr_t             mtt_addr;
@@ -517,7 +522,12 @@ struct mlx5e_rq {
 	struct mlx5_wq_ll      wq;
 
 	union {
-		struct mlx5e_dma_info *dma_info;
+		struct {
+			struct mlx5e_wqe_frag_info *frag_info;
+			u32 frag_sz;	/* max possible skb frag_sz */
+			bool page_reuse;
+			bool xdp_xmit;
+		} wqe;
 		struct {
 			struct mlx5e_mpw_info *info;
 			void                  *mtt_no_align;
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -200,6 +200,7 @@ static void mlx5e_update_sw_counters(str
 		s->rx_buff_alloc_err += rq_stats->buff_alloc_err;
 		s->rx_cqe_compress_blks += rq_stats->cqe_compress_blks;
 		s->rx_cqe_compress_pkts += rq_stats->cqe_compress_pkts;
+		s->rx_page_reuse  += rq_stats->page_reuse;
 		s->rx_cache_reuse += rq_stats->cache_reuse;
 		s->rx_cache_full  += rq_stats->cache_full;
 		s->rx_cache_empty += rq_stats->cache_empty;
@@ -549,7 +550,6 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
 	u32 byte_count;
-	u32 frag_sz;
 	int npages;
 	int wq_sz;
 	int err;
@@ -613,9 +613,10 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 			goto err_destroy_umr_mkey;
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		rq->dma_info = kzalloc_node(wq_sz * sizeof(*rq->dma_info),
-					    GFP_KERNEL, cpu_to_node(c->cpu));
-		if (!rq->dma_info) {
+		rq->wqe.frag_info =
+			kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
+				     GFP_KERNEL, cpu_to_node(c->cpu));
+		if (!rq->wqe.frag_info) {
 			err = -ENOMEM;
 			goto err_rq_wq_destroy;
 		}
@@ -624,7 +625,7 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 
 		rq->handle_rx_cqe = c->priv->profile->rx_handlers.handle_rx_cqe;
 		if (!rq->handle_rx_cqe) {
-			kfree(rq->dma_info);
+			kfree(rq->wqe.frag_info);
 			err = -EINVAL;
 			netdev_err(c->netdev, "RX handler of RQ is not set, err %d\n", err);
 			goto err_rq_wq_destroy;
@@ -633,11 +634,12 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 		rq->buff.wqe_sz = params->lro_en  ?
 				params->lro_wqe_sz :
 				MLX5E_SW2HW_MTU(c->priv, c->netdev->mtu);
+		rq->wqe.page_reuse = !params->xdp_prog && !params->lro_en;
 		byte_count = rq->buff.wqe_sz;
 
 		/* calc the required page order */
-		frag_sz = MLX5_SKB_FRAG_SZ(rq->rx_headroom + byte_count);
-		npages = DIV_ROUND_UP(frag_sz, PAGE_SIZE);
+		rq->wqe.frag_sz = MLX5_SKB_FRAG_SZ(rq->rx_headroom + byte_count);
+		npages = DIV_ROUND_UP(rq->wqe.frag_sz, PAGE_SIZE);
 		rq->buff.page_order = order_base_2(npages);
 
 		byte_count |= MLX5_HW_START_PADDING;
@@ -682,7 +684,7 @@ static void mlx5e_free_rq(struct mlx5e_r
 		mlx5_core_destroy_mkey(rq->mdev, &rq->umr_mkey);
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
-		kfree(rq->dma_info);
+		kfree(rq->wqe.frag_info);
 	}
 
 	for (i = rq->page_cache.head; i != rq->page_cache.tail;
@@ -864,6 +866,16 @@ static void mlx5e_free_rx_descs(struct m
 		mlx5_wq_ll_pop(&rq->wq, wqe_ix_be,
 			       &wqe->next.next_wqe_index);
 	}
+
+	if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST && rq->wqe.page_reuse) {
+		/* Clean outstanding pages on handled WQEs that decided to do page-reuse,
+		 * but yet to be re-posted.
+		 */
+		int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+
+		for (wqe_ix = 0; wqe_ix < wq_sz; wqe_ix++)
+			rq->dealloc_wqe(rq, wqe_ix);
+	}
 }
 
 static int mlx5e_open_rq(struct mlx5e_channel *c,
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -160,6 +160,11 @@ static inline u32 mlx5e_decompress_cqes_
 
 #define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)
 
+static inline bool mlx5e_page_is_reserved(struct page *page)
+{
+	return page_is_pfmemalloc(page) || page_to_nid(page) != numa_node_id();
+}
+
 static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,
 				      struct mlx5e_dma_info *dma_info)
 {
@@ -238,22 +243,54 @@ void mlx5e_page_release(struct mlx5e_rq
 	put_page(dma_info->page);
 }
 
+static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
+				    struct mlx5e_wqe_frag_info *wi)
+{
+	return rq->wqe.page_reuse && wi->di.page &&
+		(wi->offset + rq->wqe.frag_sz <= RQ_PAGE_SIZE(rq)) &&
+		!mlx5e_page_is_reserved(wi->di.page);
+}
+
 int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe *wqe, u16 ix)
 {
-	struct mlx5e_dma_info *di = &rq->dma_info[ix];
+	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
-	if (unlikely(mlx5e_page_alloc_mapped(rq, di)))
-		return -ENOMEM;
+	/* check if page exists, hence can be reused */
+	if (!wi->di.page) {
+		if (unlikely(mlx5e_page_alloc_mapped(rq, &wi->di)))
+			return -ENOMEM;
+		wi->offset = 0;
+	}
 
-	wqe->data.addr = cpu_to_be64(di->addr + rq->rx_headroom);
+	wqe->data.addr = cpu_to_be64(wi->di.addr + wi->offset +
+				     rq->rx_headroom);
 	return 0;
 }
 
+static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
+				     struct mlx5e_wqe_frag_info *wi)
+{
+	mlx5e_page_release(rq, &wi->di, true);
+	wi->di.page = NULL;
+}
+
+static inline void mlx5e_free_rx_wqe_reuse(struct mlx5e_rq *rq,
+					   struct mlx5e_wqe_frag_info *wi)
+{
+	if (mlx5e_page_reuse(rq, wi)) {
+		rq->stats.page_reuse++;
+		return;
+	}
+
+	mlx5e_free_rx_wqe(rq, wi);
+}
+
 void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 {
-	struct mlx5e_dma_info *di = &rq->dma_info[ix];
+	struct mlx5e_wqe_frag_info *wi = &rq->wqe.frag_info[ix];
 
-	mlx5e_page_release(rq, di, true);
+	if (wi->di.page)
+		mlx5e_free_rx_wqe(rq, wi);
 }
 
 static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
@@ -650,7 +687,6 @@ static inline bool mlx5e_xmit_xdp_frame(
 	if (unlikely(dma_len < MLX5E_XDP_MIN_INLINE ||
 		     MLX5E_SW2HW_MTU(rq->channel->priv, rq->netdev->mtu) < dma_len)) {
 		rq->stats.xdp_drop++;
-		mlx5e_page_release(rq, di, true);
 		return false;
 	}
 
@@ -661,7 +697,6 @@ static inline bool mlx5e_xmit_xdp_frame(
 			sq->db.doorbell = false;
 		}
 		rq->stats.xdp_tx_full++;
-		mlx5e_page_release(rq, di, true);
 		return false;
 	}
 
@@ -686,10 +721,15 @@ static inline bool mlx5e_xmit_xdp_frame(
 
 	cseg->opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_SEND);
 
+	/* move page to reference to sq responsibility,
+	 * and mark so it's not put back in page-cache.
+	 */
+	rq->wqe.xdp_xmit = true;
 	sq->db.di[pi] = *di;
 	sq->pc++;
 
 	sq->db.doorbell = true;
+
 	rq->stats.xdp_tx++;
 	return true;
 }
@@ -726,36 +766,34 @@ static inline int mlx5e_xdp_handle(struc
 		trace_xdp_exception(rq->netdev, prog, act);
 	case XDP_DROP:
 		rq->stats.xdp_drop++;
-		mlx5e_page_release(rq, di, true);
 		return true;
 	}
 }
 
 static inline
 struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
-			     u16 wqe_counter, u32 cqe_bcnt)
+			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
-	struct mlx5e_dma_info *di;
+	struct mlx5e_dma_info *di = &wi->di;
 	struct sk_buff *skb;
 	void *va, *data;
 	u16 rx_headroom = rq->rx_headroom;
 	bool consumed;
 	u32 frag_size;
 
-	di             = &rq->dma_info[wqe_counter];
-	va             = page_address(di->page);
+	va             = page_address(di->page) + wi->offset;
 	data           = va + rx_headroom;
+	frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
 
 	dma_sync_single_range_for_cpu(rq->pdev,
-				      di->addr,
-				      rx_headroom,
-				      rq->buff.wqe_sz,
+				      di->addr + wi->offset,
+				      0, frag_size,
 				      DMA_FROM_DEVICE);
 	prefetch(data);
+	wi->offset += frag_size;
 
 	if (unlikely((cqe->op_own >> 4) != MLX5_CQE_RESP_SEND)) {
 		rq->stats.wqe_err++;
-		mlx5e_page_release(rq, di, true);
 		return NULL;
 	}
 
@@ -765,17 +803,14 @@ struct sk_buff *skb_from_cqe(struct mlx5
 	if (consumed)
 		return NULL; /* page/packet was consumed by XDP */
 
-	frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
 	skb = build_skb(va, frag_size);
 	if (unlikely(!skb)) {
 		rq->stats.buff_alloc_err++;
-		mlx5e_page_release(rq, di, true);
 		return NULL;
 	}
 
-	/* queue up for recycling ..*/
+	/* queue up for recycling/reuse */
 	page_ref_inc(di->page);
-	mlx5e_page_release(rq, di, true);
 
 	skb_reserve(skb, rx_headroom);
 	skb_put(skb, cqe_bcnt);
@@ -785,6 +820,7 @@ struct sk_buff *skb_from_cqe(struct mlx5
 
 void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
 	struct sk_buff *skb;
@@ -794,15 +830,27 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
 	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
-	if (!skb)
+	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (!skb) {
+		/* probably for XDP */
+		if (rq->wqe.xdp_xmit) {
+			wi->di.page = NULL;
+			rq->wqe.xdp_xmit = false;
+			/* do not return page to cache, it will be returned on XDP_TX completion */
+			goto wq_ll_pop;
+		}
+		/* probably an XDP_DROP, save the page-reuse checks */
+		mlx5e_free_rx_wqe(rq, wi);
 		goto wq_ll_pop;
+	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
+	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
@@ -814,6 +862,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5
 	struct mlx5e_priv *priv = netdev_priv(netdev);
 	struct mlx5e_rep_priv *rpriv  = priv->ppriv;
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
+	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
 	__be16 wqe_counter_be;
@@ -823,11 +872,21 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
 	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
-	if (!skb)
+	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
+	if (!skb) {
+		if (rq->wqe.xdp_xmit) {
+			wi->di.page = NULL;
+			rq->wqe.xdp_xmit = false;
+			/* do not return page to cache, it will be returned on XDP_TX completion */
+			goto wq_ll_pop;
+		}
+		/* probably an XDP_DROP, save the page-reuse checks */
+		mlx5e_free_rx_wqe(rq, wi);
 		goto wq_ll_pop;
+	}
 
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 
@@ -836,6 +895,7 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5
 
 	napi_gro_receive(rq->cq.napi, skb);
 
+	mlx5e_free_rx_wqe_reuse(rq, wi);
 wq_ll_pop:
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
@@ -1096,6 +1156,7 @@ static inline void mlx5i_complete_rx_cqe
 
 void mlx5i_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
+	struct mlx5e_wqe_frag_info *wi;
 	struct mlx5e_rx_wqe *wqe;
 	__be16 wqe_counter_be;
 	struct sk_buff *skb;
@@ -1105,16 +1166,18 @@ void mlx5i_handle_rx_cqe(struct mlx5e_rq
 	wqe_counter_be = cqe->wqe_counter;
 	wqe_counter    = be16_to_cpu(wqe_counter_be);
 	wqe            = mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
+	wi             = &rq->wqe.frag_info[wqe_counter];
 	cqe_bcnt       = be32_to_cpu(cqe->byte_cnt);
 
-	skb = skb_from_cqe(rq, cqe, wqe_counter, cqe_bcnt);
+	skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
 	if (!skb)
-		goto wq_ll_pop;
+		goto wq_free_wqe;
 
 	mlx5i_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
-wq_ll_pop:
+wq_free_wqe:
+	mlx5e_free_rx_wqe_reuse(rq, wi);
 	mlx5_wq_ll_pop(&rq->wq, wqe_counter_be,
 		       &wqe->next.next_wqe_index);
 }
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -79,6 +79,7 @@ struct mlx5e_sw_stats {
 	u64 rx_buff_alloc_err;
 	u64 rx_cqe_compress_blks;
 	u64 rx_cqe_compress_pkts;
+	u64 rx_page_reuse;
 	u64 rx_cache_reuse;
 	u64 rx_cache_full;
 	u64 rx_cache_empty;
@@ -117,6 +118,7 @@ static const struct counter_desc sw_stat
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_buff_alloc_err) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_blks) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cqe_compress_pkts) },
+	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_page_reuse) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_reuse) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_full) },
 	{ MLX5E_DECLARE_STAT(struct mlx5e_sw_stats, rx_cache_empty) },
@@ -319,6 +321,7 @@ struct mlx5e_rq_stats {
 	u64 buff_alloc_err;
 	u64 cqe_compress_blks;
 	u64 cqe_compress_pkts;
+	u64 page_reuse;
 	u64 cache_reuse;
 	u64 cache_full;
 	u64 cache_empty;
@@ -341,6 +344,7 @@ static const struct counter_desc rq_stat
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, buff_alloc_err) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_blks) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cqe_compress_pkts) },
+	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, page_reuse) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_reuse) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_full) },
 	{ MLX5E_DECLARE_RX_STAT(struct mlx5e_rq_stats, cache_empty) },