Blob Blame History Raw
From: Tariq Toukan <tariqt@mellanox.com>
Date: Wed, 7 Feb 2018 14:41:25 +0200
Subject: net/mlx5e: Use linear SKB in Striding RQ
Patch-mainline: v4.17-rc1
Git-commit: 619a8f2a42f1031cdbd74435b6a9191eb4913139
References: bsc#1103990 FATE#326006

Current Striding RQ HW feature utilizes the RX buffers so that
there is no wasted room between the strides. This maximises
the memory utilization.
This prevents the use of build_skb() (which requires headroom
and tailroom), and requires copying (memcpy) the packet headers
into the skb linear part.

In this patch, whenever a set of conditions holds, we apply
an RQ configuration that allows combining the use of linear SKB
on top of a Striding RQ.

To use build_skb() with Striding RQ, the following must hold:
1. packet does not cross a page boundary.
2. there is enough headroom and tailroom surrounding the packet.

We can satisfy 1 and 2 by configuring:
	stride size = MTU + headroom + tailroom.

This is possible only when:
a. (MTU + headroom + tailroom) does not exceed PAGE_SIZE.
b. HW LRO is turned off.

Using linear SKB has many advantages:
- Saves a memcpy of the headers.
- No page-boundary checks in datapath.
- No filler CQEs.
- Significantly smaller CQ.
- SKB data resides contiguously in the linear part, rather than being
  split into a small amount (linear part) and a large amount (fragment).
  This saves datapath cycles in driver and improves utilization
  of SKB fragments in GRO.
- The fragments of a resulting GRO SKB follow the IP forwarding
  assumption of equal-size fragments.

Some implementation details:
HW writes the packets to the beginning of a stride,
i.e. does not keep headroom. To overcome this we make sure we can
extend backwards and use the last bytes of stride i-1.
Extra care is needed for stride 0 as it has no preceding stride.
We make sure headroom bytes are available by shifting the buffer
pointer passed to HW by headroom bytes.

This configuration now becomes default, whenever capable.
Of course, this implies turning LRO off.

Performance testing:
ConnectX-5, single core, single RX ring, default MTU.

UDP packet rate, early drop in TC layer:

--------------------------------------------
| pkt size | before    | after     | ratio |
--------------------------------------------
| 1500byte | 4.65 Mpps | 5.96 Mpps | 1.28x |
|  500byte | 5.23 Mpps | 5.97 Mpps | 1.14x |
|   64byte | 5.94 Mpps | 5.96 Mpps | 1.00x |
--------------------------------------------

TCP streams: ~20% gain

Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |   10 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   76 +++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |  102 +++++++++++++++-------
 include/linux/mlx5/device.h                       |    3 
 include/linux/mlx5/mlx5_ifc.h                     |    7 -
 5 files changed, 153 insertions(+), 45 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -473,6 +473,9 @@ struct mlx5e_page_cache {
 
 struct mlx5e_rq;
 typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+			       u16 cqe_bcnt, u32 head_offset, u32 page_idx);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
@@ -491,6 +494,7 @@ struct mlx5e_rq {
 		} wqe;
 		struct {
 			struct mlx5e_mpw_info *info;
+			mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
 			u16                    num_strides;
 			u8                     log_stride_sz;
 			bool                   umr_in_progress;
@@ -834,6 +838,12 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_r
 void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -91,9 +91,14 @@ bool mlx5e_check_fragmented_striding_rq_
 
 static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
 {
-	u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+	if (!params->xdp_prog) {
+		u16 hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu);
+		u16 rq_headroom = MLX5_RX_HEADROOM + NET_IP_ALIGN;
 
-	return hw_mtu;
+		return MLX5_SKB_FRAG_SZ(rq_headroom + hw_mtu);
+	}
+
+	return PAGE_SIZE;
 }
 
 static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5e_params *params)
@@ -103,6 +108,26 @@ static u8 mlx5e_mpwqe_log_pkts_per_wqe(s
 	return MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz);
 }
 
+static bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
+					 struct mlx5e_params *params)
+{
+	u32 frag_sz = mlx5e_mpwqe_get_linear_frag_sz(params);
+	s8 signed_log_num_strides_param;
+	u8 log_num_strides;
+
+	if (params->lro_en || frag_sz > PAGE_SIZE)
+		return false;
+
+	if (MLX5_CAP_GEN(mdev, ext_stride_num_range))
+		return true;
+
+	log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(frag_sz);
+	signed_log_num_strides_param =
+		(s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE;
+
+	return signed_log_num_strides_param >= 0;
+}
+
 static u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params)
 {
 	if (params->log_rq_mtu_frames <
@@ -115,6 +140,9 @@ static u8 mlx5e_mpwqe_get_log_rq_size(st
 static u8 mlx5e_mpwqe_get_log_stride_size(struct mlx5_core_dev *mdev,
 					  struct mlx5e_params *params)
 {
+	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+		return order_base_2(mlx5e_mpwqe_get_linear_frag_sz(params));
+
 	return MLX5E_MPWQE_STRIDE_SZ(mdev,
 		MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
 }
@@ -126,7 +154,8 @@ static u8 mlx5e_mpwqe_get_log_num_stride
 		mlx5e_mpwqe_get_log_stride_size(mdev, params);
 }
 
-static u16 mlx5e_get_rq_headroom(struct mlx5e_params *params)
+static u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev,
+				 struct mlx5e_params *params)
 {
 	u16 linear_rq_headroom = params->xdp_prog ?
 		XDP_PACKET_HEADROOM : MLX5_RX_HEADROOM;
@@ -136,6 +165,9 @@ static u16 mlx5e_get_rq_headroom(struct
 	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST)
 		return linear_rq_headroom;
 
+	if (mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+		return linear_rq_headroom;
+
 	return 0;
 }
 
@@ -151,12 +183,14 @@ void mlx5e_init_rq_type_params(struct ml
 		break;
 	default: /* MLX5_WQ_TYPE_LINKED_LIST */
 		/* Extra room needed for build_skb */
-		params->lro_wqe_sz -= mlx5e_get_rq_headroom(params) +
+		params->lro_wqe_sz -= mlx5e_get_rq_headroom(mdev, params) +
 			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	}
 
 	mlx5_core_info(mdev, "MLX5E: StrdRq(%d) RqSz(%ld) StrdSz(%ld) RxCqeCmprss(%d)\n",
 		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ,
+		       params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
+		       BIT(mlx5e_mpwqe_get_log_rq_size(params)) :
 		       BIT(params->log_rq_mtu_frames),
 		       BIT(mlx5e_mpwqe_get_log_stride_size(mdev, params)),
 		       MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS));
@@ -400,11 +434,10 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 		goto err_rq_wq_destroy;
 
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
-	rq->buff.headroom = mlx5e_get_rq_headroom(params);
+	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -422,6 +455,10 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 			goto err_rq_wq_destroy;
 		}
 
+		rq->mpwqe.skb_from_cqe_mpwrq =
+			mlx5e_rx_mpwqe_is_linear_skb(mdev, params) ?
+			mlx5e_skb_from_cqe_mpwrq_linear :
+			mlx5e_skb_from_cqe_mpwrq_nonlinear;
 		rq->mpwqe.log_stride_sz = mlx5e_mpwqe_get_log_stride_size(mdev, params);
 		rq->mpwqe.num_strides = BIT(mlx5e_mpwqe_get_log_num_strides(mdev, params));
 
@@ -484,7 +521,7 @@ static int mlx5e_alloc_rq(struct mlx5e_c
 		if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) {
 			u64 dma_offset = (u64)mlx5e_get_wqe_mtt_offset(rq, i) << PAGE_SHIFT;
 
-			wqe->data.addr = cpu_to_be64(dma_offset);
+			wqe->data.addr = cpu_to_be64(dma_offset + rq->buff.headroom);
 		}
 
 		wqe->data.byte_count = cpu_to_be32(byte_count);
@@ -1834,9 +1871,11 @@ static void mlx5e_build_rq_param(struct
 	switch (params->rq_wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
 		MLX5_SET(wq, wq, log_wqe_num_of_strides,
-			 mlx5e_mpwqe_get_log_num_strides(mdev, params) - 9);
+			 mlx5e_mpwqe_get_log_num_strides(mdev, params) -
+			 MLX5_MPWQE_LOG_NUM_STRIDES_BASE);
 		MLX5_SET(wq, wq, log_wqe_stride_size,
-			 mlx5e_mpwqe_get_log_stride_size(mdev, params) - 6);
+			 mlx5e_mpwqe_get_log_stride_size(mdev, params) -
+			 MLX5_MPWQE_LOG_STRIDE_SZ_BASE);
 		MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ);
 		MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params));
 		break;
@@ -3196,20 +3235,28 @@ typedef int (*mlx5e_feature_handler)(str
 static int set_feature_lro(struct net_device *netdev, bool enable)
 {
 	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_channels new_channels = {};
+	struct mlx5e_params *old_params;
 	int err = 0;
 	bool reset;
 
 	mutex_lock(&priv->state_lock);
 
-	reset = (priv->channels.params.rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST);
-	reset = reset && test_bit(MLX5E_STATE_OPENED, &priv->state);
+	old_params = &priv->channels.params;
+	reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
 
-	new_channels.params = priv->channels.params;
+	new_channels.params = *old_params;
 	new_channels.params.lro_en = enable;
 
+	if (old_params->rq_wq_type != MLX5_WQ_TYPE_LINKED_LIST) {
+		if (mlx5e_rx_mpwqe_is_linear_skb(mdev, old_params) ==
+		    mlx5e_rx_mpwqe_is_linear_skb(mdev, &new_channels.params))
+			reset = false;
+	}
+
 	if (!reset) {
-		priv->channels.params = new_channels.params;
+		*old_params = new_channels.params;
 		err = mlx5e_modify_tirs_lro(priv);
 		goto out;
 	}
@@ -4124,7 +4171,8 @@ void mlx5e_build_nic_params(struct mlx5_
 
 	/* TODO: && MLX5_CAP_ETH(mdev, lro_cap) */
 	if (params->rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
-		params->lro_en = !slow_pci_heuristic(mdev);
+		if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params))
+			params->lro_en = !slow_pci_heuristic(mdev);
 	params->lro_timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
 
 	/* CQ moderation params */
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -837,6 +837,24 @@ static inline int mlx5e_xdp_handle(struc
 }
 
 static inline
+struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
+				       u32 frag_size, u16 headroom,
+				       u32 cqe_bcnt)
+{
+	struct sk_buff *skb = build_skb(va, frag_size);
+
+	if (unlikely(!skb)) {
+		rq->stats.buff_alloc_err++;
+		return NULL;
+	}
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, cqe_bcnt);
+
+	return skb;
+}
+
+static inline
 struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 			     struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
 {
@@ -867,18 +885,13 @@ struct sk_buff *skb_from_cqe(struct mlx5
 	if (consumed)
 		return NULL; /* page/packet was consumed by XDP */
 
-	skb = build_skb(va, frag_size);
-	if (unlikely(!skb)) {
-		rq->stats.buff_alloc_err++;
+	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
+	if (unlikely(!skb))
 		return NULL;
-	}
 
 	/* queue up for recycling/reuse */
 	page_ref_inc(di->page);
 
-	skb_reserve(skb, rx_headroom);
-	skb_put(skb, cqe_bcnt);
-
 	return skb;
 }
 
@@ -967,20 +980,24 @@ wq_ll_pop:
 }
 #endif
 
-static inline void mlx5e_mpwqe_fill_rx_skb(struct mlx5e_rq *rq,
-					   struct mlx5_cqe64 *cqe,
-					   struct mlx5e_mpw_info *wi,
-					   u32 cqe_bcnt,
-					   struct sk_buff *skb)
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				   u16 cqe_bcnt, u32 head_offset, u32 page_idx)
 {
-	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
-	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
-	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
-	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
-	u32 head_page_idx  = page_idx;
 	u16 headlen = min_t(u16, MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, cqe_bcnt);
 	u32 frag_offset    = head_offset + headlen;
 	u16 byte_cnt       = cqe_bcnt - headlen;
+	u32 head_page_idx  = page_idx;
+	struct sk_buff *skb;
+
+	skb = napi_alloc_skb(rq->cq.napi,
+			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD, sizeof(long)));
+	if (unlikely(!skb)) {
+		rq->stats.buff_alloc_err++;
+		return NULL;
+	}
+
+	prefetchw(skb->data);
 
 	if (unlikely(frag_offset >= PAGE_SIZE)) {
 		page_idx++;
@@ -1003,6 +1020,35 @@ static inline void mlx5e_mpwqe_fill_rx_s
 	/* skb linear part was allocated with headlen and aligned to long */
 	skb->tail += headlen;
 	skb->len  += headlen;
+
+	return skb;
+}
+
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+				u16 cqe_bcnt, u32 head_offset, u32 page_idx)
+{
+	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
+	u16 rx_headroom = rq->buff.headroom;
+	struct sk_buff *skb;
+	void *va, *data;
+	u32 frag_size;
+
+	va             = page_address(di->page) + head_offset;
+	data           = va + rx_headroom;
+	frag_size      = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
+
+	dma_sync_single_range_for_cpu(rq->pdev, di->addr, head_offset,
+				      frag_size, DMA_FROM_DEVICE);
+	prefetch(data);
+	skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* queue up for recycling/reuse */
+	wi->skbs_frags[page_idx]++;
+
+	return skb;
 }
 
 void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
@@ -1010,7 +1056,11 @@ void mlx5e_handle_rx_cqe_mpwrq(struct ml
 	u16 cstrides       = mpwrq_get_cqe_consumed_strides(cqe);
 	u16 wqe_id         = be16_to_cpu(cqe->wqe_id);
 	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[wqe_id];
-	struct mlx5e_rx_wqe  *wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
+	u16 stride_ix      = mpwrq_get_cqe_stride_index(cqe);
+	u32 wqe_offset     = stride_ix << rq->mpwqe.log_stride_sz;
+	u32 head_offset    = wqe_offset & (PAGE_SIZE - 1);
+	u32 page_idx       = wqe_offset >> PAGE_SHIFT;
+	struct mlx5e_rx_wqe *wqe;
 	struct sk_buff *skb;
 	u16 cqe_bcnt;
 
@@ -1026,18 +1076,13 @@ void mlx5e_handle_rx_cqe_mpwrq(struct ml
 		goto mpwrq_cqe_out;
 	}
 
-	skb = napi_alloc_skb(rq->cq.napi,
-			     ALIGN(MLX5_MPWRQ_SMALL_PACKET_THRESHOLD,
-				   sizeof(long)));
-	if (unlikely(!skb)) {
-		rq->stats.buff_alloc_err++;
-		goto mpwrq_cqe_out;
-	}
-
-	prefetchw(skb->data);
 	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
 
-	mlx5e_mpwqe_fill_rx_skb(rq, cqe, wi, cqe_bcnt, skb);
+	skb = rq->mpwqe.skb_from_cqe_mpwrq(rq, wi, cqe_bcnt, head_offset,
+					   page_idx);
+	if (unlikely(!skb))
+		goto mpwrq_cqe_out;
+
 	mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
 	napi_gro_receive(rq->cq.napi, skb);
 
@@ -1045,6 +1090,7 @@ mpwrq_cqe_out:
 	if (likely(wi->consumed_strides < rq->mpwqe.num_strides))
 		return;
 
+	wqe = mlx5_wq_ll_get_wqe(&rq->wq, wqe_id);
 	mlx5e_free_rx_mpwqe(rq, wi);
 	mlx5_wq_ll_pop(&rq->wq, cqe->wqe_id, &wqe->next.next_wqe_index);
 }
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -782,6 +782,9 @@ static inline u64 get_cqe_ts(struct mlx5
 	return (u64)lo | ((u64)hi << 32);
 }
 
+#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE	(9)
+#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE	(6)
+
 struct mpwrq_cqe_bc {
 	__be16	filler_consumed_strides;
 	__be16	byte_cnt;
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1038,7 +1038,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         reserved_at_398[0x3];
 	u8         log_max_tis_per_sq[0x5];
 
-	u8         reserved_at_3a0[0x3];
+	u8         ext_stride_num_range[0x1];
+	u8         reserved_at_3a1[0x2];
 	u8         log_max_stride_sz_rq[0x5];
 	u8         reserved_at_3a8[0x3];
 	u8         log_min_stride_sz_rq[0x5];
@@ -1205,9 +1206,9 @@ struct mlx5_ifc_wq_bits {
 	u8         log_hairpin_num_packets[0x5];
 	u8         reserved_at_128[0x3];
 	u8         log_hairpin_data_sz[0x5];
-	u8         reserved_at_130[0x5];
 
-	u8         log_wqe_num_of_strides[0x3];
+	u8         reserved_at_130[0x4];
+	u8         log_wqe_num_of_strides[0x4];
 	u8         two_byte_shift_en[0x1];
 	u8         reserved_at_139[0x4];
 	u8         log_wqe_stride_size[0x3];