Blob Blame History Raw
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:24 -0700
Subject: net/mlx5e: xsk: Use KLM to protect frame overrun in unaligned mode
Patch-mainline: v6.1-rc1
Git-commit: 139213451046eb6653a058a8922796f29b267b0f
References: jsc#PED-1549

XSK RQs support striding RQ linear mode, but the stride size may be
bigger than the XSK frame size, because:

1. The stride size must be a power of two.

2. The stride size must be equal to the UMR page size. Each XSK frame is
treated as a separate page, because they aren't necessarily adjacent in
physical memory, so the driver can't put more than one stride per page.

3. The minimal MTT page size is 4096 on older firmware.

That means that if XSK frame size is 2048 or not a power of two, the
strides may be bigger than XSK frames. Normally, it's not a problem if
the hardware enforces the MTU. However, traffic between vports skips the
hardware MTU check, and oversized packets may be received.

If an oversized packet is bigger than the XSK frame but not bigger than
the stride, it will cause overwriting of the adjacent UMEM region. If
the packet takes more than one stride, they can be recycled for reuse,
so it's not a problem when the XSK frame size matches the stride size.

Work around the above issue by leveraging KLM to make a more
fine-grained mapping. The beginning of each stride is mapped to the
frame memory, and the padding up to the closest power of two is mapped
to the overflow page that doesn't belong to UMEM. This way, application
data corruption won't happen upon receiving packets bigger than MTU.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h        |    1 
 drivers/net/ethernet/mellanox/mlx5/core/en/params.c |   45 ++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c |   27 ++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   |   27 ++++++++++--
 4 files changed, 90 insertions(+), 10 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -680,6 +680,7 @@ struct mlx5e_hw_gro_data {
 enum mlx5e_mpwrq_umr_mode {
 	MLX5E_MPWRQ_UMR_MODE_ALIGNED,
 	MLX5E_MPWRQ_UMR_MODE_UNALIGNED,
+	MLX5E_MPWRQ_UMR_MODE_OVERSIZED,
 };
 
 struct mlx5e_rq {
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -36,8 +36,28 @@ mlx5e_mpwrq_umr_mode(struct mlx5_core_de
 	 * 1. MTT - direct mapping in page granularity.
 	 * 2. KSM - indirect mapping to another MKey to arbitrary addresses, but
 	 *    all mappings have the same size.
+	 * 3. KLM - indirect mapping to another MKey to arbitrary addresses, and
+	 *    mappings can have different sizes.
 	 */
+	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
 	bool unaligned = xsk ? xsk->unaligned : false;
+	bool oversized = false;
+
+	if (xsk) {
+		oversized = xsk->chunk_size < (1 << page_shift);
+		WARN_ON_ONCE(xsk->chunk_size > (1 << page_shift));
+	}
+
+	/* XSK frame size doesn't match the UMR page size, either because the
+	 * frame size is not a power of two, or it's smaller than the minimal
+	 * page size supported by the firmware.
+	 * It's possible to receive packets bigger than MTU in certain setups.
+	 * To avoid writing over the XSK frame boundary, the top region of each
+	 * stride is mapped to a garbage page, resulting in two mappings of
+	 * different sizes per frame.
+	 */
+	if (oversized)
+		return MLX5E_MPWRQ_UMR_MODE_OVERSIZED;
 
 	/* XSK frames can start at arbitrary unaligned locations, but they all
 	 * have the same size which is a power of two. It allows to optimize to
@@ -60,6 +80,8 @@ u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e
 		return sizeof(struct mlx5_mtt);
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return sizeof(struct mlx5_ksm);
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return sizeof(struct mlx5_klm) * 2;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode);
 	return 0;
@@ -145,11 +167,21 @@ u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_
 u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
 				enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)
-		return min(MLX5E_MAX_RQ_NUM_KSMS,
-			   1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
+	/* Same limits apply to KSMs and KLMs. */
+	u32 klm_limit = min(MLX5E_MAX_RQ_NUM_KSMS,
+			    1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
 
-	return MLX5E_MAX_RQ_NUM_MTTS;
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return MLX5E_MAX_RQ_NUM_MTTS;
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return klm_limit;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		/* Each entry is two KLMs. */
+		return klm_limit / 2;
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
+	return 0;
 }
 
 static u8 mlx5e_mpwrq_max_log_rq_size(struct mlx5_core_dev *mdev, u8 page_shift,
@@ -1084,6 +1116,11 @@ static u8 mlx5e_build_icosq_log_wq_sz(st
 			xsk.unaligned = true;
 			max_xsk_wqebbs = max(max_xsk_wqebbs,
 				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
+
+			/* XSK unaligned mode, frame size is not equal to stride size. */
+			xsk.chunk_size -= 1;
+			max_xsk_wqebbs = max(max_xsk_wqebbs,
+				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
 		}
 
 		wqebbs += max_xsk_wqebbs;
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -41,7 +41,15 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
+	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
+	} else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
@@ -51,11 +59,22 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5
 			};
 		}
 	} else {
+		__be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
+					      rq->xsk_pool->chunk_size);
+		__be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size);
+
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
-			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+				.bcount = frame_size,
+			};
+			umr_wqe->inline_klms[(i << 1) + 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(rq->wqe_overflow.addr),
+				.bcount = pad_size,
 			};
 		}
 	}
@@ -70,6 +89,8 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5
 	offset = ix * rq->mpwqe.mtts_per_wqe;
 	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
 		offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
+	else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_OVERSIZED))
+		offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -299,6 +299,8 @@ static u8 mlx5e_mpwrq_access_mode(enum m
 		return MLX5_MKC_ACCESS_MODE_MTT;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return MLX5_MKC_ACCESS_MODE_KSM;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return MLX5_MKC_ACCESS_MODE_KLMS;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
 	return 0;
@@ -307,10 +309,12 @@ static u8 mlx5e_mpwrq_access_mode(enum m
 static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 				 u32 npages, u8 page_shift, u32 *umr_mkey,
 				 dma_addr_t filler_addr,
-				 enum mlx5e_mpwrq_umr_mode umr_mode)
+				 enum mlx5e_mpwrq_umr_mode umr_mode,
+				 u32 xsk_chunk_size)
 {
 	struct mlx5_mtt *mtt;
 	struct mlx5_ksm *ksm;
+	struct mlx5_klm *klm;
 	u32 octwords;
 	int inlen;
 	void *mkc;
@@ -347,7 +351,8 @@ static int mlx5e_create_umr_mkey(struct
 	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET64(mkc, mkc, len, npages << page_shift);
 	MLX5_SET(mkc, mkc, translations_octword_size, octwords);
-	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+	if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
+		MLX5_SET(mkc, mkc, log_page_size, page_shift);
 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octwords);
 
 	/* Initialize the mkey with all MTTs pointing to a default
@@ -357,6 +362,21 @@ static int mlx5e_create_umr_mkey(struct
 	 * to the default page.
 	 */
 	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		klm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+		for (i = 0; i < npages; i++) {
+			klm[i << 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32(xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+			klm[(i << 1) + 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32((1 << page_shift) - xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+		}
+		break;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 		for (i = 0; i < npages; i++)
@@ -415,6 +435,7 @@ static int mlx5e_create_umr_klm_mkey(str
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
+	u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
 	u32 wq_size = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	u32 num_entries, max_num_entries;
 	u32 umr_mkey;
@@ -432,7 +453,7 @@ static int mlx5e_create_rq_umr_mkey(stru
 
 	err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
 				    &umr_mkey, rq->wqe_overflow.addr,
-				    rq->mpwqe.umr_mode);
+				    rq->mpwqe.umr_mode, xsk_chunk_size);
 	rq->mpwqe.umr_mkey_be = cpu_to_be32(umr_mkey);
 	return err;
 }