From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:50 -0700
Subject: RDMA/mlx5: Add shared FDB support
Patch-mainline: v5.15-rc1
Git-commit: c446d9da64075b1c8b55df9acaae72c7686c59a5
References: jsc#SLE-19250

Shared FDB allows the creation of a single RDMA device that holds
representors from both eswitches. As shared FDB is only active when both
uplink representors are enslaved, there is a single RDMA port that
represents both uplinks.

The number of ports is the number of vports on both eswitches minus one,
as we only need one port for both uplinks.
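
For reference, a minimal user-space sketch of the port accounting this
patch implements. The helper names and vport counts below are
hypothetical and for illustration only; the actual logic lives in
mlx5_ib_vport_rep_load() in the diff below.

#include <stdio.h>

/* Total IB ports on the master's RDMA device: one slot per vport on
 * both eswitches, minus one because the two uplink representors are
 * folded into a single IB port. */
static int shared_fdb_num_ports(int master_vports, int slave_vports)
{
	return master_vports + slave_vports - 1;
}

/* A non-uplink representor from the slave eswitch is placed in the
 * master's port array after all of the master's own vport slots. */
static int slave_rep_port_index(int slave_vport_index, int master_vports)
{
	return slave_vport_index + master_vports;
}

int main(void)
{
	int master_vports = 4, slave_vports = 4;

	printf("num_ports = %d\n",
	       shared_fdb_num_ports(master_vports, slave_vports)); /* 7 */
	printf("slave rep 1 -> port index %d\n",
	       slave_rep_port_index(1, master_vports));            /* 5 */
	return 0;
}
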

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/infiniband/hw/mlx5/ib_rep.c |   75 +++++++++++++++++++++++++++++++++---
 drivers/infiniband/hw/mlx5/main.c   |   44 ++++++++++++---------
 2 files changed, 95 insertions(+), 24 deletions(-)

--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -8,13 +8,15 @@
 #include "srq.h"
 
 static int
-mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
+		      struct mlx5_eswitch_rep *rep,
+		      int vport_index)
 {
 	struct mlx5_ib_dev *ibdev;
-	int vport_index;
 
 	ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
-	vport_index = rep->vport_index;
+	if (!ibdev)
+		return -EINVAL;
 
 	ibdev->port[vport_index].rep = rep;
 	rep->rep_data[REP_IB].priv = ibdev;
@@ -26,19 +28,39 @@ mlx5_ib_set_vport_rep(struct mlx5_core_d
 	return 0;
 }
 
+static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
+
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
 	u32 num_ports = mlx5_eswitch_get_total_vports(dev);
 	const struct mlx5_ib_profile *profile;
+	struct mlx5_core_dev *peer_dev;
 	struct mlx5_ib_dev *ibdev;
+	u32 peer_num_ports;
 	int vport_index;
 	int ret;
 
+	vport_index = rep->vport_index;
+
+	if (mlx5_lag_is_shared_fdb(dev)) {
+		peer_dev = mlx5_lag_get_peer_mdev(dev);
+		peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);
+		if (mlx5_lag_is_master(dev)) {
+			/* Only 1 ib port is the representor for both uplinks */
+			num_ports += peer_num_ports - 1;
+		} else {
+			if (rep->vport == MLX5_VPORT_UPLINK)
+				return 0;
+			vport_index += peer_num_ports;
+			dev = peer_dev;
+		}
+	}
+
 	if (rep->vport == MLX5_VPORT_UPLINK)
 		profile = &raw_eth_profile;
 	else
-		return mlx5_ib_set_vport_rep(dev, rep);
+		return mlx5_ib_set_vport_rep(dev, rep, vport_index);
 
 	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!ibdev)
@@ -64,6 +86,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_
 		goto fail_add;
 
 	rep->rep_data[REP_IB].priv = ibdev;
+	if (mlx5_lag_is_shared_fdb(dev))
+		mlx5_ib_register_peer_vport_reps(dev);
 
 	return 0;
 
@@ -82,18 +106,45 @@ static void *mlx5_ib_rep_to_dev(struct m
 static void
 mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
+	struct mlx5_core_dev *mdev = mlx5_eswitch_get_core_dev(rep->esw);
 	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
+	int vport_index = rep->vport_index;
 	struct mlx5_ib_port *port;
 
-	port = &dev->port[rep->vport_index];
+	if (WARN_ON(!mdev))
+		return;
+
+	if (mlx5_lag_is_shared_fdb(mdev) &&
+	    !mlx5_lag_is_master(mdev)) {
+		struct mlx5_core_dev *peer_mdev;
+
+		if (rep->vport == MLX5_VPORT_UPLINK)
+			return;
+		peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+		vport_index += mlx5_eswitch_get_total_vports(peer_mdev);
+	}
+
+	if (!dev)
+		return;
+
+	port = &dev->port[vport_index];
 	write_lock(&port->roce.netdev_lock);
 	port->roce.netdev = NULL;
 	write_unlock(&port->roce.netdev_lock);
 	rep->rep_data[REP_IB].priv = NULL;
 	port->rep = NULL;
 
-	if (rep->vport == MLX5_VPORT_UPLINK)
+	if (rep->vport == MLX5_VPORT_UPLINK) {
+		struct mlx5_core_dev *peer_mdev;
+		struct mlx5_eswitch *esw;
+
+		if (mlx5_lag_is_shared_fdb(mdev)) {
+			peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+			esw = peer_mdev->priv.eswitch;
+			mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
+		}
 		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
+	}
 }
 
 static const struct mlx5_eswitch_rep_ops rep_ops = {
@@ -102,6 +153,18 @@ static const struct mlx5_eswitch_rep_ops
 	.get_proto_dev = mlx5_ib_rep_to_dev,
 };
 
+static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_core_dev *peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+	struct mlx5_eswitch *esw;
+
+	if (!peer_mdev)
+		return;
+
+	esw = peer_mdev->priv.eswitch;
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
+}
+
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 					  u16 vport_num)
 {
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -126,6 +126,7 @@ static int get_port_state(struct ib_devi
 
 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
 					   struct net_device *ndev,
+					   struct net_device *upper,
 					   u32 *port_num)
 {
 	struct net_device *rep_ndev;
@@ -137,6 +138,14 @@ static struct mlx5_roce *mlx5_get_rep_ro
 		if (!port->rep)
 			continue;
 
+		if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) {
+			*port_num = i + 1;
+			return &port->roce;
+		}
+
+		if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
+			continue;
+
 		read_lock(&port->roce.netdev_lock);
 		rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
 						  port->rep->vport);
@@ -196,11 +205,12 @@ static int mlx5_netdev_event(struct noti
 		}
 
 		if (ibdev->is_rep)
-			roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
+			roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
 		if (!roce)
 			return NOTIFY_DONE;
-		if ((upper == ndev || (!upper && ndev == roce->netdev))
-		    && ibdev->ib_active) {
+		if ((upper == ndev ||
+		     ((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
+		    ibdev->ib_active) {
 			struct ib_event ibev = { };
 			enum ib_port_state port_state;
 
@@ -3012,7 +3022,7 @@ static int mlx5_eth_lag_init(struct mlx5
 	struct mlx5_flow_table *ft;
 	int err;
 
-	if (!ns || !mlx5_lag_is_roce(mdev))
+	if (!ns || !mlx5_lag_is_active(mdev))
 		return 0;
 
 	err = mlx5_cmd_create_vport_lag(mdev);
@@ -3074,9 +3084,11 @@ static int mlx5_enable_eth(struct mlx5_i
 {
 	int err;
 
-	err = mlx5_nic_vport_enable_roce(dev->mdev);
-	if (err)
-		return err;
+	if (!dev->is_rep && dev->profile != &raw_eth_profile) {
+		err = mlx5_nic_vport_enable_roce(dev->mdev);
+		if (err)
+			return err;
+	}
 
 	err = mlx5_eth_lag_init(dev);
 	if (err)
@@ -3085,7 +3097,8 @@ static int mlx5_enable_eth(struct mlx5_i
 	return 0;
 
 err_disable_roce:
-	mlx5_nic_vport_disable_roce(dev->mdev);
+	if (!dev->is_rep && dev->profile != &raw_eth_profile)
+		mlx5_nic_vport_disable_roce(dev->mdev);
 
 	return err;
 }
@@ -3093,7 +3106,8 @@ err_disable_roce:
 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
 {
 	mlx5_eth_lag_cleanup(dev);
-	mlx5_nic_vport_disable_roce(dev->mdev);
+	if (!dev->is_rep && dev->profile != &raw_eth_profile)
+		mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
 static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num,
@@ -3950,12 +3964,7 @@ static int mlx5_ib_roce_init(struct mlx5
 
 		/* Register only for native ports */
 		err = mlx5_add_netdev_notifier(dev, port_num);
-		if (err || dev->is_rep || !mlx5_is_roce_init_enabled(mdev))
-			/*
-			 * We don't enable ETH interface for
-			 * 1. IB representors
-			 * 2. User disabled ROCE through devlink interface
-			 */
+		if (err)
 			return err;
 
 		err = mlx5_enable_eth(dev);
@@ -3980,8 +3989,7 @@ static void mlx5_ib_roce_cleanup(struct
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		if (!dev->is_rep)
-			mlx5_disable_eth(dev);
+		mlx5_disable_eth(dev);
 
 		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
 		mlx5_remove_netdev_notifier(dev, port_num);
@@ -4037,7 +4045,7 @@ static int mlx5_ib_stage_ib_reg_init(str
 {
 	const char *name;
 
-	if (!mlx5_lag_is_roce(dev->mdev))
+	if (!mlx5_lag_is_active(dev->mdev))
 		name = "mlx5_%d";
 	else
 		name = "mlx5_bond_%d";