Blob Blame History Raw
From: Mark Bloch <mbloch@nvidia.com>
Date: Tue, 3 Aug 2021 16:19:56 -0700
Subject: net/mlx5: Lag, properly lock eswitch if needed
Patch-mainline: v5.15-rc1
Git-commit: cac1eb2cf2e338260c5e2ffb098dfa0508ee40aa
References: jsc#SLE-19253

Currently when doing hardware lag we check the eswitch mode
but as this isn't done under a lock the check isn't valid.

As the code needs to sync between two different devices an extra
care is needed.

- When going to change eswitch mode, if hardware lag is active destroy it.
- While changing eswitch modes block any hardware bond creation.
- Delay handling bonding events until there are no mode changes in
  progress.
- When attaching a new mdev to lag, block until there is no mode change
  in progress. In order for the mode change to finish the interface lock
  will have to be taken. Release the lock and sleep for 100ms to
  allow forward progress. As this is a very rare condition (can happen if
  the user unbinds and binds a PCI function while also changing eswitch
  mode of the other PCI function) it has no real world impact.

As taking multiple eswitch mode locks is now required lockdep will
complain about a possible deadlock. Register a key per eswitch to make
lockdep happy.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c          |   24 +++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          |    5 
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |    5 
 drivers/net/ethernet/mellanox/mlx5/core/lag.c              |   83 +++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/lag.h              |    1 
 drivers/net/ethernet/mellanox/mlx5/core/main.c             |    5 
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h        |    2 
 7 files changed, 107 insertions(+), 18 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1458,8 +1458,6 @@ int mlx5_eswitch_enable_locked(struct ml
 
 	esw->mode = mode;
 
-	mlx5_lag_update(esw->dev);
-
 	if (mode == MLX5_ESWITCH_LEGACY) {
 		err = esw_legacy_enable(esw);
 	} else {
@@ -1506,6 +1504,7 @@ int mlx5_eswitch_enable(struct mlx5_eswi
 	if (!mlx5_esw_allowed(esw))
 		return 0;
 
+	mlx5_lag_disable_change(esw->dev);
 	down_write(&esw->mode_lock);
 	if (esw->mode == MLX5_ESWITCH_NONE) {
 		ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs);
@@ -1519,6 +1518,7 @@ int mlx5_eswitch_enable(struct mlx5_eswi
 			esw->esw_funcs.num_vfs = num_vfs;
 	}
 	up_write(&esw->mode_lock);
+	mlx5_lag_enable_change(esw->dev);
 	return ret;
 }
 
@@ -1550,8 +1550,6 @@ void mlx5_eswitch_disable_locked(struct
 	old_mode = esw->mode;
 	esw->mode = MLX5_ESWITCH_NONE;
 
-	mlx5_lag_update(esw->dev);
-
 	if (old_mode == MLX5_ESWITCH_OFFLOADS)
 		mlx5_rescan_drivers(esw->dev);
 
@@ -1567,10 +1565,12 @@ void mlx5_eswitch_disable(struct mlx5_es
 	if (!mlx5_esw_allowed(esw))
 		return;
 
+	mlx5_lag_disable_change(esw->dev);
 	down_write(&esw->mode_lock);
 	mlx5_eswitch_disable_locked(esw, clear_vf);
 	esw->esw_funcs.num_vfs = 0;
 	up_write(&esw->mode_lock);
+	mlx5_lag_enable_change(esw->dev);
 }
 
 static int mlx5_query_hca_cap_host_pf(struct mlx5_core_dev *dev, void *out)
@@ -1759,7 +1759,9 @@ int mlx5_eswitch_init(struct mlx5_core_d
 	ida_init(&esw->offloads.vport_metadata_ida);
 	xa_init_flags(&esw->offloads.vhca_map, XA_FLAGS_ALLOC);
 	mutex_init(&esw->state_lock);
+	lockdep_register_key(&esw->mode_lock_key);
 	init_rwsem(&esw->mode_lock);
+	lockdep_set_class(&esw->mode_lock, &esw->mode_lock_key);
 
 	esw->enabled_vports = 0;
 	esw->mode = MLX5_ESWITCH_NONE;
@@ -1793,6 +1795,7 @@ void mlx5_eswitch_cleanup(struct mlx5_es
 
 	esw->dev->priv.eswitch = NULL;
 	destroy_workqueue(esw->work_queue);
+	lockdep_unregister_key(&esw->mode_lock_key);
 	mutex_destroy(&esw->state_lock);
 	WARN_ON(!xa_empty(&esw->offloads.vhca_map));
 	xa_destroy(&esw->offloads.vhca_map);
@@ -2364,10 +2367,23 @@ int mlx5_esw_try_lock(struct mlx5_eswitc
  */
 void mlx5_esw_unlock(struct mlx5_eswitch *esw)
 {
+	if (!mlx5_esw_allowed(esw))
+		return;
 	up_write(&esw->mode_lock);
 }
 
 /**
+ * mlx5_esw_lock() - Take write lock on esw mode lock
+ * @esw: eswitch device.
+ */
+void mlx5_esw_lock(struct mlx5_eswitch *esw)
+{
+	if (!mlx5_esw_allowed(esw))
+		return;
+	down_write(&esw->mode_lock);
+}
+
+/**
  * mlx5_eswitch_get_total_vports - Get total vports of the eswitch
  *
  * @dev: Pointer to core device
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -323,6 +323,7 @@ struct mlx5_eswitch {
 		u32             large_group_num;
 	}  params;
 	struct blocking_notifier_head n_head;
+	struct lock_class_key mode_lock_key;
 };
 
 void esw_offloads_disable(struct mlx5_eswitch *esw);
@@ -705,6 +706,7 @@ void mlx5_esw_get(struct mlx5_core_dev *
 void mlx5_esw_put(struct mlx5_core_dev *dev);
 int mlx5_esw_try_lock(struct mlx5_eswitch *esw);
 void mlx5_esw_unlock(struct mlx5_eswitch *esw);
+void mlx5_esw_lock(struct mlx5_eswitch *esw);
 
 void esw_vport_change_handle_locked(struct mlx5_vport *vport);
 
@@ -725,6 +727,9 @@ static inline const u32 *mlx5_esw_query_
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline void mlx5_esw_unlock(struct mlx5_eswitch *esw) { return; }
+static inline void mlx5_esw_lock(struct mlx5_eswitch *esw) { return; }
+
 static inline struct mlx5_flow_handle *
 esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag)
 {
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3056,10 +3056,11 @@ int mlx5_devlink_eswitch_mode_set(struct
 	if (esw_mode_from_devlink(mode, &mlx5_mode))
 		return -EINVAL;
 
+	mlx5_lag_disable_change(esw->dev);
 	err = mlx5_esw_try_lock(esw);
 	if (err < 0) {
 		NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
-		return err;
+		goto enable_lag;
 	}
 	cur_mlx5_mode = err;
 	err = 0;
@@ -3083,6 +3084,8 @@ int mlx5_devlink_eswitch_mode_set(struct
 
 unlock:
 	mlx5_esw_unlock(esw);
+enable_lag:
+	mlx5_lag_enable_change(esw->dev);
 	return err;
 }
 
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@@ -418,21 +418,48 @@ static void mlx5_queue_bond_work(struct
 	queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
 }
 
+static void mlx5_lag_lock_eswitches(struct mlx5_core_dev *dev0,
+				    struct mlx5_core_dev *dev1)
+{
+	if (dev0)
+		mlx5_esw_lock(dev0->priv.eswitch);
+	if (dev1)
+		mlx5_esw_lock(dev1->priv.eswitch);
+}
+
+static void mlx5_lag_unlock_eswitches(struct mlx5_core_dev *dev0,
+				      struct mlx5_core_dev *dev1)
+{
+	if (dev1)
+		mlx5_esw_unlock(dev1->priv.eswitch);
+	if (dev0)
+		mlx5_esw_unlock(dev0->priv.eswitch);
+}
+
 static void mlx5_do_bond_work(struct work_struct *work)
 {
 	struct delayed_work *delayed_work = to_delayed_work(work);
 	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
 					     bond_work);
+	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
+	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
 	int status;
 
 	status = mlx5_dev_list_trylock();
 	if (!status) {
-		/* 1 sec delay. */
 		mlx5_queue_bond_work(ldev, HZ);
 		return;
 	}
 
+	if (ldev->mode_changes_in_progress) {
+		mlx5_dev_list_unlock();
+		mlx5_queue_bond_work(ldev, HZ);
+		return;
+	}
+
+	mlx5_lag_lock_eswitches(dev0, dev1);
 	mlx5_do_bond(ldev);
+	mlx5_lag_unlock_eswitches(dev0, dev1);
 	mlx5_dev_list_unlock();
 }
 
@@ -630,7 +657,7 @@ static void mlx5_ldev_remove_mdev(struct
 }
 
 /* Must be called with intf_mutex held */
-static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
+static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
@@ -638,7 +665,7 @@ static void __mlx5_lag_dev_add_mdev(stru
 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
 	    !MLX5_CAP_GEN(dev, lag_master) ||
 	    MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS)
-		return;
+		return 0;
 
 	tmp_dev = mlx5_get_next_phys_dev(dev);
 	if (tmp_dev)
@@ -648,15 +675,17 @@ static void __mlx5_lag_dev_add_mdev(stru
 		ldev = mlx5_lag_dev_alloc(dev);
 		if (!ldev) {
 			mlx5_core_err(dev, "Failed to alloc lag dev\n");
-			return;
+			return 0;
 		}
 	} else {
+		if (ldev->mode_changes_in_progress)
+			return -EAGAIN;
 		mlx5_ldev_get(ldev);
 	}
 
 	mlx5_ldev_add_mdev(ldev, dev);
 
-	return;
+	return 0;
 }
 
 void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
@@ -667,7 +696,13 @@ void mlx5_lag_remove_mdev(struct mlx5_co
 	if (!ldev)
 		return;
 
+recheck:
 	mlx5_dev_list_lock();
+	if (ldev->mode_changes_in_progress) {
+		mlx5_dev_list_unlock();
+		msleep(100);
+		goto recheck;
+	}
 	mlx5_ldev_remove_mdev(ldev, dev);
 	mlx5_dev_list_unlock();
 	mlx5_ldev_put(ldev);
@@ -675,8 +710,16 @@ void mlx5_lag_remove_mdev(struct mlx5_co
 
 void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
 {
+	int err;
+
+recheck:
 	mlx5_dev_list_lock();
-	__mlx5_lag_dev_add_mdev(dev);
+	err = __mlx5_lag_dev_add_mdev(dev);
+	if (err) {
+		mlx5_dev_list_unlock();
+		msleep(100);
+		goto recheck;
+	}
 	mlx5_dev_list_unlock();
 }
 
@@ -716,6 +759,7 @@ void mlx5_lag_add_netdev(struct mlx5_cor
 
 	if (i >= MLX5_MAX_PORTS)
 		ldev->flags |= MLX5_LAG_FLAG_READY;
+	mlx5_queue_bond_work(ldev, 0);
 }
 
 bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
@@ -789,19 +833,36 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_
 }
 EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
 
-void mlx5_lag_update(struct mlx5_core_dev *dev)
+void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
 {
+	struct mlx5_core_dev *dev0;
+	struct mlx5_core_dev *dev1;
 	struct mlx5_lag *ldev;
 
 	mlx5_dev_list_lock();
+
 	ldev = mlx5_lag_dev(dev);
-	if (!ldev)
-		goto unlock;
+	dev0 = ldev->pf[MLX5_LAG_P1].dev;
+	dev1 = ldev->pf[MLX5_LAG_P2].dev;
 
-	mlx5_do_bond(ldev);
+	ldev->mode_changes_in_progress++;
+	if (__mlx5_lag_is_active(ldev)) {
+		mlx5_lag_lock_eswitches(dev0, dev1);
+		mlx5_disable_lag(ldev);
+		mlx5_lag_unlock_eswitches(dev0, dev1);
+	}
+	mlx5_dev_list_unlock();
+}
+
+void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
 
-unlock:
+	mlx5_dev_list_lock();
+	ldev = mlx5_lag_dev(dev);
+	ldev->mode_changes_in_progress--;
 	mlx5_dev_list_unlock();
+	mlx5_queue_bond_work(ldev, 0);
 }
 
 struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@@ -39,6 +39,7 @@ struct lag_tracker {
  */
 struct mlx5_lag {
 	u8                        flags;
+	int			  mode_changes_in_progress;
 	bool			  shared_fdb;
 	u8                        v2p_map[MLX5_MAX_PORTS];
 	struct kref               ref;
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1179,6 +1179,7 @@ static int mlx5_load(struct mlx5_core_de
 		goto err_ec;
 	}
 
+	mlx5_lag_add_mdev(dev);
 	err = mlx5_sriov_attach(dev);
 	if (err) {
 		mlx5_core_err(dev, "sriov init failed %d\n", err);
@@ -1186,11 +1187,11 @@ static int mlx5_load(struct mlx5_core_de
 	}
 
 	mlx5_sf_dev_table_create(dev);
-	mlx5_lag_add_mdev(dev);
 
 	return 0;
 
 err_sriov:
+	mlx5_lag_remove_mdev(dev);
 	mlx5_ec_cleanup(dev);
 err_ec:
 	mlx5_sf_hw_table_destroy(dev);
@@ -1222,9 +1223,9 @@ err_irq_table:
 
 static void mlx5_unload(struct mlx5_core_dev *dev)
 {
-	mlx5_lag_remove_mdev(dev);
 	mlx5_sf_dev_table_destroy(dev);
 	mlx5_sriov_detach(dev);
+	mlx5_lag_remove_mdev(dev);
 	mlx5_ec_cleanup(dev);
 	mlx5_sf_hw_table_destroy(dev);
 	mlx5_vhca_event_stop(dev);
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -168,6 +168,8 @@ void mlx5_lag_add_netdev(struct mlx5_cor
 void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev, struct net_device *netdev);
 void mlx5_lag_add_mdev(struct mlx5_core_dev *dev);
 void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev);
+void mlx5_lag_disable_change(struct mlx5_core_dev *dev);
+void mlx5_lag_enable_change(struct mlx5_core_dev *dev);
 
 int mlx5_events_init(struct mlx5_core_dev *dev);
 void mlx5_events_cleanup(struct mlx5_core_dev *dev);