Blob Blame History Raw
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 18 Dec 2019 02:51:19 -0600
Subject: net/mlx5: E-switch, Protect eswitch mode changes
Patch-mainline: v5.7-rc1
Git-commit: 8e0aa4bc959c98c14ed0aaee522d77ca52690189
References: jsc#SLE-15172

Currently eswitch mode change is occurring from 2 different execution
contexts as below.
1. sriov sysfs enable/disable
2. devlink eswitch set commands

Both of them need to access eswitch related data structures in
synchronized manner.
Without any synchronization below race condition exist.

SR-IOV enable/disable with devlink eswitch mode change:

          cpu-0                         cpu-1
          -----                         -----
mlx5_device_disable_sriov()        mlx5_devlink_eswitch_mode_set()
  mlx5_eswitch_disable()             esw_offloads_stop()
    esw_offloads_disable()             mlx5_eswitch_disable()
                                         esw_offloads_disable()

Hence, they are synchronized using a new mode_lock.
eswitch's state_lock is not used as it can lead to a deadlock scenario
below and state_lock is only for vport and fdb exclusive access.

ip link set vf <param>
  netlink rcv_msg() - Lock A
    rtnl_lock
    vfinfo()
      esw->state_lock() - Lock B
devlink eswitch_set
   devlink_mutex
     esw->state_lock() - Lock B
       attach_netdev()
         register_netdev()
           rtnl_lock - Lock A

Alternatives considered:
1. Acquiring rtnl lock before taking esw->state_lock to follow similar
locking sequence as ip link flow during eswitch mode set.
rtnl lock is not good idea for two reasons.
(a) Holding rtnl lock for several hundred device commands is not good
    idea.
(b) It leads to below and more similar deadlocks.

devlink eswitch_set
   devlink_mutex
     rtnl_lock - Lock A
       esw->state_lock() - Lock B
         eswitch_disable()
           reload()
             ib_register_device()
               ib_cache_setup_one()
                 rtnl_lock()

2. Exporting devlink lock may lead to undesired use of it in vendor
driver(s) in future.

3. Unloading representors outside of the mode_lock requires
serialization with other process trying to enable the eswitch.

4. Differing the representors life cycle to a different workqueue
requires synchronization with func_change_handler workqueue.

Reviewed-by: Roi Dayan <roid@mellanox.com>
Reviewed-by: Bodong Wang <bodong@mellanox.com>
Reviewed-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c          |   54 +++++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          |   11 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |  105 ++++++++-----
 drivers/net/ethernet/mellanox/mlx5/core/sriov.c            |    3 
 4 files changed, 123 insertions(+), 50 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2092,7 +2092,7 @@ mlx5_eswitch_update_num_of_vfs(struct ml
 }
 
 /**
- * mlx5_eswitch_enable - Enable eswitch
+ * mlx5_eswitch_enable_locked - Enable eswitch
  * @esw:	Pointer to eswitch
  * @mode:	Eswitch mode to enable
  * @num_vfs:	Enable eswitch for given number of VFs. This is optional.
@@ -2104,16 +2104,17 @@ mlx5_eswitch_update_num_of_vfs(struct ml
  *		eswitch. Caller should pass < 0 when num_vfs should be
  *		completely ignored. This is typically the case when eswitch
  *		is enabled without sriov regardless of PF/ECPF system.
- * mlx5_eswitch_enable() Enables eswitch in either legacy or offloads mode.
- * If num_vfs >=0 is provided, it setup VF related eswitch vports. It returns
- * 0 on success or error code on failure.
+ * mlx5_eswitch_enable_locked() Enables eswitch in either legacy or offloads
+ * mode. If num_vfs >=0 is provided, it setup VF related eswitch vports.
+ * It returns 0 on success or error code on failure.
  */
-int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int mode, int num_vfs)
+int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs)
 {
 	int err;
 
-	if (!ESW_ALLOWED(esw) ||
-	    !MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
+	lockdep_assert_held(&esw->mode_lock);
+
+	if (!MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ft_support)) {
 		esw_warn(esw->dev, "FDB is not supported, aborting ...\n");
 		return -EOPNOTSUPP;
 	}
@@ -2164,11 +2165,34 @@ abort:
 	return err;
 }
 
-void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf)
+/**
+ * mlx5_eswitch_enable - Enable eswitch
+ * @esw:	Pointer to eswitch
+ * @num_vfs:	Enable eswitch swich for given number of VFs.
+ *		Caller must pass num_vfs > 0 when enabling eswitch for
+ *		vf vports.
+ * mlx5_eswitch_enable() returns 0 on success or error code on failure.
+ */
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
+{
+	int ret;
+
+	if (!ESW_ALLOWED(esw))
+		return 0;
+
+	mutex_lock(&esw->mode_lock);
+	ret = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY, num_vfs);
+	mutex_unlock(&esw->mode_lock);
+	return ret;
+}
+
+void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf)
 {
 	int old_mode;
 
-	if (!ESW_ALLOWED(esw) || esw->mode == MLX5_ESWITCH_NONE)
+	lockdep_assert_held_write(&esw->mode_lock);
+
+	if (esw->mode == MLX5_ESWITCH_NONE)
 		return;
 
 	esw_info(esw->dev, "Disable: mode(%s), nvfs(%d), active vports(%d)\n",
@@ -2197,6 +2221,16 @@ void mlx5_eswitch_disable(struct mlx5_es
 		mlx5_eswitch_clear_vf_vports_info(esw);
 }
 
+void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf)
+{
+	if (!ESW_ALLOWED(esw))
+		return;
+
+	mutex_lock(&esw->mode_lock);
+	mlx5_eswitch_disable_locked(esw, clear_vf);
+	mutex_unlock(&esw->mode_lock);
+}
+
 int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eswitch *esw;
@@ -2248,6 +2282,7 @@ int mlx5_eswitch_init(struct mlx5_core_d
 	hash_init(esw->offloads.mod_hdr.hlist);
 	atomic64_set(&esw->offloads.num_flows, 0);
 	mutex_init(&esw->state_lock);
+	mutex_init(&esw->mode_lock);
 
 	mlx5_esw_for_all_vports(esw, i, vport) {
 		vport->vport = mlx5_eswitch_index_to_vport_num(esw, i);
@@ -2282,6 +2317,7 @@ void mlx5_eswitch_cleanup(struct mlx5_es
 	esw->dev->priv.eswitch = NULL;
 	destroy_workqueue(esw->work_queue);
 	esw_offloads_cleanup_reps(esw);
+	mutex_destroy(&esw->mode_lock);
 	mutex_destroy(&esw->state_lock);
 	mutex_destroy(&esw->offloads.mod_hdr.lock);
 	mutex_destroy(&esw->offloads.encap_tbl_lock);
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -258,6 +258,11 @@ struct mlx5_eswitch {
 	 */
 	struct mutex            state_lock;
 
+	/* Protects eswitch mode change that occurs via one or more
+	 * user commands, i.e. sriov state change, devlink commands.
+	 */
+	struct mutex mode_lock;
+
 	struct {
 		bool            enabled;
 		u32             root_tsar_id;
@@ -298,7 +303,9 @@ int mlx5_eswitch_init(struct mlx5_core_d
 void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
 
 #define MLX5_ESWITCH_IGNORE_NUM_VFS (-1)
-int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int mode, int num_vfs);
+int mlx5_eswitch_enable_locked(struct mlx5_eswitch *esw, int mode, int num_vfs);
+int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs);
+void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw, bool clear_vf);
 void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf);
 int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 			       u16 vport, u8 mac[ETH_ALEN]);
@@ -674,7 +681,7 @@ void mlx5_eswitch_unload_vf_vports(struc
 /* eswitch API stubs */
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
 static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
-static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int mode, int num_vfs) { return 0; }
+static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { return 0; }
 static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw, bool clear_vf) {}
 static inline bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1) { return true; }
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1591,14 +1591,14 @@ static int esw_offloads_start(struct mlx
 		return -EINVAL;
 	}
 
-	mlx5_eswitch_disable(esw, false);
-	err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_OFFLOADS,
-				  esw->dev->priv.sriov.num_vfs);
+	mlx5_eswitch_disable_locked(esw, false);
+	err = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_OFFLOADS,
+					 esw->dev->priv.sriov.num_vfs);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Failed setting eswitch to offloads");
-		err1 = mlx5_eswitch_enable(esw, MLX5_ESWITCH_LEGACY,
-					   MLX5_ESWITCH_IGNORE_NUM_VFS);
+		err1 = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY,
+						  MLX5_ESWITCH_IGNORE_NUM_VFS);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to legacy");
@@ -2396,13 +2396,13 @@ static int esw_offloads_stop(struct mlx5
 {
 	int err, err1;
 
-	mlx5_eswitch_disable(esw, false);
-	err = mlx5_eswitch_enable(esw, MLX5_ESWITCH_LEGACY,
-				  MLX5_ESWITCH_IGNORE_NUM_VFS);
+	mlx5_eswitch_disable_locked(esw, false);
+	err = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_LEGACY,
+					 MLX5_ESWITCH_IGNORE_NUM_VFS);
 	if (err) {
 		NL_SET_ERR_MSG_MOD(extack, "Failed setting eswitch to legacy");
-		err1 = mlx5_eswitch_enable(esw, MLX5_ESWITCH_OFFLOADS,
-					   MLX5_ESWITCH_IGNORE_NUM_VFS);
+		err1 = mlx5_eswitch_enable_locked(esw, MLX5_ESWITCH_OFFLOADS,
+						  MLX5_ESWITCH_IGNORE_NUM_VFS);
 		if (err1) {
 			NL_SET_ERR_MSG_MOD(extack,
 					   "Failed setting eswitch back to offloads");
@@ -2524,6 +2524,7 @@ int mlx5_devlink_eswitch_mode_set(struct
 				  struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
 	u16 cur_mlx5_mode, mlx5_mode = 0;
 	int err;
 
@@ -2531,40 +2532,50 @@ int mlx5_devlink_eswitch_mode_set(struct
 	if (err)
 		return err;
 
-	err = eswitch_devlink_esw_mode_check(dev->priv.eswitch);
-	if (err)
-		return err;
-
-	cur_mlx5_mode = dev->priv.eswitch->mode;
-
 	if (esw_mode_from_devlink(mode, &mlx5_mode))
 		return -EINVAL;
 
+	mutex_lock(&esw->mode_lock);
+	err = eswitch_devlink_esw_mode_check(esw);
+	if (err)
+		goto unlock;
+
+	cur_mlx5_mode = esw->mode;
+
 	if (cur_mlx5_mode == mlx5_mode)
-		return 0;
+		goto unlock;
 
 	if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV)
-		return esw_offloads_start(dev->priv.eswitch, extack);
+		err = esw_offloads_start(esw, extack);
 	else if (mode == DEVLINK_ESWITCH_MODE_LEGACY)
-		return esw_offloads_stop(dev->priv.eswitch, extack);
+		err = esw_offloads_stop(esw, extack);
 	else
-		return -EINVAL;
+		err = -EINVAL;
+
+unlock:
+	mutex_unlock(&esw->mode_lock);
+	return err;
 }
 
 int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
 	int err;
 
 	err = mlx5_eswitch_check(dev);
 	if (err)
 		return err;
 
+	mutex_lock(&esw->mode_lock);
 	err = eswitch_devlink_esw_mode_check(dev->priv.eswitch);
 	if (err)
-		return err;
+		goto unlock;
 
-	return esw_mode_to_devlink(dev->priv.eswitch->mode, mode);
+	err = esw_mode_to_devlink(esw->mode, mode);
+unlock:
+	mutex_unlock(&esw->mode_lock);
+	return err;
 }
 
 int mlx5_devlink_eswitch_inline_mode_set(struct devlink *devlink, u8 mode,
@@ -2579,18 +2590,20 @@ int mlx5_devlink_eswitch_inline_mode_set
 	if (err)
 		return err;
 
+	mutex_lock(&esw->mode_lock);
 	err = eswitch_devlink_esw_mode_check(esw);
 	if (err)
-		return err;
+		goto out;
 
 	switch (MLX5_CAP_ETH(dev, wqe_inline_mode)) {
 	case MLX5_CAP_INLINE_MODE_NOT_REQUIRED:
 		if (mode == DEVLINK_ESWITCH_INLINE_MODE_NONE)
-			return 0;
+			goto out;
 		/* fall through */
 	case MLX5_CAP_INLINE_MODE_L2:
 		NL_SET_ERR_MSG_MOD(extack, "Inline mode can't be set");
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		goto out;
 	case MLX5_CAP_INLINE_MODE_VPORT_CONTEXT:
 		break;
 	}
@@ -2598,7 +2611,8 @@ int mlx5_devlink_eswitch_inline_mode_set
 	if (atomic64_read(&esw->offloads.num_flows) > 0) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't set inline mode when flows are configured");
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		goto out;
 	}
 
 	err = esw_inline_mode_from_devlink(mode, &mlx5_mode);
@@ -2615,6 +2629,7 @@ int mlx5_devlink_eswitch_inline_mode_set
 	}
 
 	esw->offloads.inline_mode = mlx5_mode;
+	mutex_unlock(&esw->mode_lock);
 	return 0;
 
 revert_inline_mode:
@@ -2624,6 +2639,7 @@ revert_inline_mode:
 						 vport,
 						 esw->offloads.inline_mode);
 out:
+	mutex_unlock(&esw->mode_lock);
 	return err;
 }
 
@@ -2637,11 +2653,15 @@ int mlx5_devlink_eswitch_inline_mode_get
 	if (err)
 		return err;
 
+	mutex_lock(&esw->mode_lock);
 	err = eswitch_devlink_esw_mode_check(esw);
 	if (err)
-		return err;
+		goto unlock;
 
-	return esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
+	err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
+unlock:
+	mutex_unlock(&esw->mode_lock);
+	return err;
 }
 
 int mlx5_devlink_eswitch_encap_mode_set(struct devlink *devlink,
@@ -2656,30 +2676,36 @@ int mlx5_devlink_eswitch_encap_mode_set(
 	if (err)
 		return err;
 
+	mutex_lock(&esw->mode_lock);
 	err = eswitch_devlink_esw_mode_check(esw);
 	if (err)
-		return err;
+		goto unlock;
 
 	if (encap != DEVLINK_ESWITCH_ENCAP_MODE_NONE &&
 	    (!MLX5_CAP_ESW_FLOWTABLE_FDB(dev, reformat) ||
-	     !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap)))
-		return -EOPNOTSUPP;
+	     !MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))) {
+		err = -EOPNOTSUPP;
+		goto unlock;
+	}
 
-	if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC)
-		return -EOPNOTSUPP;
+	if (encap && encap != DEVLINK_ESWITCH_ENCAP_MODE_BASIC) {
+		err = -EOPNOTSUPP;
+		goto unlock;
+	}
 
 	if (esw->mode == MLX5_ESWITCH_LEGACY) {
 		esw->offloads.encap = encap;
-		return 0;
+		goto unlock;
 	}
 
 	if (esw->offloads.encap == encap)
-		return 0;
+		goto unlock;
 
 	if (atomic64_read(&esw->offloads.num_flows) > 0) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Can't set encapsulation when flows are configured");
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		goto unlock;
 	}
 
 	esw_destroy_offloads_fdb_tables(esw);
@@ -2695,6 +2721,8 @@ int mlx5_devlink_eswitch_encap_mode_set(
 		(void)esw_create_offloads_fdb_tables(esw, esw->nvports);
 	}
 
+unlock:
+	mutex_unlock(&esw->mode_lock);
 	return err;
 }
 
@@ -2709,11 +2737,14 @@ int mlx5_devlink_eswitch_encap_mode_get(
 	if (err)
 		return err;
 
+	mutex_lock(&esw->mode_lock);
 	err = eswitch_devlink_esw_mode_check(esw);
 	if (err)
-		return err;
+		goto unlock;
 
 	*encap = esw->offloads.encap;
+unlock:
+	mutex_unlock(&esw->mode_lock);
 	return 0;
 }
 
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -77,8 +77,7 @@ static int mlx5_device_enable_sriov(stru
 	if (!MLX5_ESWITCH_MANAGER(dev))
 		goto enable_vfs_hca;
 
-	err = mlx5_eswitch_enable(dev->priv.eswitch, MLX5_ESWITCH_LEGACY,
-				  num_vfs);
+	err = mlx5_eswitch_enable(dev->priv.eswitch, num_vfs);
 	if (err) {
 		mlx5_core_warn(dev,
 			       "failed to enable eswitch SRIOV (%d)\n", err);