From: Dmytro Linkin <dlinkin@nvidia.com>
Date: Mon, 31 May 2021 18:19:50 +0300
Subject: net/mlx5: E-switch, Allow setting share/max tx rate limits of rate
 groups
Patch-mainline: v5.15-rc1
Git-commit: f47e04eb96e02e6bd870dd5ce5da1d612b43b28d
References: jsc#SLE-19253

Provide an eswitch API for controlling group rate limits. Use it to
implement the devlink_ops->rate_node_tx_{share|max}_set() callbacks
(mlx5_esw_devlink_rate_node_tx_{share|max}_set()).
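
For example, once a group exists, its rate ceiling can be adjusted
through the devlink rate API (the group name and value here are
illustrative):

$ devlink port function rate set \
    pci/0000:06:00.0/group_1 tx_max 40gbit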

The share rate creates a relative bandwidth share at the group level.
Within a group, the user can set share rates on the member vports of
that group; those rates are relative to the group's share rate. The
group with the highest share rate gets a BW share of 100, and the rest
of the groups get a value that reflects the ratio between their share
rate and the maximum share rate.

Example:
Create four rate groups with tx_share limits:

$ devlink port function rate add \
    pci/0000:06:00.0/group_1 tx_share 30gbit
$ devlink port function rate add \
    pci/0000:06:00.0/group_2 tx_share 20gbit
$ devlink port function rate add \
    pci/0000:06:00.0/group_3 tx_share 20gbit
$ devlink port function rate add \
    pci/0000:06:00.0/group_4 tx_share 10gbit

Assuming the link speed is 50 Gbit/sec, the ratio divider will be
50 / (30+20+20+10) = 0.625. Normalized rate values for the groups:

<group_1> 30 * 0.625 = 18.75 Gbit/sec
<group_2> 20 * 0.625 = 12.5 Gbit/sec
<group_3> 20 * 0.625 = 12.5 Gbit/sec
<group_4> 10 * 0.625 = 6.25 Gbit/sec

A rate group with an unlimited tx_share rate will receive the minimum
BW value (1 Mbit/sec) if any group with a tx_share rate limit is
present. This ensures such a group does not drop all of its packets
under heavy traffic.
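
The normalization itself reduces to a divider derived from the highest
tx_share, plus a per-group BW share clamped to the firmware range. A
minimal stand-alone sketch of that arithmetic, assuming rates in
Mbit/sec and a firmware max_tsar_bw_share of 100 (the names and
constants are illustrative, mirroring what
esw_qos_calculate_min_rate_divider() and esw_qos_calc_bw_share() do):

#include <stdio.h>

#define FW_MAX_BW_SHARE	100	/* assumed max_tsar_bw_share value */
#define MIN_BW_SHARE	1	/* floor, i.e. the 1 Mbit/sec guarantee */

/* Divider that scales the highest guarantee down to FW_MAX_BW_SHARE. */
static unsigned int calc_divider(const unsigned int *min_rates, int n)
{
	unsigned int max_guarantee = 0, divider;
	int i;

	for (i = 0; i < n; i++)
		if (min_rates[i] > max_guarantee)
			max_guarantee = min_rates[i];
	if (!max_guarantee)
		return 0;
	divider = max_guarantee / FW_MAX_BW_SHARE;
	return divider ? divider : 1;
}

/* BW share of one group, clamped to [MIN_BW_SHARE, FW_MAX_BW_SHARE]. */
static unsigned int calc_bw_share(unsigned int min_rate, unsigned int divider)
{
	unsigned int share;

	if (!divider)
		return 0;	/* no group carries a tx_share limit */
	share = min_rate / divider;
	if (share < MIN_BW_SHARE)
		share = MIN_BW_SHARE;	/* unlimited groups still get 1 */
	if (share > FW_MAX_BW_SHARE)
		share = FW_MAX_BW_SHARE;
	return share;
}

int main(void)
{
	unsigned int rates[] = { 30000, 20000, 20000, 10000 }; /* Mbit/sec */
	unsigned int divider = calc_divider(rates, 4);
	int i;

	/* Prints shares 100, 66, 66, 33: the same ratio as the example. */
	for (i = 0; i < 4; i++)
		printf("group_%d bw_share %u\n", i + 1,
		       calc_bw_share(rates[i], divider));
	return 0;
}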

Co-developed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Dmytro Linkin <dlinkin@nvidia.com>
Reviewed-by: Huy Nguyen <huyn@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c |    2 
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c |  257 ++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h |    4 
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h |    1 
 4 files changed, 225 insertions(+), 39 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -295,6 +295,8 @@ static const struct devlink_ops mlx5_dev
 	.port_function_hw_addr_set = mlx5_devlink_port_function_hw_addr_set,
 	.rate_leaf_tx_share_set = mlx5_esw_devlink_rate_leaf_tx_share_set,
 	.rate_leaf_tx_max_set = mlx5_esw_devlink_rate_leaf_tx_max_set,
+	.rate_node_tx_share_set = mlx5_esw_devlink_rate_node_tx_share_set,
+	.rate_node_tx_max_set = mlx5_esw_devlink_rate_node_tx_max_set,
 	.rate_node_new = mlx5_esw_devlink_rate_node_new,
 	.rate_node_del = mlx5_esw_devlink_rate_node_del,
 #endif
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -16,8 +16,47 @@ struct mlx5_esw_rate_group {
 	u32 max_rate;
 	u32 min_rate;
 	u32 bw_share;
+	struct list_head list;
 };
 
+static int esw_qos_tsar_config(struct mlx5_core_dev *dev, u32 *sched_ctx,
+			       u32 parent_ix, u32 tsar_ix,
+			       u32 max_rate, u32 bw_share)
+{
+	u32 bitmask = 0;
+
+	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
+		return -EOPNOTSUPP;
+
+	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_ix);
+	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
+	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
+	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
+
+	return mlx5_modify_scheduling_element_cmd(dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  sched_ctx,
+						  tsar_ix,
+						  bitmask);
+}
+
+static int esw_qos_group_config(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group,
+				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
+{
+	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
+	struct mlx5_core_dev *dev = esw->dev;
+	int err;
+
+	err = esw_qos_tsar_config(dev, sched_ctx,
+				  esw->qos.root_tsar_ix, group->tsar_ix,
+				  max_rate, bw_share);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch modify group TSAR element failed");
+
+	return err;
+}
+
 static int esw_qos_vport_config(struct mlx5_eswitch *esw,
 				struct mlx5_vport *vport,
 				u32 max_rate, u32 bw_share,
@@ -26,12 +65,8 @@ static int esw_qos_vport_config(struct m
 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
 	struct mlx5_core_dev *dev = esw->dev;
 	void *vport_elem;
-	u32 bitmask = 0;
 	int err;
 
-	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
-		return -EOPNOTSUPP;
-
 	if (!vport->qos.enabled)
 		return -EIO;
 
@@ -40,19 +75,12 @@ static int esw_qos_vport_config(struct m
 	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
 				  element_attributes);
 	MLX5_SET(vport_element, vport_elem, vport_number, vport->vport);
-	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, esw->qos.root_tsar_ix);
-	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
-	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
-	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
-	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
 
-	err = mlx5_modify_scheduling_element_cmd(dev,
-						 SCHEDULING_HIERARCHY_E_SWITCH,
-						 sched_ctx,
-						 vport->qos.esw_tsar_ix,
-						 bitmask);
+	err = esw_qos_tsar_config(dev, sched_ctx, esw->qos.root_tsar_ix, vport->qos.esw_tsar_ix,
+				  max_rate, bw_share);
 	if (err) {
-		esw_warn(esw->dev, "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n",
+		esw_warn(esw->dev,
+			 "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n",
 			 vport->vport, err);
 		NL_SET_ERR_MSG_MOD(extack, "E-Switch modify TSAR vport element failed");
 		return err;
@@ -61,17 +89,30 @@ static int esw_qos_vport_config(struct m
 	return 0;
 }
 
-static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw)
+static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
+					      struct mlx5_esw_rate_group *group,
+					      bool group_level)
 {
 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
 	struct mlx5_vport *evport;
 	u32 max_guarantee = 0;
 	unsigned long i;
 
-	mlx5_esw_for_each_vport(esw, i, evport) {
-		if (!evport->enabled || evport->qos.min_rate < max_guarantee)
-			continue;
-		max_guarantee = evport->qos.min_rate;
+	if (group_level) {
+		struct mlx5_esw_rate_group *group;
+
+		list_for_each_entry(group, &esw->qos.groups, list) {
+			if (group->min_rate < max_guarantee)
+				continue;
+			max_guarantee = group->min_rate;
+		}
+	} else {
+		mlx5_esw_for_each_vport(esw, i, evport) {
+			if (!evport->enabled || !evport->qos.enabled ||
+			    evport->qos.min_rate < max_guarantee)
+				continue;
+			max_guarantee = evport->qos.min_rate;
+		}
 	}
 
 	if (max_guarantee)
@@ -79,38 +120,62 @@ static u32 calculate_vports_min_rate_div
 	return 0;
 }
 
-static int
-esw_qos_normalize_vports_min_rate(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
+static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
+{
+	if (divider)
+		return MLX5_RATE_TO_BW_SHARE(min_rate, divider, fw_max);
+
+	return 0;
+}
+
+static int esw_qos_normalize_vports_min_rate(struct mlx5_eswitch *esw,
+					     struct mlx5_esw_rate_group *group,
+					     struct netlink_ext_ack *extack)
 {
 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
-	u32 divider = calculate_vports_min_rate_divider(esw);
+	u32 divider = esw_qos_calculate_min_rate_divider(esw, group, false);
 	struct mlx5_vport *evport;
-	u32 vport_max_rate;
-	u32 vport_min_rate;
 	unsigned long i;
 	u32 bw_share;
 	int err;
 
 	mlx5_esw_for_each_vport(esw, i, evport) {
-		if (!evport->enabled)
+		if (!evport->enabled || !evport->qos.enabled)
 			continue;
-		vport_min_rate = evport->qos.min_rate;
-		vport_max_rate = evport->qos.max_rate;
-		bw_share = 0;
-
-		if (divider)
-			bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate,
-							 divider,
-							 fw_max_bw_share);
+		bw_share = esw_qos_calc_bw_share(evport->qos.min_rate, divider, fw_max_bw_share);
 
 		if (bw_share == evport->qos.bw_share)
 			continue;
 
-		err = esw_qos_vport_config(esw, evport, vport_max_rate, bw_share, extack);
-		if (!err)
-			evport->qos.bw_share = bw_share;
-		else
+		err = esw_qos_vport_config(esw, evport, evport->qos.max_rate, bw_share, extack);
+		if (err)
+			return err;
+
+		evport->qos.bw_share = bw_share;
+	}
+
+	return 0;
+}
+
+static int esw_qos_normalize_groups_min_rate(struct mlx5_eswitch *esw, u32 divider,
+					     struct netlink_ext_ack *extack)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	struct mlx5_esw_rate_group *group;
+	u32 bw_share;
+	int err;
+
+	list_for_each_entry(group, &esw->qos.groups, list) {
+		bw_share = esw_qos_calc_bw_share(group->min_rate, divider, fw_max_bw_share);
+
+		if (bw_share == group->bw_share)
+			continue;
+
+		err = esw_qos_group_config(esw, group, group->max_rate, bw_share, extack);
+		if (err)
 			return err;
+
+		group->bw_share = bw_share;
 	}
 
 	return 0;
@@ -136,7 +201,7 @@ int mlx5_esw_qos_set_vport_min_rate(stru
 
 	previous_min_rate = evport->qos.min_rate;
 	evport->qos.min_rate = min_rate;
-	err = esw_qos_normalize_vports_min_rate(esw, extack);
+	err = esw_qos_normalize_vports_min_rate(esw, NULL, extack);
 	if (err)
 		evport->qos.min_rate = previous_min_rate;
 
@@ -160,17 +225,68 @@ int mlx5_esw_qos_set_vport_max_rate(stru
 		return 0;
 
 	err = esw_qos_vport_config(esw, evport, max_rate, evport->qos.bw_share, extack);
+
 	if (!err)
 		evport->qos.max_rate = max_rate;
 
 	return err;
 }
 
+static int esw_qos_set_group_min_rate(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group,
+				      u32 min_rate, struct netlink_ext_ack *extack)
+{
+	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
+	struct mlx5_core_dev *dev = esw->dev;
+	u32 previous_min_rate, divider;
+	int err;
+
+	if (!(MLX5_CAP_QOS(dev, esw_bw_share) && fw_max_bw_share >= MLX5_MIN_BW_SHARE))
+		return -EOPNOTSUPP;
+
+	if (min_rate == group->min_rate)
+		return 0;
+
+	previous_min_rate = group->min_rate;
+	group->min_rate = min_rate;
+	divider = esw_qos_calculate_min_rate_divider(esw, group, true);
+	err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
+	if (err) {
+		group->min_rate = previous_min_rate;
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch group min rate setting failed");
+
+		/* Attempt restoring previous configuration */
+		divider = esw_qos_calculate_min_rate_divider(esw, group, true);
+		if (esw_qos_normalize_groups_min_rate(esw, divider, extack))
+			NL_SET_ERR_MSG_MOD(extack, "E-Switch BW share restore failed");
+	}
+
+	return err;
+}
+
+static int esw_qos_set_group_max_rate(struct mlx5_eswitch *esw,
+				      struct mlx5_esw_rate_group *group,
+				      u32 max_rate, struct netlink_ext_ack *extack)
+{
+	int err;
+
+	if (group->max_rate == max_rate)
+		return 0;
+
+	err = esw_qos_group_config(esw, group, max_rate, group->bw_share, extack);
+	if (err)
+		return err;
+
+	group->max_rate = max_rate;
+
+	return err;
+}
+
 static struct mlx5_esw_rate_group *
 esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
 {
 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
 	struct mlx5_esw_rate_group *group;
+	u32 divider;
 	int err;
 
 	if (!MLX5_CAP_QOS(esw->dev, log_esw_max_sched_depth))
@@ -191,8 +307,26 @@ esw_qos_create_rate_group(struct mlx5_es
 		goto err_sched_elem;
 	}
 
+	list_add_tail(&group->list, &esw->qos.groups);
+
+	divider = esw_qos_calculate_min_rate_divider(esw, group, true);
+	if (divider) {
+		err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack, "E-Switch groups normalization failed");
+			goto err_min_rate;
+		}
+	}
+
 	return group;
 
+err_min_rate:
+	list_del(&group->list);
+	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
+						  SCHEDULING_HIERARCHY_E_SWITCH,
+						  group->tsar_ix);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR for group failed");
 err_sched_elem:
 	kfree(group);
 	return ERR_PTR(err);
@@ -202,8 +336,16 @@ static int esw_qos_destroy_rate_group(st
 				      struct mlx5_esw_rate_group *group,
 				      struct netlink_ext_ack *extack)
 {
+	u32 divider;
 	int err;
 
+	list_del(&group->list);
+
+	divider = esw_qos_calculate_min_rate_divider(esw, NULL, true);
+	err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack, "E-Switch groups' normalization failed");
+
 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
 						  SCHEDULING_HIERARCHY_E_SWITCH,
 						  group->tsar_ix);
@@ -265,6 +407,7 @@ void mlx5_esw_qos_create(struct mlx5_esw
 		goto unlock;
 	}
 
+	INIT_LIST_HEAD(&esw->qos.groups);
 	if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) {
 		esw->qos.group0 = esw_qos_create_rate_group(esw, NULL);
 		if (IS_ERR(esw->qos.group0)) {
@@ -469,6 +612,42 @@ int mlx5_esw_devlink_rate_leaf_tx_max_se
 	mutex_unlock(&esw->state_lock);
 	return err;
 }
+
+int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv,
+					    u64 tx_share, struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_esw_rate_group *group = priv;
+	int err;
+
+	err = esw_qos_devlink_rate_to_mbps(dev, "tx_share", &tx_share, extack);
+	if (err)
+		return err;
+
+	mutex_lock(&esw->state_lock);
+	err = esw_qos_set_group_min_rate(esw, group, tx_share, extack);
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
+
+int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv,
+					  u64 tx_max, struct netlink_ext_ack *extack)
+{
+	struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink);
+	struct mlx5_eswitch *esw = dev->priv.eswitch;
+	struct mlx5_esw_rate_group *group = priv;
+	int err;
+
+	err = esw_qos_devlink_rate_to_mbps(dev, "tx_max", &tx_max, extack);
+	if (err)
+		return err;
+
+	mutex_lock(&esw->state_lock);
+	err = esw_qos_set_group_max_rate(esw, group, tx_max, extack);
+	mutex_unlock(&esw->state_lock);
+	return err;
+}
 
 int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv,
 				   struct netlink_ext_ack *extack)
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.h
@@ -24,6 +24,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_
 					    u64 tx_share, struct netlink_ext_ack *extack);
 int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv,
 					  u64 tx_max, struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv,
+					    u64 tx_share, struct netlink_ext_ack *extack);
+int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv,
+					  u64 tx_max, struct netlink_ext_ack *extack);
 int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv,
 				   struct netlink_ext_ack *extack);
 int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -307,6 +307,7 @@ struct mlx5_eswitch {
 		bool            enabled;
 		u32             root_tsar_ix;
 		struct mlx5_esw_rate_group *group0;
+		struct list_head groups; /* Protected by esw->state_lock */
 	} qos;
 
 	struct mlx5_esw_bridge_offloads *br_offloads;