Blob Blame History Raw
From: Aya Levin <ayal@mellanox.com>
Date: Mon, 18 May 2020 12:31:38 +0300
Subject: net/mlx5e: Enhance TX timeout recovery
Patch-mainline: v5.9-rc1
Git-commit: e62055642797a6de80f3576c18e212cbbf5b4361
References: jsc#SLE-15172

When handling a TX timeout, if the TX reporter is not able to recover
from the error, reopen the channels. Once a channel reopen has been
attempted, stop looping over the remaining TX queues for timeout.

With that, the reporters' state and separation will better
expose the driver's state.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c |   36 ++++++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c        |   14 -----
 2 files changed, 33 insertions(+), 17 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -83,17 +83,40 @@ out:
 	return err;
 }
 
+struct mlx5e_tx_timeout_ctx {
+	struct mlx5e_txqsq *sq; /* SQ the timeout recovery operates on */
+	signed int status; /* 0: SQ recovered, 1: channels reopened, else error */
+};
+
 static int mlx5e_tx_reporter_timeout_recover(void *ctx)
 {
+	struct mlx5e_tx_timeout_ctx *to_ctx;
+	struct mlx5e_priv *priv;
 	struct mlx5_eq_comp *eq;
 	struct mlx5e_txqsq *sq;
 	int err;
 
-	sq = ctx;
+	to_ctx = ctx; /* ctx carries the SQ plus a status slot for the caller */
+	sq = to_ctx->sq;
 	eq = sq->cq.mcq.eq;
+	priv = sq->channel->priv;
 	err = mlx5e_health_channel_eq_recover(eq, sq->channel);
-	if (err)
-		clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	if (!err) {
+		to_ctx->status = 0; /* this sq recovered */
+		return err;
+	}
+
+	err = mlx5e_safe_reopen_channels(priv); /* EQ recovery failed */
+	if (!err) {
+		to_ctx->status = 1; /* all channels recovered */
+		return err;
+	}
+
+	to_ctx->status = err; /* reopen failed as well: record the error */
+	clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state);
+	netdev_err(priv->netdev,
+		   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
+		   err);
 
 	return err;
 }
@@ -389,9 +412,11 @@ int mlx5e_reporter_tx_timeout(struct mlx
 {
 	struct mlx5e_priv *priv = sq->channel->priv;
 	char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN];
+	struct mlx5e_tx_timeout_ctx to_ctx = {};
 	struct mlx5e_err_ctx err_ctx = {};
 
-	err_ctx.ctx = sq;
+	to_ctx.sq = sq;
+	err_ctx.ctx = &to_ctx;
 	err_ctx.recover = mlx5e_tx_reporter_timeout_recover;
 	err_ctx.dump = mlx5e_tx_reporter_dump_sq;
 	snprintf(err_str, sizeof(err_str),
@@ -399,7 +424,8 @@ int mlx5e_reporter_tx_timeout(struct mlx
 		 sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc,
 		 jiffies_to_usecs(jiffies - sq->txq->trans_start));
 
-	return mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
+	mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx);
+	return to_ctx.status;
 }
 
 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4383,8 +4383,6 @@ static void mlx5e_tx_timeout_work(struct
 {
 	struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv,
 					       tx_timeout_work);
-	bool report_failed = false;
-	int err;
 	int i;
 
 	rtnl_lock();
@@ -4402,18 +4400,10 @@ static void mlx5e_tx_timeout_work(struct
 			continue;
 
 		if (mlx5e_reporter_tx_timeout(sq))
-			report_failed = true;
+			/* break if tried to reopen channels */
+			break;
 	}
 
-	if (!report_failed)
-		goto unlock;
-
-	err = mlx5e_safe_reopen_channels(priv);
-	if (err)
-		netdev_err(priv->netdev,
-			   "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
-			   err);
-
 unlock:
 	mutex_unlock(&priv->state_lock);
 	rtnl_unlock();