From: Gavin Li <gavinl@nvidia.com>
Date: Sun, 27 Mar 2022 17:36:44 +0300
Subject: net/mlx5: Increase FW pre-init timeout for health recovery
Patch-mainline: v5.19-rc1
Git-commit: 37ca95e62ee23fa6d2c2c64e3dc40b4a0c0146dc
References: jsc#PED-1549
Currently, health recovery will reload driver to recover it from fatal
errors. During the driver's load process, it would wait for FW to set the
pre-init bit for up to 120 seconds, beyond this threshold it would abort
the load process. In some cases, such as a FW upgrade on the DPU, this
timeout period is insufficient, and the user has no way to recover the
host device.
To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.
The timeout for devlink reload and probe will use the original one because
they are user triggered flows, and therefore should not have a
significantly long timeout, during which the user command would hang.
Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 +--
drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 2 -
drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c | 1
drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h | 1
drivers/net/ethernet/mellanox/mlx5/core/main.c | 23 ++++++++++++--------
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 -
6 files changed, 20 insertions(+), 13 deletions(-)
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct
*actions_performed = BIT(action);
switch (action) {
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
break;
/* On fw_activate action, also driver is reloaded and reinit performed */
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
default:
/* Unsupported action should not get to this function */
WARN_ON(1);
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -149,7 +149,7 @@ static void mlx5_fw_reset_complete_reloa
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
- mlx5_load_one(dev);
+ mlx5_load_one(dev, false);
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c
@@ -10,6 +10,7 @@ struct mlx5_timeouts {
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
+ [MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h
@@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
/* pre init timeouts (not read from FW) */
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
+ MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
MLX5_TO_FW_PRE_INIT_WAIT_MS,
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1012,7 +1012,7 @@ static void mlx5_cleanup_once(struct mlx
mlx5_devcom_unregister_device(dev->priv.devcom);
}
-static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
int err;
@@ -1027,11 +1027,11 @@ static int mlx5_function_setup(struct ml
/* wait for firmware to accept initialization segments configurations
*/
- err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
+ err = wait_fw_init(dev, timeout,
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
if (err) {
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
- mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+ timeout);
return err;
}
@@ -1281,7 +1281,7 @@ int mlx5_init_one(struct mlx5_core_dev *
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, true);
+ err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;
@@ -1345,9 +1345,10 @@ out:
mutex_unlock(&dev->intf_state_mutex);
}
-int mlx5_load_one(struct mlx5_core_dev *dev)
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
int err = 0;
+ u64 timeout;
mutex_lock(&dev->intf_state_mutex);
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
@@ -1357,7 +1358,11 @@ int mlx5_load_one(struct mlx5_core_dev *
/* remove any previous indication of internal error */
dev->state = MLX5_DEVICE_STATE_UP;
- err = mlx5_function_setup(dev, false);
+ if (recovery)
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
+ else
+ timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
+ err = mlx5_function_setup(dev, timeout);
if (err)
goto err_function;
@@ -1732,7 +1737,7 @@ static void mlx5_pci_resume(struct pci_d
mlx5_pci_trace(dev, "Enter, loading driver..\n");
- err = mlx5_load_one(dev);
+ err = mlx5_load_one(dev, false);
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
!err ? "recovered" : "Failed");
@@ -1820,7 +1825,7 @@ static int mlx5_resume(struct pci_dev *p
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, false);
}
static const struct pci_device_id mlx5_core_pci_table[] = {
@@ -1865,7 +1870,7 @@ int mlx5_recover_device(struct mlx5_core
return -EIO;
}
- return mlx5_load_one(dev);
+ return mlx5_load_one(dev, true);
}
static struct pci_driver mlx5_core_driver = {
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_d
int mlx5_init_one(struct mlx5_core_dev *dev);
void mlx5_uninit_one(struct mlx5_core_dev *dev);
void mlx5_unload_one(struct mlx5_core_dev *dev);
-int mlx5_load_one(struct mlx5_core_dev *dev);
+int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);