Blob Blame History Raw
From: Moshe Shemesh <moshe@nvidia.com>
Date: Tue, 31 May 2022 09:14:03 +0300
Subject: net/mlx5: cmdif, Print info on any firmware cmd failure to tracepoint
Patch-mainline: v6.1-rc7
Git-commit: 870c2481174b839e7159555127bc8b5a5d0699ba
References: jsc#PED-1549

While moving to new CMD API (quiet API), some pre-existing flows may call the new API
function that in case of error, returns the error instead of printing it as previously done.
For such flows we bring back the print but to tracepoint this time for sys admins to
have the ability to check for errors especially for commands using the new quiet API.

Tracepoint output example:
         devlink-1333    [001] .....   822.746922: mlx5_cmd: ACCESS_REG(0x805) op_mod(0x0) failed, status bad resource(0x5), syndrome (0xb06e1f), err(-22)

Fixes: f23519e542e5 ("net/mlx5: cmdif, Add new api for command execution")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c                 |   41 ++++-----
 drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h |   45 ++++++++++
 include/linux/mlx5/driver.h                                   |    1 
 3 files changed, 68 insertions(+), 19 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h

--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -45,6 +45,8 @@
 #include "mlx5_core.h"
 #include "lib/eq.h"
 #include "lib/tout.h"
+#define CREATE_TRACE_POINTS
+#include "diag/cmd_tracepoint.h"
 
 enum {
 	CMD_IF_REV = 5,
@@ -785,27 +787,14 @@ EXPORT_SYMBOL(mlx5_cmd_out_err);
 static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out)
 {
 	u16 opcode, op_mod;
-	u32 syndrome;
-	u8  status;
 	u16 uid;
-	int err;
-
-	syndrome = MLX5_GET(mbox_out, out, syndrome);
-	status = MLX5_GET(mbox_out, out, status);
 
 	opcode = MLX5_GET(mbox_in, in, opcode);
 	op_mod = MLX5_GET(mbox_in, in, op_mod);
 	uid    = MLX5_GET(mbox_in, in, uid);
 
-	err = cmd_status_to_err(status);
-
 	if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY)
 		mlx5_cmd_out_err(dev, opcode, op_mod, out);
-	else
-		mlx5_core_dbg(dev,
-			"%s(0x%x) op_mod(0x%x) uid(%d) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n",
-			mlx5_command_str(opcode), opcode, op_mod, uid,
-			cmd_status_str(status), status, syndrome, err);
 }
 
 int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out)
@@ -1892,6 +1881,16 @@ out_in:
 	return err;
 }
 
+static void mlx5_cmd_err_trace(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out)
+{
+	u32 syndrome = MLX5_GET(mbox_out, out, syndrome);
+	u8 status = MLX5_GET(mbox_out, out, status);
+
+	trace_mlx5_cmd(mlx5_command_str(opcode), opcode, op_mod,
+		       cmd_status_str(status), status, syndrome,
+		       cmd_status_to_err(status));
+}
+
 static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
 			   u32 syndrome, int err)
 {
@@ -1914,7 +1913,7 @@ static void cmd_status_log(struct mlx5_c
 }
 
 /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */
-static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, void *out)
+static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, u16 op_mod, void *out)
 {
 	u32 syndrome = MLX5_GET(mbox_out, out, syndrome);
 	u8 status = MLX5_GET(mbox_out, out, status);
@@ -1922,8 +1921,10 @@ static int cmd_status_err(struct mlx5_co
 	if (err == -EREMOTEIO) /* -EREMOTEIO is preserved */
 		err = -EIO;
 
-	if (!err && status != MLX5_CMD_STAT_OK)
+	if (!err && status != MLX5_CMD_STAT_OK) {
 		err = -EREMOTEIO;
+		mlx5_cmd_err_trace(dev, opcode, op_mod, out);
+	}
 
 	cmd_status_log(dev, opcode, status, syndrome, err);
 	return err;
@@ -1951,9 +1952,9 @@ int mlx5_cmd_do(struct mlx5_core_dev *de
 {
 	int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false);
 	u16 opcode = MLX5_GET(mbox_in, in, opcode);
+	u16 op_mod = MLX5_GET(mbox_in, in, op_mod);
 
-	err = cmd_status_err(dev, err, opcode, out);
-	return err;
+	return cmd_status_err(dev, err, opcode, op_mod, out);
 }
 EXPORT_SYMBOL(mlx5_cmd_do);
 
@@ -1997,8 +1998,9 @@ int mlx5_cmd_exec_polling(struct mlx5_co
 {
 	int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true);
 	u16 opcode = MLX5_GET(mbox_in, in, opcode);
+	u16 op_mod = MLX5_GET(mbox_in, in, op_mod);
 
-	err = cmd_status_err(dev, err, opcode, out);
+	err = cmd_status_err(dev, err, opcode, op_mod, out);
 	return mlx5_cmd_check(dev, err, in, out);
 }
 EXPORT_SYMBOL(mlx5_cmd_exec_polling);
@@ -2034,7 +2036,7 @@ static void mlx5_cmd_exec_cb_handler(int
 	struct mlx5_async_ctx *ctx;
 
 	ctx = work->ctx;
-	status = cmd_status_err(ctx->dev, status, work->opcode, work->out);
+	status = cmd_status_err(ctx->dev, status, work->opcode, work->op_mod, work->out);
 	work->user_callback(status, work);
 	if (atomic_dec_and_test(&ctx->num_inflight))
 		complete(&ctx->inflight_done);
@@ -2049,6 +2051,7 @@ int mlx5_cmd_exec_cb(struct mlx5_async_c
 	work->ctx = ctx;
 	work->user_callback = callback;
 	work->opcode = MLX5_GET(mbox_in, in, opcode);
+	work->op_mod = MLX5_GET(mbox_in, in, op_mod);
 	work->out = out;
 	if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight)))
 		return -EIO;
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mlx5
+
+#if !defined(_MLX5_CMD_TP_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _MLX5_CMD_TP_H_
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+TRACE_EVENT(mlx5_cmd,
+	    TP_PROTO(const char *command_str, u16 opcode, u16 op_mod,
+		     const char *status_str, u8 status, u32 syndrome, int err),
+	    TP_ARGS(command_str, opcode, op_mod, status_str, status, syndrome, err),
+	    TP_STRUCT__entry(__string(command_str, command_str)
+			     __field(u16, opcode)
+			     __field(u16, op_mod)
+			    __string(status_str, status_str)
+			    __field(u8, status)
+			    __field(u32, syndrome)
+			    __field(int, err)
+			    ),
+	    TP_fast_assign(__assign_str(command_str, command_str);
+			__entry->opcode = opcode;
+			__entry->op_mod = op_mod;
+			__assign_str(status_str, status_str);
+			__entry->status = status;
+			__entry->syndrome = syndrome;
+			__entry->err = err;
+	    ),
+	    TP_printk("%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)",
+		      __get_str(command_str), __entry->opcode, __entry->op_mod,
+		      __get_str(status_str), __entry->status, __entry->syndrome,
+		      __entry->err)
+);
+
+#endif /* _MLX5_CMD_TP_H_ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ./diag
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE cmd_tracepoint
+#include <trace/define_trace.h>
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -981,6 +981,7 @@ struct mlx5_async_work {
 	struct mlx5_async_ctx *ctx;
 	mlx5_async_cbk_t user_callback;
 	u16 opcode; /* cmd opcode */
+	u16 op_mod; /* cmd op_mod */
 	void *out; /* pointer to the cmd output buffer */
 };