Blob Blame History Raw
From 532d7f6bc458042571752168bcb5e1fdc576b8c4 Mon Sep 17 00:00:00 2001
From: Shashank Gupta <shashank.gupta@intel.com>
Date: Fri, 20 Oct 2023 11:32:52 +0100
Subject: [PATCH] crypto: qat - add error counters
Git-commit: 532d7f6bc458042571752168bcb5e1fdc576b8c4
Patch-mainline: v6.7-rc1
References: jsc#PED-5174 jsc#PED-5861

Introduce ras counters interface for counting QAT specific device
errors and expose them through the newly created qat_ras sysfs
group attribute.

This adds the following attributes:

- errors_correctable: number of correctable errors
- errors_nonfatal: number of uncorrectable non fatal errors
- errors_fatal: number of uncorrectable fatal errors
- reset_error_counters: resets all counters

These counters are initialized during device bring up and cleared
during device shutdown and are applicable only to QAT GEN4 devices.

Signed-off-by: Shashank Gupta <shashank.gupta@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Reviewed-by: Tero Kristo <tero.kristo@linux.intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Torsten Duwe <duwe@suse.de>

---
 .../ABI/testing/sysfs-driver-qat_ras          |  41 +++++++
 drivers/crypto/intel/qat/qat_4xxx/adf_drv.c   |   1 +
 drivers/crypto/intel/qat/qat_common/Makefile  |   1 +
 .../intel/qat/qat_common/adf_accel_devices.h  |  14 +++
 .../crypto/intel/qat/qat_common/adf_init.c    |   3 +
 .../qat/qat_common/adf_sysfs_ras_counters.c   | 112 ++++++++++++++++++
 .../qat/qat_common/adf_sysfs_ras_counters.h   |  27 +++++
 7 files changed, 199 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-qat_ras
 create mode 100644 drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.c
 create mode 100644 drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.h

diff --git a/Documentation/ABI/testing/sysfs-driver-qat_ras b/Documentation/ABI/testing/sysfs-driver-qat_ras
new file mode 100644
index 0000000000000..176dea1e9c0aa
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-qat_ras
@@ -0,0 +1,41 @@
+What:		/sys/bus/pci/devices/<BDF>/qat_ras/errors_correctable
+Date:		January 2024
+KernelVersion:	6.7
+Contact:	qat-linux@intel.com
+Description:	(RO) Reports the number of correctable errors detected by the device.
+
+		This attribute is only available for qat_4xxx devices.
+
+What:		/sys/bus/pci/devices/<BDF>/qat_ras/errors_nonfatal
+Date:		January 2024
+KernelVersion:	6.7
+Contact:	qat-linux@intel.com
+Description:	(RO) Reports the number of non fatal errors detected by the device.
+
+		This attribute is only available for qat_4xxx devices.
+
+What:		/sys/bus/pci/devices/<BDF>/qat_ras/errors_fatal
+Date:		January 2024
+KernelVersion:	6.7
+Contact:	qat-linux@intel.com
+Description:	(RO) Reports the number of fatal errors detected by the device.
+
+		This attribute is only available for qat_4xxx devices.
+
+What:		/sys/bus/pci/devices/<BDF>/qat_ras/reset_error_counters
+Date:		January 2024
+KernelVersion:	6.7
+Contact:	qat-linux@intel.com
+Description:	(WO) Write to resets all error counters of a device.
+
+		The following example reports how to reset the counters::
+
+			# echo 1 > /sys/bus/pci/devices/<BDF>/qat_ras/reset_error_counters
+			# cat /sys/bus/pci/devices/<BDF>/qat_ras/errors_correctable
+			0
+			# cat /sys/bus/pci/devices/<BDF>/qat_ras/errors_nonfatal
+			0
+			# cat /sys/bus/pci/devices/<BDF>/qat_ras/errors_fatal
+			0
+
+		This attribute is only available for qat_4xxx devices.
diff --git a/drivers/crypto/intel/qat/qat_4xxx/adf_drv.c b/drivers/crypto/intel/qat/qat_4xxx/adf_drv.c
index 2ccd1223f1ef0..8f483d1197dda 100644
--- a/drivers/crypto/intel/qat/qat_4xxx/adf_drv.c
+++ b/drivers/crypto/intel/qat/qat_4xxx/adf_drv.c
@@ -418,6 +418,7 @@ static int adf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto out_err;
 	}
 
+	accel_dev->ras_errors.enabled = true;
 	adf_dbgfs_init(accel_dev);
 
 	ret = adf_dev_up(accel_dev, true);
diff --git a/drivers/crypto/intel/qat/qat_common/Makefile b/drivers/crypto/intel/qat/qat_common/Makefile
index 151fd3c01f626..58f3c181b2ceb 100644
--- a/drivers/crypto/intel/qat/qat_common/Makefile
+++ b/drivers/crypto/intel/qat/qat_common/Makefile
@@ -12,6 +12,7 @@ intel_qat-objs := adf_cfg.o \
 	adf_admin.o \
 	adf_hw_arbiter.o \
 	adf_sysfs.o \
+	adf_sysfs_ras_counters.o \
 	adf_gen2_hw_data.o \
 	adf_gen2_config.o \
 	adf_gen4_hw_data.o \
diff --git a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
index 65d52a07e4352..197e10eb55349 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/io.h>
 #include <linux/ratelimit.h>
+#include <linux/types.h>
 #include "adf_cfg_common.h"
 #include "adf_pfvf_msg.h"
 
@@ -81,6 +82,18 @@ enum dev_sku_info {
 	DEV_SKU_UNKNOWN,
 };
 
+enum ras_errors {
+	ADF_RAS_CORR,
+	ADF_RAS_UNCORR,
+	ADF_RAS_FATAL,
+	ADF_RAS_ERRORS,
+};
+
+struct adf_error_counters {
+	atomic_t counter[ADF_RAS_ERRORS];
+	bool enabled;
+};
+
 static inline const char *get_sku_info(enum dev_sku_info info)
 {
 	switch (info) {
@@ -360,6 +373,7 @@ struct adf_accel_dev {
 			u8 pf_compat_ver;
 		} vf;
 	};
+	struct adf_error_counters ras_errors;
 	struct mutex state_lock; /* protect state of the device */
 	bool is_vf;
 	u32 accel_id;
diff --git a/drivers/crypto/intel/qat/qat_common/adf_init.c b/drivers/crypto/intel/qat/qat_common/adf_init.c
index b3cf0720cf9a1..00a32efdfc59d 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_init.c
+++ b/drivers/crypto/intel/qat/qat_common/adf_init.c
@@ -9,6 +9,7 @@
 #include "adf_common_drv.h"
 #include "adf_dbgfs.h"
 #include "adf_heartbeat.h"
+#include "adf_sysfs_ras_counters.h"
 
 static LIST_HEAD(service_table);
 static DEFINE_MUTEX(service_lock);
@@ -242,6 +243,7 @@ static int adf_dev_start(struct adf_accel_dev *accel_dev)
 	set_bit(ADF_STATUS_COMP_ALGS_REGISTERED, &accel_dev->status);
 
 	adf_dbgfs_add(accel_dev);
+	adf_sysfs_start_ras(accel_dev);
 
 	return 0;
 }
@@ -268,6 +270,7 @@ static void adf_dev_stop(struct adf_accel_dev *accel_dev)
 		return;
 
 	adf_dbgfs_rm(accel_dev);
+	adf_sysfs_stop_ras(accel_dev);
 
 	clear_bit(ADF_STATUS_STARTING, &accel_dev->status);
 	clear_bit(ADF_STATUS_STARTED, &accel_dev->status);
diff --git a/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.c b/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.c
new file mode 100644
index 0000000000000..cffe2d7229953
--- /dev/null
+++ b/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2023 Intel Corporation */
+
+#include <linux/sysfs.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include "adf_common_drv.h"
+#include "adf_sysfs_ras_counters.h"
+
+static ssize_t errors_correctable_show(struct device *dev,
+				       struct device_attribute *dev_attr,
+				       char *buf)
+{
+	struct adf_accel_dev *accel_dev;
+	unsigned long counter;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	counter = ADF_RAS_ERR_CTR_READ(accel_dev->ras_errors, ADF_RAS_CORR);
+	return scnprintf(buf, PAGE_SIZE, "%ld\n", counter);
+}
+
+static ssize_t errors_nonfatal_show(struct device *dev,
+				    struct device_attribute *dev_attr,
+				    char *buf)
+{
+	struct adf_accel_dev *accel_dev;
+	unsigned long counter;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	counter = ADF_RAS_ERR_CTR_READ(accel_dev->ras_errors, ADF_RAS_UNCORR);
+	return scnprintf(buf, PAGE_SIZE, "%ld\n", counter);
+}
+
+static ssize_t errors_fatal_show(struct device *dev,
+				 struct device_attribute *dev_attr,
+				 char *buf)
+{
+	struct adf_accel_dev *accel_dev;
+	unsigned long counter;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	counter = ADF_RAS_ERR_CTR_READ(accel_dev->ras_errors, ADF_RAS_FATAL);
+	return scnprintf(buf, PAGE_SIZE, "%ld\n", counter);
+}
+
+static ssize_t reset_error_counters_store(struct device *dev,
+					  struct device_attribute *dev_attr,
+					  const char *buf, size_t count)
+{
+	struct adf_accel_dev *accel_dev;
+
+	if (buf[0] != '1' || count != 2)
+		return -EINVAL;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
+	if (!accel_dev)
+		return -EINVAL;
+
+	ADF_RAS_ERR_CTR_CLEAR(accel_dev->ras_errors);
+
+	return count;
+}
+
+static DEVICE_ATTR_RO(errors_correctable);
+static DEVICE_ATTR_RO(errors_nonfatal);
+static DEVICE_ATTR_RO(errors_fatal);
+static DEVICE_ATTR_WO(reset_error_counters);
+
+static struct attribute *qat_ras_attrs[] = {
+	&dev_attr_errors_correctable.attr,
+	&dev_attr_errors_nonfatal.attr,
+	&dev_attr_errors_fatal.attr,
+	&dev_attr_reset_error_counters.attr,
+	NULL,
+};
+
+static struct attribute_group qat_ras_group = {
+	.attrs = qat_ras_attrs,
+	.name = "qat_ras",
+};
+
+void adf_sysfs_start_ras(struct adf_accel_dev *accel_dev)
+{
+	if (!accel_dev->ras_errors.enabled)
+		return;
+
+	ADF_RAS_ERR_CTR_CLEAR(accel_dev->ras_errors);
+
+	if (device_add_group(&GET_DEV(accel_dev), &qat_ras_group))
+		dev_err(&GET_DEV(accel_dev),
+			"Failed to create qat_ras attribute group.\n");
+}
+
+void adf_sysfs_stop_ras(struct adf_accel_dev *accel_dev)
+{
+	if (!accel_dev->ras_errors.enabled)
+		return;
+
+	device_remove_group(&GET_DEV(accel_dev), &qat_ras_group);
+
+	ADF_RAS_ERR_CTR_CLEAR(accel_dev->ras_errors);
+}
diff --git a/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.h b/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.h
new file mode 100644
index 0000000000000..99e9d9cf57f84
--- /dev/null
+++ b/drivers/crypto/intel/qat/qat_common/adf_sysfs_ras_counters.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2023 Intel Corporation */
+
+#ifndef ADF_RAS_H
+#define ADF_RAS_H
+
+#include <linux/bitops.h>
+#include <linux/atomic.h>
+
+struct adf_accel_dev;
+
+void adf_sysfs_start_ras(struct adf_accel_dev *accel_dev);
+void adf_sysfs_stop_ras(struct adf_accel_dev *accel_dev);
+
+#define ADF_RAS_ERR_CTR_READ(ras_errors, ERR) \
+	atomic_read(&(ras_errors).counter[ERR])
+
+#define ADF_RAS_ERR_CTR_CLEAR(ras_errors) \
+	do { \
+		for (int err = 0; err < ADF_RAS_ERRORS; ++err) \
+			atomic_set(&(ras_errors).counter[err], 0); \
+	} while (0)
+
+#define ADF_RAS_ERR_CTR_INC(ras_errors, ERR) \
+	atomic_inc(&(ras_errors).counter[ERR])
+
+#endif /* ADF_RAS_H */
-- 
2.35.3