|
Thomas Bogendoerfer |
1edd66 |
From: Yangyang Li <liyangyang20@huawei.com>
|
|
Thomas Bogendoerfer |
1edd66 |
Date: Tue, 23 Nov 2021 22:24:02 +0800
|
|
Thomas Bogendoerfer |
1edd66 |
Subject: RDMA/hns: Do not destroy QP resources in the hw resetting phase
|
|
Thomas Bogendoerfer |
1edd66 |
Patch-mainline: v5.16-rc5
|
|
Thomas Bogendoerfer |
1edd66 |
Git-commit: b0969f83890bf8b47f5c8bd42539599b2b52fdeb
|
|
Thomas Bogendoerfer |
1edd66 |
References: bsc#1190336
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
When hns_roce_v2_destroy_qp() is called, the brief calling process of the
|
|
Thomas Bogendoerfer |
1edd66 |
driver is as follows:
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
......
|
|
Thomas Bogendoerfer |
1edd66 |
hns_roce_v2_destroy_qp
|
|
Thomas Bogendoerfer |
1edd66 |
hns_roce_v2_qp_modify
|
|
Thomas Bogendoerfer |
1edd66 |
hns_roce_cmd_mbox
|
|
Thomas Bogendoerfer |
1edd66 |
hns_roce_qp_destroy
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
If hns_roce_cmd_mbox() detects that the hardware is being reset during the
|
|
Thomas Bogendoerfer |
1edd66 |
execution of hns_roce_cmd_mbox(), the driver will not be able to get
|
|
Thomas Bogendoerfer |
1edd66 |
the return value from the hardware (the firmware cannot respond to the
|
|
Thomas Bogendoerfer |
1edd66 |
driver's mailbox during the hardware reset phase).
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
The driver needs to wait for the hardware reset to complete before
|
|
Thomas Bogendoerfer |
1edd66 |
continuing to execute hns_roce_qp_destroy(), otherwise it may happen that
|
|
Thomas Bogendoerfer |
1edd66 |
the driver releases the resources while the hardware is still accessing them. In
|
|
Thomas Bogendoerfer |
1edd66 |
order to fix this problem, HNS RoCE needs to add a piece of code to wait
|
|
Thomas Bogendoerfer |
1edd66 |
for the hardware reset to complete.
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
The original interface get_hw_reset_stat() only reports the instantaneous state of
|
|
Thomas Bogendoerfer |
1edd66 |
the hardware reset, which cannot accurately reflect whether the hardware
|
|
Thomas Bogendoerfer |
1edd66 |
reset is completed, so it needs to be replaced with the ae_dev_reset_cnt
|
|
Thomas Bogendoerfer |
1edd66 |
interface.
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
The sign that the hardware reset is complete is that the return value of
|
|
Thomas Bogendoerfer |
1edd66 |
the ae_dev_reset_cnt interface is greater than the original value
|
|
Thomas Bogendoerfer |
1edd66 |
reset_cnt recorded by the driver.
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
Fixes: 6a04aed6afae ("RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ during reset")
|
|
Thomas Bogendoerfer |
1edd66 |
Link: https://lore.kernel.org/r/20211123142402.26936-1-liangwenpeng@huawei.com
|
|
Thomas Bogendoerfer |
1edd66 |
Signed-off-by: Yangyang Li <liyangyang20@huawei.com>
|
|
Thomas Bogendoerfer |
1edd66 |
Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com>
|
|
Thomas Bogendoerfer |
1edd66 |
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
|
|
Thomas Bogendoerfer |
1edd66 |
Acked-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
|
|
Thomas Bogendoerfer |
1edd66 |
---
|
|
Thomas Bogendoerfer |
1edd66 |
drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 +++++++++++-
|
|
Thomas Bogendoerfer |
1edd66 |
1 file changed, 11 insertions(+), 1 deletion(-)
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
|
|
Thomas Bogendoerfer |
1edd66 |
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
|
|
Thomas Bogendoerfer |
1edd66 |
@@ -33,6 +33,7 @@
|
|
Thomas Bogendoerfer |
1edd66 |
#include <linux/acpi.h>
|
|
Thomas Bogendoerfer |
1edd66 |
#include <linux/etherdevice.h>
|
|
Thomas Bogendoerfer |
1edd66 |
#include <linux/interrupt.h>
|
|
Thomas Bogendoerfer |
1edd66 |
+#include <linux/iopoll.h>
|
|
Thomas Bogendoerfer |
1edd66 |
#include <linux/kernel.h>
|
|
Thomas Bogendoerfer |
1edd66 |
#include <linux/types.h>
|
|
Thomas Bogendoerfer |
1edd66 |
#include <net/addrconf.h>
|
|
Thomas Bogendoerfer |
1edd66 |
@@ -1050,9 +1051,14 @@ static u32 hns_roce_v2_cmd_hw_resetting(
|
|
Thomas Bogendoerfer |
1edd66 |
unsigned long instance_stage,
|
|
Thomas Bogendoerfer |
1edd66 |
unsigned long reset_stage)
|
|
Thomas Bogendoerfer |
1edd66 |
{
|
|
Thomas Bogendoerfer |
1edd66 |
+#define HW_RESET_TIMEOUT_US 1000000
|
|
Thomas Bogendoerfer |
1edd66 |
+#define HW_RESET_SLEEP_US 1000
|
|
Thomas Bogendoerfer |
1edd66 |
+
|
|
Thomas Bogendoerfer |
1edd66 |
struct hns_roce_v2_priv *priv = hr_dev->priv;
|
|
Thomas Bogendoerfer |
1edd66 |
struct hnae3_handle *handle = priv->handle;
|
|
Thomas Bogendoerfer |
1edd66 |
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
|
|
Thomas Bogendoerfer |
1edd66 |
+ unsigned long val;
|
|
Thomas Bogendoerfer |
1edd66 |
+ int ret;
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
/* When hardware reset is detected, we should stop sending mailbox&cmq&
|
|
Thomas Bogendoerfer |
1edd66 |
* doorbell to hardware. If now in .init_instance() function, we should
|
|
Thomas Bogendoerfer |
1edd66 |
@@ -1064,7 +1070,11 @@ static u32 hns_roce_v2_cmd_hw_resetting(
|
|
Thomas Bogendoerfer |
1edd66 |
* again.
|
|
Thomas Bogendoerfer |
1edd66 |
*/
|
|
Thomas Bogendoerfer |
1edd66 |
hr_dev->dis_db = true;
|
|
Thomas Bogendoerfer |
1edd66 |
- if (!ops->get_hw_reset_stat(handle))
|
|
Thomas Bogendoerfer |
1edd66 |
+
|
|
Thomas Bogendoerfer |
1edd66 |
+ ret = read_poll_timeout(ops->ae_dev_reset_cnt, val,
|
|
Thomas Bogendoerfer |
1edd66 |
+ val > hr_dev->reset_cnt, HW_RESET_SLEEP_US,
|
|
Thomas Bogendoerfer |
1edd66 |
+ HW_RESET_TIMEOUT_US, false, handle);
|
|
Thomas Bogendoerfer |
1edd66 |
+ if (!ret)
|
|
Thomas Bogendoerfer |
1edd66 |
hr_dev->is_reset = true;
|
|
Thomas Bogendoerfer |
1edd66 |
|
|
Thomas Bogendoerfer |
1edd66 |
if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
|