Skip to content

Commit 6a04aed

Browse files
xavierhw authored and jgunthorpe committed
RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ during reset
On the hip08 chip, there is a possibility of the chip hanging, and of other errors, when sending mailbox & doorbell during reset. We can fix it by prohibiting mailbox and doorbell while a reset is in progress and after a reset has occurred, to ensure that the hardware can work normally. Fixes: a04ff73 ("RDMA/hns: Add command queue support for hip08 RoCE driver") Signed-off-by: Wei Hu (Xavier) <[email protected]> Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent d061eff commit 6a04aed

File tree

4 files changed

+167
-13
lines changed

4 files changed

+167
-13
lines changed

drivers/infiniband/hw/hns/hns_roce_cmd.c

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -176,17 +176,33 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
176176
unsigned long in_modifier, u8 op_modifier, u16 op,
177177
unsigned long timeout)
178178
{
179-
if (hr_dev->is_reset)
180-
return 0;
179+
int ret;
180+
181+
if (hr_dev->hw->rst_prc_mbox) {
182+
ret = hr_dev->hw->rst_prc_mbox(hr_dev);
183+
if (ret == CMD_RST_PRC_SUCCESS)
184+
return 0;
185+
else if (ret == CMD_RST_PRC_EBUSY)
186+
return -EBUSY;
187+
}
181188

182189
if (hr_dev->cmd.use_events)
183-
return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
184-
in_modifier, op_modifier, op,
185-
timeout);
190+
ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
191+
in_modifier, op_modifier, op,
192+
timeout);
186193
else
187-
return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
188-
in_modifier, op_modifier, op,
189-
timeout);
194+
ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
195+
in_modifier, op_modifier, op,
196+
timeout);
197+
198+
if (ret == CMD_RST_PRC_EBUSY)
199+
return -EBUSY;
200+
201+
if (ret && (hr_dev->hw->rst_prc_mbox &&
202+
hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
203+
return 0;
204+
205+
return ret;
190206
}
191207
EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);
192208

drivers/infiniband/hw/hns/hns_roce_device.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,12 @@ enum {
237237
HNS_ROCE_RST_DIRECT_RETURN = 0,
238238
};
239239

240+
/*
 * Result codes of the reset pre/post-processing hook (.rst_prc_mbox) that
 * runs around mailbox and CMQ commands.
 */
enum {
	/* No reset handling applies: the caller goes on to issue the command. */
	CMD_RST_PRC_OTHERS = 0,
	/* The caller returns 0 to its own caller without issuing the command. */
	CMD_RST_PRC_SUCCESS = 1,
	/* The command must be failed; hns_roce_cmd_mbox() maps this to -EBUSY. */
	CMD_RST_PRC_EBUSY = 2,
};
245+
240246
#define HNS_ROCE_CMD_SUCCESS 1
241247

242248
#define HNS_ROCE_PORT_DOWN 0
@@ -874,6 +880,7 @@ struct hns_roce_hw {
874880
u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
875881
u16 token, int event);
876882
int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout);
883+
int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
877884
int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
878885
const union ib_gid *gid, const struct ib_gid_attr *attr);
879886
int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);

drivers/infiniband/hw/hns/hns_roce_hw_v2.c

Lines changed: 134 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,110 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
712712
return ret;
713713
}
714714

715+
static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
716+
unsigned long instance_stage,
717+
unsigned long reset_stage)
718+
{
719+
/* When hardware reset has been completed once or more, we should stop
720+
* sending mailbox&cmq to hardware. If now in .init_instance()
721+
* function, we should exit with error. If now at HNAE3_INIT_CLIENT
722+
* stage of soft reset process, we should exit with error, and then
723+
* HNAE3_INIT_CLIENT related process can rollback the operation like
724+
* notifing hardware to free resources, HNAE3_INIT_CLIENT related
725+
* process will exit with error to notify NIC driver to reschedule soft
726+
* reset process once again.
727+
*/
728+
hr_dev->is_reset = true;
729+
730+
if (reset_stage == HNS_ROCE_STATE_RST_INIT ||
731+
instance_stage == HNS_ROCE_STATE_INIT)
732+
return CMD_RST_PRC_EBUSY;
733+
734+
return CMD_RST_PRC_SUCCESS;
735+
}
736+
737+
static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
738+
unsigned long instance_stage,
739+
unsigned long reset_stage)
740+
{
741+
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
742+
struct hnae3_handle *handle = priv->handle;
743+
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
744+
745+
/* When hardware reset is detected, we should stop sending mailbox&cmq
746+
* to hardware. If now in .init_instance() function, we should
747+
* exit with error. If now at HNAE3_INIT_CLIENT stage of soft reset
748+
* process, we should exit with error, and then HNAE3_INIT_CLIENT
749+
* related process can rollback the operation like notifing hardware to
750+
* free resources, HNAE3_INIT_CLIENT related process will exit with
751+
* error to notify NIC driver to reschedule soft reset process once
752+
* again.
753+
*/
754+
if (!ops->get_hw_reset_stat(handle))
755+
hr_dev->is_reset = true;
756+
757+
if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
758+
instance_stage == HNS_ROCE_STATE_INIT)
759+
return CMD_RST_PRC_EBUSY;
760+
761+
return CMD_RST_PRC_SUCCESS;
762+
}
763+
764+
static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
765+
{
766+
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
767+
struct hnae3_handle *handle = priv->handle;
768+
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
769+
770+
/* When software reset is detected at .init_instance() function, we
771+
* should stop sending mailbox&cmq to hardware, and exit with
772+
* error.
773+
*/
774+
if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt)
775+
hr_dev->is_reset = true;
776+
777+
return CMD_RST_PRC_EBUSY;
778+
}
779+
780+
static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
781+
{
782+
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
783+
struct hnae3_handle *handle = priv->handle;
784+
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
785+
unsigned long instance_stage; /* the current instance stage */
786+
unsigned long reset_stage; /* the current reset stage */
787+
unsigned long reset_cnt;
788+
bool sw_resetting;
789+
bool hw_resetting;
790+
791+
if (hr_dev->is_reset)
792+
return CMD_RST_PRC_SUCCESS;
793+
794+
/* Get information about reset from NIC driver or RoCE driver itself,
795+
* the meaning of the following variables from NIC driver are described
796+
* as below:
797+
* reset_cnt -- The count value of completed hardware reset.
798+
* hw_resetting -- Whether hardware device is resetting now.
799+
* sw_resetting -- Whether NIC's software reset process is running now.
800+
*/
801+
instance_stage = handle->rinfo.instance_state;
802+
reset_stage = handle->rinfo.reset_state;
803+
reset_cnt = ops->ae_dev_reset_cnt(handle);
804+
hw_resetting = ops->get_hw_reset_stat(handle);
805+
sw_resetting = ops->ae_dev_resetting(handle);
806+
807+
if (reset_cnt != hr_dev->reset_cnt)
808+
return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
809+
reset_stage);
810+
else if (hw_resetting)
811+
return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
812+
reset_stage);
813+
else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
814+
return hns_roce_v2_cmd_sw_resetting(hr_dev);
815+
816+
return 0;
817+
}
818+
715819
static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
716820
{
717821
int ntu = ring->next_to_use;
@@ -892,8 +996,8 @@ static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev)
892996
return clean;
893997
}
894998

895-
static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
896-
struct hns_roce_cmq_desc *desc, int num)
999+
static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
1000+
struct hns_roce_cmq_desc *desc, int num)
8971001
{
8981002
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
8991003
struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
@@ -905,9 +1009,6 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
9051009
int ret = 0;
9061010
int ntc;
9071011

908-
if (hr_dev->is_reset)
909-
return 0;
910-
9111012
spin_lock_bh(&csq->lock);
9121013

9131014
if (num > hns_roce_cmq_space(csq)) {
@@ -982,6 +1083,30 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
9821083
return ret;
9831084
}
9841085

1086+
int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
1087+
struct hns_roce_cmq_desc *desc, int num)
1088+
{
1089+
int retval;
1090+
int ret;
1091+
1092+
ret = hns_roce_v2_rst_process_cmd(hr_dev);
1093+
if (ret == CMD_RST_PRC_SUCCESS)
1094+
return 0;
1095+
if (ret == CMD_RST_PRC_EBUSY)
1096+
return ret;
1097+
1098+
ret = __hns_roce_cmq_send(hr_dev, desc, num);
1099+
if (ret) {
1100+
retval = hns_roce_v2_rst_process_cmd(hr_dev);
1101+
if (retval == CMD_RST_PRC_SUCCESS)
1102+
return 0;
1103+
else if (retval == CMD_RST_PRC_EBUSY)
1104+
return retval;
1105+
}
1106+
1107+
return ret;
1108+
}
1109+
9851110
static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
9861111
{
9871112
struct hns_roce_query_version *resp;
@@ -1857,6 +1982,9 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,
18571982

18581983
status = hns_roce_v2_cmd_complete(hr_dev);
18591984
if (status != 0x1) {
1985+
if (status == CMD_RST_PRC_EBUSY)
1986+
return status;
1987+
18601988
dev_err(dev, "mailbox status 0x%x!\n", status);
18611989
return -EBUSY;
18621990
}
@@ -5977,6 +6105,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
59776105
.hw_exit = hns_roce_v2_exit,
59786106
.post_mbox = hns_roce_v2_post_mbox,
59796107
.chk_mbox = hns_roce_v2_chk_mbox,
6108+
.rst_prc_mbox = hns_roce_v2_rst_process_cmd,
59806109
.set_gid = hns_roce_v2_set_gid,
59816110
.set_mac = hns_roce_v2_set_mac,
59826111
.write_mtpt = hns_roce_v2_write_mtpt,

drivers/infiniband/hw/hns/hns_roce_hw_v2.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696
#define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2
9797
#define HNS_ROCE_V2_RSV_QPS 8
9898

99+
#define HNS_ROCE_V2_HW_RST_TIMEOUT 1000
100+
99101
#define HNS_ROCE_CONTEXT_HOP_NUM 1
100102
#define HNS_ROCE_SCCC_HOP_NUM 1
101103
#define HNS_ROCE_MTT_HOP_NUM 1

0 commit comments

Comments
 (0)