Skip to content

Commit d3743fa

Browse files
xavierhw authored and jgunthorpe committed
RDMA/hns: Fix the chip hanging caused by sending doorbell during reset
On the hi08 chip, there is a possibility of the chip hanging when a doorbell is sent during reset. We can fix it by prohibiting doorbells during reset.
Fixes: 2d40788 ("RDMA/hns: Add support for processing send wr and receive wr")
Signed-off-by: Wei Hu (Xavier) <[email protected]>
Signed-off-by: Jason Gunthorpe <[email protected]>
1 parent 6a04aed commit d3743fa

File tree

3 files changed

+28
-9
lines changed

3 files changed

+28
-9
lines changed

drivers/infiniband/hw/hns/hns_roce_device.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -947,6 +947,7 @@ struct hns_roce_dev {
947947
spinlock_t bt_cmd_lock;
948948
bool active;
949949
bool is_reset;
950+
bool dis_db;
950951
unsigned long reset_cnt;
951952
struct hns_roce_ib_iboe iboe;
952953

drivers/infiniband/hw/hns/hns_roce_hw_v2.c

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -587,7 +587,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp,
587587
roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
588588
V2_DB_PARAMETER_SL_S, qp->sl);
589589

590-
hns_roce_write64_k((__le32 *)&sq_db, qp->sq.db_reg_l);
590+
hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
591591

592592
qp->sq_next_wqe = ind;
593593
qp->next_sge = sge_ind;
@@ -717,7 +717,7 @@ static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
717717
unsigned long reset_stage)
718718
{
719719
/* When hardware reset has been completed once or more, we should stop
720-
* sending mailbox&cmq to hardware. If now in .init_instance()
720+
* sending mailbox&cmq&doorbell to hardware. If now in .init_instance()
721721
* function, we should exit with error. If now at HNAE3_INIT_CLIENT
722722
* stage of soft reset process, we should exit with error, and then
723723
* HNAE3_INIT_CLIENT related process can rollback the operation like
@@ -726,6 +726,7 @@ static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
726726
* reset process once again.
727727
*/
728728
hr_dev->is_reset = true;
729+
hr_dev->dis_db = true;
729730

730731
if (reset_stage == HNS_ROCE_STATE_RST_INIT ||
731732
instance_stage == HNS_ROCE_STATE_INIT)
@@ -742,15 +743,16 @@ static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
742743
struct hnae3_handle *handle = priv->handle;
743744
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
744745

745-
/* When hardware reset is detected, we should stop sending mailbox&cmq
746-
* to hardware. If now in .init_instance() function, we should
746+
/* When hardware reset is detected, we should stop sending mailbox&cmq&
747+
* doorbell to hardware. If now in .init_instance() function, we should
747748
* exit with error. If now at HNAE3_INIT_CLIENT stage of soft reset
748749
* process, we should exit with error, and then HNAE3_INIT_CLIENT
749750
* related process can rollback the operation like notifying hardware to
750751
* free resources, HNAE3_INIT_CLIENT related process will exit with
751752
* error to notify NIC driver to reschedule soft reset process once
752753
* again.
753754
*/
755+
hr_dev->dis_db = true;
754756
if (!ops->get_hw_reset_stat(handle))
755757
hr_dev->is_reset = true;
756758

@@ -768,9 +770,10 @@ static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
768770
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
769771

770772
/* When software reset is detected at .init_instance() function, we
771-
* should stop sending mailbox&cmq to hardware, and exit with
772-
* error.
773+
* should stop sending mailbox&cmq&doorbell to hardware, and exit
774+
* with error.
773775
*/
776+
hr_dev->dis_db = true;
774777
if (ops->ae_dev_reset_cnt(handle) != hr_dev->reset_cnt)
775778
hr_dev->is_reset = true;
776779

@@ -2495,6 +2498,7 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
24952498
static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
24962499
enum ib_cq_notify_flags flags)
24972500
{
2501+
struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
24982502
struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
24992503
u32 notification_flag;
25002504
u32 doorbell[2];
@@ -2520,7 +2524,7 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq,
25202524
roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S,
25212525
notification_flag);
25222526

2523-
hns_roce_write64_k(doorbell, hr_cq->cq_db_l);
2527+
hns_roce_write64(hr_dev, doorbell, hr_cq->cq_db_l);
25242528

25252529
return 0;
25262530
}
@@ -4763,6 +4767,7 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev,
47634767

47644768
static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
47654769
{
4770+
struct hns_roce_dev *hr_dev = eq->hr_dev;
47664771
u32 doorbell[2];
47674772

47684773
doorbell[0] = 0;
@@ -4789,7 +4794,7 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
47894794
HNS_ROCE_V2_EQ_DB_PARA_S,
47904795
(eq->cons_index & HNS_ROCE_V2_CONS_IDX_M));
47914796

4792-
hns_roce_write64_k(doorbell, eq->doorbell);
4797+
hns_roce_write64(hr_dev, doorbell, eq->doorbell);
47934798
}
47944799

47954800
static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
@@ -6011,6 +6016,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
60116016
const struct ib_recv_wr *wr,
60126017
const struct ib_recv_wr **bad_wr)
60136018
{
6019+
struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
60146020
struct hns_roce_srq *srq = to_hr_srq(ibsrq);
60156021
struct hns_roce_v2_wqe_data_seg *dseg;
60166022
struct hns_roce_v2_db srq_db;
@@ -6072,7 +6078,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
60726078
srq_db.byte_4 = HNS_ROCE_V2_SRQ_DB << 24 | srq->srqn;
60736079
srq_db.parameter = srq->head;
60746080

6075-
hns_roce_write64_k((__le32 *)&srq_db, srq->db_reg_l);
6081+
hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
60766082

60776083
}
60786084

@@ -6309,6 +6315,7 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
63096315
return 0;
63106316

63116317
hr_dev->active = false;
6318+
hr_dev->dis_db = true;
63126319

63136320
event.event = IB_EVENT_DEVICE_FATAL;
63146321
event.device = &hr_dev->ib_dev;

drivers/infiniband/hw/hns/hns_roce_hw_v2.h

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1799,4 +1799,15 @@ struct hns_roce_sccc_clr_done {
17991799
__le32 rsv[5];
18001800
};
18011801

1802+
/*
 * hns_roce_write64 - conditionally ring a doorbell.
 * @hr_dev: RoCE device; its ->priv is read as a struct hns_roce_v2_priv to
 *          reach the hnae3 handle and that handle's ae_algo->ops table.
 * @val:    the two 32-bit words handed to hns_roce_write64_k().
 * @dest:   I/O-mapped destination handed to hns_roce_write64_k().
 *
 * The write is forwarded to hns_roce_write64_k() only when hr_dev->dis_db is
 * false AND ops->get_hw_reset_stat(handle) returns zero. Otherwise the
 * doorbell is dropped silently: the function returns void, so callers get no
 * indication that nothing was written.
 *
 * NOTE(review): get_hw_reset_stat() presumably reports an in-progress
 * hardware reset; its exact semantics are defined outside this file -
 * confirm against the hnae3 ops definition.
 */
static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
1803+
void __iomem *dest)
1804+
{
1805+
struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
1806+
struct hnae3_handle *handle = priv->handle;
1807+
const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
1808+
1809+
if (!hr_dev->dis_db && !ops->get_hw_reset_stat(handle))
1810+
hns_roce_write64_k(val, dest);
1811+
}
1812+
18021813
#endif

0 commit comments

Comments
 (0)