Skip to content

Commit

Permalink
prov/efa: Improve the zero-copy recv error message.
Browse files Browse the repository at this point in the history
Extend EFA_PROV_ERRNOS and efa_show_help to report a dedicated
error when the receiver has zero-copy receive (zcpy recv) turned on
but receives an RTM packet that it cannot handle. The extended error
message includes possible root causes and potential mitigations.

Signed-off-by: Shi Jin <[email protected]>
  • Loading branch information
shijin-aws committed Sep 14, 2024
1 parent af2dba1 commit ee4d578
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
3 changes: 2 additions & 1 deletion prov/efa/src/efa_errno.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@
_(4121, DGRAM_CQ_READ, Error reading from DGRAM CQ) \
_(4122, SHM_INTERNAL_ERROR, SHM internal error) \
_(4123, WRITE_SHM_CQ_ENTRY, Failure to write CQ entry for SHM operation) \
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established))
_(4124, ESTABLISHED_RECV_UNRESP, Unresponsive receiver (connection previously established)) \
_(4125, INVALID_PKT_TYPE_ZCPY_RX, Invalid packet type received when zero copy recv mode is ON)

/** @} */

Expand Down
5 changes: 5 additions & 0 deletions prov/efa/src/efa_strerror.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ void efa_show_help(enum efa_errno err) {
"which indicates the error is likely due to the peer process no "
"longer being present.";
break;
case FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX:
help = "This error is detected locally. "
"Please consider matching the local and remote libfabric versions, or turning off "
"the zero-copy recv feature by setting FI_EFA_USE_ZCPY_RX=0 in the environment";
break;
default:
return;
}
Expand Down
11 changes: 6 additions & 5 deletions prov/efa/src/rdm/efa_rdm_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,13 @@ static void efa_rdm_cq_handle_recv_completion(struct efa_ibv_cq *ibv_cq, struct
* QP and we cannot cancel that.
*/
if (OFI_UNLIKELY(ep->use_zcpy_rx && efa_rdm_pkt_type_is_rtm(pkt_type))) {
EFA_WARN(FI_LOG_CQ,
"Invalid pkt type %d! Peer %d doesn't respect the request from this EP that"
" RTM packets must be sent to the user recv QP.\n",
base_hdr->type, (int)pkt_entry->addr);
void *errbuf;
size_t errbuf_len;

efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE);
/* local & peer host-id & ep address will be logged by efa_rdm_write_error_msg */
if (!efa_rdm_write_error_msg(ep, pkt_entry->addr, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX, &errbuf, &errbuf_len))
EFA_WARN(FI_LOG_CQ, "Error: %s\n", (const char *) errbuf);
efa_base_ep_write_eq_error(&ep->base_ep, FI_EINVAL, FI_EFA_ERR_INVALID_PKT_TYPE_ZCPY_RX);
efa_rdm_pke_release_rx(pkt_entry);
return;
}
Expand Down

0 comments on commit ee4d578

Please sign in to comment.