Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v1.22.x] prov/efa: backport several recent changes #10396

Merged
merged 5 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion prov/efa/docs/efa_rdm_protocol_v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ Note, the field `extra_info` was named `features` when protocol v4 was initially
only planned for extra features. Later, we discovered that the handshake subprotocol can also be used to pass
additional request information, thus introduced the concept of "extra request" and renamed this field `extra_info`.

`nextra_p3` is number of `extra_info` flags of the endpoint plus 3. The "plus 3" is for historical reasons.
`nextra_p3` is the number of 64-bit `extra_info` elements of the endpoint plus 3. The "plus 3" is for historical reasons.
When protocol v4 was initially introduced, this field is named `maxproto`. The original plan was that protocol
v4 can only have 64 extra features/requests. If the number of extra feature/request ever exceeds 64, the next
feature/request will be defined as version 5 feature/request, (version 6 if the number exceeds 128, so on so
Expand Down
13 changes: 3 additions & 10 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,6 @@ struct efa_rdm_ep {
/* Application's message prefix size. */
size_t msg_prefix_size;

/* EFA RDM protocol's max header size */
size_t max_proto_hdr_size;

/* tx iov limit of EFA device */
size_t efa_device_iov_limit;

/* threshold to release multi_recv buffer */
size_t min_multi_recv_size;

Expand Down Expand Up @@ -193,7 +187,6 @@ struct efa_rdm_ep {
*/
bool use_device_rdma;

struct fi_info *user_info; /**< fi_info passed by user when calling fi_endpoint */
bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */
bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */
char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. fi_cq_readerr */
Expand Down Expand Up @@ -246,7 +239,7 @@ static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep)

static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep)
{
return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS));
return ((ep->base_ep.info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->base_ep.info->rx_attr->msg_order & FI_ORDER_SAS));
}


Expand Down Expand Up @@ -371,7 +364,7 @@ bool efa_rdm_ep_support_rdma_write(struct efa_rdm_ep *ep)
* @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was.
*/
static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) {
if ((ep->user_info->caps & FI_RMA) == FI_RMA)
if ((ep->base_ep.info->caps & FI_RMA) == FI_RMA)
return 0;
EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n");
return -FI_EOPNOTSUPP;
Expand All @@ -382,7 +375,7 @@ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) {
* @return -FI_EOPNOTSUPP if FI_ATOMIC wasn't requested, 0 if it was.
*/
static inline int efa_rdm_ep_cap_check_atomic(struct efa_rdm_ep *ep) {
if ((ep->user_info->caps & FI_ATOMIC) == FI_ATOMIC)
if ((ep->base_ep.info->caps & FI_ATOMIC) == FI_ATOMIC)
return 0;
EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_ATOMIC capability, which was not requested.\n");
return -FI_EOPNOTSUPP;
Expand Down
37 changes: 13 additions & 24 deletions prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,9 +454,9 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep)
}

/* Max msg size is too large, turn off zcpy recv */
if (ep->max_msg_size > ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size) {
if (ep->max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) {
EFA_INFO(FI_LOG_EP_CTRL, "max_msg_size (%zu) is greater than the mtu size limit: %zu. Zero-copy receive protocol will be disabled.\n",
ep->max_msg_size, ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size);
ep->max_msg_size, ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size);
ep->use_zcpy_rx = false;
goto out;
}
Expand Down Expand Up @@ -552,12 +552,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
efa_rdm_ep->shm_ep = NULL;
}

efa_rdm_ep->user_info = fi_dupinfo(info);
if (!efa_rdm_ep->user_info) {
ret = -FI_ENOMEM;
goto err_free_ep;
}

efa_rdm_ep->host_id = efa_get_host_id(efa_env.host_id_file);
if (efa_rdm_ep->host_id) {
EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id);
Expand All @@ -570,17 +564,15 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
efa_rdm_ep->inject_size = info->tx_attr->inject_size;
efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size;
efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size;
efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit;
efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version);
efa_rdm_ep->shm_permitted = true;
efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size;
efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size;
efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size;
efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size();
efa_rdm_ep->mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size;

efa_rdm_ep->max_data_payload_size = efa_rdm_ep->mtu_size - sizeof(struct efa_rdm_ctsdata_hdr) - sizeof(struct efa_rdm_ctsdata_opt_connid_hdr);
efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_ep->max_proto_hdr_size;
efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_pkt_type_get_max_hdr_size();

if (efa_env.tx_queue_size > 0 &&
efa_env.tx_queue_size < efa_rdm_ep->efa_max_outstanding_tx_ops)
Expand Down Expand Up @@ -1001,9 +993,6 @@ static int efa_rdm_ep_close(struct fid *fid)
if (efa_rdm_ep->pke_vec)
free(efa_rdm_ep->pke_vec);

if (efa_rdm_ep->user_info)
fi_freeinfo(efa_rdm_ep->user_info);

free(efa_rdm_ep);
return retv;
}
Expand Down Expand Up @@ -1139,7 +1128,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep)

use_shm = true;

assert(ep->user_info);
assert(ep->base_ep.info);

/*
* shm provider must make cuda calls to transfer cuda memory.
Expand All @@ -1149,7 +1138,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep)
* AWS Neuron and Habana Synapse, have no SHM provider
* support anyways, so disabling SHM will not impact them.
*/
if (((ep->user_info->caps & FI_HMEM)
if (((ep->base_ep.info->caps & FI_HMEM)
&& hmem_ops[FI_HMEM_CUDA].initialized
&& !ep->cuda_api_permitted)
|| !ep->shm_permitted) {
Expand Down Expand Up @@ -1470,11 +1459,11 @@ static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool sh
*/
static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_size)
{
if (max_msg_size > ep->user_info->ep_attr->max_msg_size) {
if (max_msg_size > ep->base_ep.info->ep_attr->max_msg_size) {
EFA_WARN(FI_LOG_EP_CTRL,
"Requested size of %zu for FI_OPT_MAX_MSG_SIZE "
"exceeds the maximum (%zu)\n",
max_msg_size, ep->user_info->ep_attr->max_msg_size);
max_msg_size, ep->base_ep.info->ep_attr->max_msg_size);
return -FI_EINVAL;
}
ep->max_msg_size = max_msg_size;
Expand All @@ -1496,11 +1485,11 @@ static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_siz
*/
static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_size)
{
if (max_rma_size > ep->user_info->ep_attr->max_msg_size) {
if (max_rma_size > ep->base_ep.info->ep_attr->max_msg_size) {
EFA_WARN(FI_LOG_EP_CTRL,
"Requested size of %zu for FI_OPT_MAX_RMA_SIZE "
"exceeds the maximum (%zu)\n",
max_rma_size, ep->user_info->ep_attr->max_msg_size);
max_rma_size, ep->base_ep.info->ep_attr->max_msg_size);
return -FI_EINVAL;
}
ep->max_rma_size = max_rma_size;
Expand All @@ -1522,11 +1511,11 @@ static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_siz
*/
static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_msg_size)
{
if (inject_msg_size > ep->user_info->tx_attr->inject_size) {
if (inject_msg_size > ep->base_ep.info->tx_attr->inject_size) {
EFA_WARN(FI_LOG_EP_CTRL,
"Requested size of %zu for FI_OPT_INJECT_MSG_SIZE "
"exceeds the maximum (%zu)\n",
inject_msg_size, ep->user_info->tx_attr->inject_size);
inject_msg_size, ep->base_ep.info->tx_attr->inject_size);
return -FI_EINVAL;
}
ep->inject_size = inject_msg_size;
Expand All @@ -1548,11 +1537,11 @@ static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_m
*/
static int efa_rdm_ep_set_inject_rma_size(struct efa_rdm_ep *ep, size_t inject_rma_size)
{
if (inject_rma_size > ep->user_info->tx_attr->inject_size) {
if (inject_rma_size > ep->base_ep.info->tx_attr->inject_size) {
EFA_WARN(FI_LOG_EP_CTRL,
"Requested size of %zu for FI_OPT_INJECT_RMA_SIZE "
"exceeds the maximum (%zu)\n",
inject_rma_size, ep->user_info->tx_attr->inject_size);
inject_rma_size, ep->base_ep.info->tx_attr->inject_size);
return -FI_EINVAL;
}
ep->inject_size = inject_rma_size;
Expand Down
2 changes: 1 addition & 1 deletion prov/efa/src/rdm/efa_rdm_ope.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe,
txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count);
txe->cq_entry.buf = OFI_LIKELY(txe->cq_entry.len > 0) ? txe->iov[0].iov_base : NULL;

if (ep->user_info->mode & FI_MSG_PREFIX) {
if (ep->base_ep.info->mode & FI_MSG_PREFIX) {
ofi_consume_iov_desc(txe->iov, txe->desc, &txe->iov_count, ep->msg_prefix_size);
}
txe->total_len = ofi_total_iov_len(txe->iov, txe->iov_count);
Expand Down
2 changes: 0 additions & 2 deletions prov/efa/src/rdm/efa_rdm_peer.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ struct efa_rdm_peer {
int rnr_queued_pkt_cnt; /**< queued RNR packet count */
struct dlist_entry rnr_backoff_entry; /**< linked to efa_domain->peer_backoff_list */
struct dlist_entry handshake_queued_entry; /**< linked with efa_domain->handshake_queued_peer_list */
struct dlist_entry rx_unexp_list; /**< a list of unexpected untagged rxe for this peer */
struct dlist_entry rx_unexp_tagged_list; /**< a list of unexpected tagged rxe for this peer */
struct dlist_entry txe_list; /**< a list of txe related to this peer */
struct dlist_entry rxe_list; /**< a list of rxe related to this peer */
struct dlist_entry overflow_pke_list; /**< a list of out-of-order pke that overflow the current recvwin */
Expand Down
7 changes: 6 additions & 1 deletion prov/efa/src/rdm/efa_rdm_protocol.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,12 @@ struct efa_ep_addr {
#define EFA_RDM_EXTRA_FEATURE_READ_NACK BIT_ULL(6)
#define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP BIT_ULL(7)
#define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST 8
#define EFA_RDM_MAX_NUM_EXINFO (256)
/*
* The length of 64-bit extra_info array used in efa_rdm_ep
* and efa_rdm_peer
* 4 means 64*4=256 bits of extra features or requests
*/
#define EFA_RDM_MAX_NUM_EXINFO (4)

/*
* Packet type ID of each packet type (section 1.3)
Expand Down
6 changes: 3 additions & 3 deletions prov/efa/test/efa_unit_test_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource,
will_return(efa_mock_ibv_end_poll_check_mock, NULL);
will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND);
will_return(efa_mock_ibv_read_vendor_err_return_mock, vendor_error);
will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
ret = fi_cq_read(resource->cq, &cq_entry, 1);
/* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_id_vec */
assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
Expand Down Expand Up @@ -317,7 +317,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state)
* therefore use will_return_always()
*/
will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
will_return_always(efa_mock_ibv_read_qp_num_return_mock, 0);
will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
/* the recv error will not be reported to the application cq because it's an EFA internal error
* and not related to any application recv. Currently we can only read the error from eq.
Expand Down Expand Up @@ -612,7 +612,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc
will_return(efa_mock_ibv_read_slid_return_mock, 0xffff); // slid=0xffff(-1) indicates an unknown AH
will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size);
will_return_maybe(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, 0);
will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0);
will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn);

Expand Down
8 changes: 4 additions & 4 deletions prov/efa/test/efa_unit_test_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin
will_return(efa_mock_ibv_next_poll_check_function_called_and_return_mock, ENOENT);
will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size);
will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
will_return(efa_mock_ibv_read_wc_flags_return_mock, 0);
will_return(efa_mock_ibv_read_slid_return_mock, efa_rdm_ep_get_peer_ahn(efa_rdm_ep, peer_addr));
will_return(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn);
Expand All @@ -204,7 +204,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin
*/
will_return(efa_mock_ibv_end_poll_check_mock, NULL);
will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND);
will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
will_return(efa_mock_ibv_read_vendor_err_return_mock, FI_EFA_ERR_OTHER);
will_return(efa_mock_ibv_start_poll_return_mock, IBV_WC_SUCCESS);

Expand Down Expand Up @@ -742,7 +742,7 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state)

/* ensure we don't have RMA capability. */
efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
assert_int_equal( efa_rdm_ep->user_info->caps & FI_RMA, 0);
assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_RMA, 0);

/* create a fake peer */
err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
Expand Down Expand Up @@ -793,7 +793,7 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state)

/* ensure we don't have ATOMIC capability. */
efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
assert_int_equal( efa_rdm_ep->user_info->caps & FI_ATOMIC, 0);
assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_ATOMIC, 0);

/* create a fake peer */
err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
Expand Down
Loading