ofiwg · shijin-aws · Sep 19, 2024 · Sep 13, 2024 · Sep 13, 2024 · Sep 16, 2024
diff --git a/prov/efa/docs/efa_rdm_protocol_v4.md b/prov/efa/docs/efa_rdm_protocol_v4.md
@@ -414,7 +414,7 @@ Note, the field `extra_info` was named `features` when protocol v4 was initially
 only planned for extra features. Later, we discovered that the handshake subprotocol can also be used to pass
 additional request information, thus introduced the concept of "extra request" and renamed this field `extra_info`.
 
-`nextra_p3` is number of `extra_info` flags of the endpoint plus 3. The "plus 3" is for historical reasons.
+`nextra_p3` is the number of 64-bit `extra_info` elements of the endpoint plus 3. The "plus 3" is for historical reasons.
 When protocol v4 was initially introduced, this field is named `maxproto`. The original plan was that protocol
 v4 can only have 64 extra features/requests. If the number of extra feature/request ever exceeds 64, the next
 feature/request will be defined as version 5 feature/request, (version 6 if the number exceeds 128, so on so

diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h
@@ -91,12 +91,6 @@ struct efa_rdm_ep {
 	/* Applicaiton's message prefix size. */
 	size_t msg_prefix_size;
 
-	/* EFA RDM protocol's max header size */
-	size_t max_proto_hdr_size;
-
-	/* tx iov limit of EFA device */
-	size_t efa_device_iov_limit;
-
 	/* threshold to release multi_recv buffer */
 	size_t min_multi_recv_size;
 
@@ -193,7 +187,6 @@ struct efa_rdm_ep {
 	*/
 	bool use_device_rdma;
 
-	struct fi_info *user_info; /**< fi_info passed by user when calling fi_endpoint */
 	bool sendrecv_in_order_aligned_128_bytes; /**< whether to support in order send/recv of each aligned 128 bytes memory region */
 	bool write_in_order_aligned_128_bytes; /**< whether to support in order write of each aligned 128 bytes memory region */
 	char err_msg[EFA_RDM_ERROR_MSG_BUFFER_LENGTH]; /* A large enough buffer to store CQ/EQ error data used by e.g. fi_cq_readerr */
@@ -246,7 +239,7 @@ static inline size_t efa_rdm_ep_get_tx_pool_size(struct efa_rdm_ep *ep)
 
 static inline int efa_rdm_ep_need_sas(struct efa_rdm_ep *ep)
 {
-	return ((ep->user_info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->user_info->rx_attr->msg_order & FI_ORDER_SAS));
+	return ((ep->base_ep.info->tx_attr->msg_order & FI_ORDER_SAS) || (ep->base_ep.info->rx_attr->msg_order & FI_ORDER_SAS));
 }
 
 
@@ -371,7 +364,7 @@ bool efa_rdm_ep_support_rdma_write(struct efa_rdm_ep *ep)
  * @return -FI_EOPNOTSUPP if FI_RMA wasn't requested, 0 if it was.
  */
 static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) {
-	if ((ep->user_info->caps & FI_RMA) == FI_RMA)
+	if ((ep->base_ep.info->caps & FI_RMA) == FI_RMA)
 		return 0;
 	EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_RMA capability, which was not requested.\n");
 	return -FI_EOPNOTSUPP;
@@ -382,7 +375,7 @@ static inline int efa_rdm_ep_cap_check_rma(struct efa_rdm_ep *ep) {
  * @return -FI_EOPNOTSUPP if FI_ATOMIC wasn't requested, 0 if it was.
  */
 static inline int efa_rdm_ep_cap_check_atomic(struct efa_rdm_ep *ep) {
-	if ((ep->user_info->caps & FI_ATOMIC) == FI_ATOMIC)
+	if ((ep->base_ep.info->caps & FI_ATOMIC) == FI_ATOMIC)
 		return 0;
 	EFA_WARN_ONCE(FI_LOG_EP_DATA, "Operation requires FI_ATOMIC capability, which was not requested.\n");
 	return -FI_EOPNOTSUPP;

diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c
@@ -454,9 +454,9 @@ void efa_rdm_ep_set_use_zcpy_rx(struct efa_rdm_ep *ep)
 	}
 
 	/* Max msg size is too large, turn off zcpy recv */
-	if (ep->max_msg_size > ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size) {
+	if (ep->max_msg_size > ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size) {
 		EFA_INFO(FI_LOG_EP_CTRL, "max_msg_size (%zu) is greater than the mtu size limit: %zu. Zero-copy receive protocol will be disabled.\n",
-			ep->max_msg_size, ep->mtu_size - ep->user_info->ep_attr->msg_prefix_size);
+			ep->max_msg_size, ep->mtu_size - ep->base_ep.info->ep_attr->msg_prefix_size);
 		ep->use_zcpy_rx = false;
 		goto out;
 	}
@@ -552,12 +552,6 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
 		efa_rdm_ep->shm_ep = NULL;
 	}
 
-	efa_rdm_ep->user_info = fi_dupinfo(info);
-	if (!efa_rdm_ep->user_info) {
-		ret = -FI_ENOMEM;
-		goto err_free_ep;
-	}
-
 	efa_rdm_ep->host_id = efa_get_host_id(efa_env.host_id_file);
 	if (efa_rdm_ep->host_id) {
 		EFA_INFO(FI_LOG_EP_CTRL, "efa_rdm_ep->host_id: i-%017lx\n", efa_rdm_ep->host_id);
@@ -570,17 +564,15 @@ int efa_rdm_ep_open(struct fid_domain *domain, struct fi_info *info,
 	efa_rdm_ep->inject_size = info->tx_attr->inject_size;
 	efa_rdm_ep->efa_max_outstanding_tx_ops = efa_domain->device->rdm_info->tx_attr->size;
 	efa_rdm_ep->efa_max_outstanding_rx_ops = efa_domain->device->rdm_info->rx_attr->size;
-	efa_rdm_ep->efa_device_iov_limit = efa_domain->device->rdm_info->tx_attr->iov_limit;
 	efa_rdm_ep->use_device_rdma = efa_rdm_get_use_device_rdma(info->fabric_attr->api_version);
 	efa_rdm_ep->shm_permitted = true;
 	efa_rdm_ep->max_msg_size = info->ep_attr->max_msg_size;
 	efa_rdm_ep->max_rma_size = info->ep_attr->max_msg_size;
 	efa_rdm_ep->msg_prefix_size = info->ep_attr->msg_prefix_size;
-	efa_rdm_ep->max_proto_hdr_size = efa_rdm_pkt_type_get_max_hdr_size();
 	efa_rdm_ep->mtu_size = efa_domain->device->rdm_info->ep_attr->max_msg_size;
 
 	efa_rdm_ep->max_data_payload_size = efa_rdm_ep->mtu_size - sizeof(struct efa_rdm_ctsdata_hdr) - sizeof(struct efa_rdm_ctsdata_opt_connid_hdr);
-	efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_ep->max_proto_hdr_size;
+	efa_rdm_ep->min_multi_recv_size = efa_rdm_ep->mtu_size - efa_rdm_pkt_type_get_max_hdr_size();
 
 	if (efa_env.tx_queue_size > 0 &&
 	    efa_env.tx_queue_size < efa_rdm_ep->efa_max_outstanding_tx_ops)
@@ -1001,9 +993,6 @@ static int efa_rdm_ep_close(struct fid *fid)
 	if (efa_rdm_ep->pke_vec)
 		free(efa_rdm_ep->pke_vec);
 
-	if (efa_rdm_ep->user_info)
-		fi_freeinfo(efa_rdm_ep->user_info);
-
 	free(efa_rdm_ep);
 	return retv;
 }
@@ -1139,7 +1128,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep)
 
 	use_shm = true;
 
-	assert(ep->user_info);
+	assert(ep->base_ep.info);
 
 	/*
 	 * shm provider must make cuda calls to transfer cuda memory.
@@ -1149,7 +1138,7 @@ void efa_rdm_ep_update_shm(struct efa_rdm_ep *ep)
 	 * AWS Neuron and Habana Synapse, have no SHM provider
 	 * support anyways, so disabling SHM will not impact them.
 	 */
-	if (((ep->user_info->caps & FI_HMEM)
+	if (((ep->base_ep.info->caps & FI_HMEM)
 	    && hmem_ops[FI_HMEM_CUDA].initialized
 	    && !ep->cuda_api_permitted)
 		|| !ep->shm_permitted) {
@@ -1470,11 +1459,11 @@ static int efa_rdm_ep_set_shared_memory_permitted(struct efa_rdm_ep *ep, bool sh
  */
 static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_size)
 {
-	if (max_msg_size > ep->user_info->ep_attr->max_msg_size) {
+	if (max_msg_size > ep->base_ep.info->ep_attr->max_msg_size) {
 		EFA_WARN(FI_LOG_EP_CTRL,
 			"Requested size of %zu for FI_OPT_MAX_MSG_SIZE "
 			"exceeds the maximum (%zu)\n",
-			max_msg_size, ep->user_info->ep_attr->max_msg_size);
+			max_msg_size, ep->base_ep.info->ep_attr->max_msg_size);
 		return -FI_EINVAL;
 	}
 	ep->max_msg_size = max_msg_size;
@@ -1496,11 +1485,11 @@ static int efa_rdm_ep_set_max_msg_size(struct efa_rdm_ep *ep, size_t max_msg_siz
  */
 static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_size)
 {
-	if (max_rma_size > ep->user_info->ep_attr->max_msg_size) {
+	if (max_rma_size > ep->base_ep.info->ep_attr->max_msg_size) {
 		EFA_WARN(FI_LOG_EP_CTRL,
 			"Requested size of %zu for FI_OPT_MAX_RMA_SIZE "
 			"exceeds the maximum (%zu)\n",
-			max_rma_size, ep->user_info->ep_attr->max_msg_size);
+			max_rma_size, ep->base_ep.info->ep_attr->max_msg_size);
 		return -FI_EINVAL;
 	}
 	ep->max_rma_size = max_rma_size;
@@ -1522,11 +1511,11 @@ static int efa_rdm_ep_set_max_rma_size(struct efa_rdm_ep *ep, size_t max_rma_siz
  */
 static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_msg_size)
 {
-	if (inject_msg_size > ep->user_info->tx_attr->inject_size) {
+	if (inject_msg_size > ep->base_ep.info->tx_attr->inject_size) {
 		EFA_WARN(FI_LOG_EP_CTRL,
 			"Requested size of %zu for FI_OPT_INJECT_MSG_SIZE "
 			"exceeds the maximum (%zu)\n",
-			inject_msg_size, ep->user_info->tx_attr->inject_size);
+			inject_msg_size, ep->base_ep.info->tx_attr->inject_size);
 		return -FI_EINVAL;
 	}
 	ep->inject_size = inject_msg_size;
@@ -1548,11 +1537,11 @@ static int efa_rdm_ep_set_inject_msg_size(struct efa_rdm_ep *ep, size_t inject_m
  */
 static int efa_rdm_ep_set_inject_rma_size(struct efa_rdm_ep *ep, size_t inject_rma_size)
 {
-	if (inject_rma_size > ep->user_info->tx_attr->inject_size) {
+	if (inject_rma_size > ep->base_ep.info->tx_attr->inject_size) {
 		EFA_WARN(FI_LOG_EP_CTRL,
 			"Requested size of %zu for FI_OPT_INJECT_RMA_SIZE "
 			"exceeds the maximum (%zu)\n",
-			inject_rma_size, ep->user_info->tx_attr->inject_size);
+			inject_rma_size, ep->base_ep.info->tx_attr->inject_size);
 		return -FI_EINVAL;
 	}
 	ep->inject_size = inject_rma_size;

diff --git a/prov/efa/src/rdm/efa_rdm_ope.c b/prov/efa/src/rdm/efa_rdm_ope.c
@@ -58,7 +58,7 @@ void efa_rdm_txe_construct(struct efa_rdm_ope *txe,
 	txe->cq_entry.len = ofi_total_iov_len(txe->iov, txe->iov_count);
 	txe->cq_entry.buf = OFI_LIKELY(txe->cq_entry.len > 0) ? txe->iov[0].iov_base : NULL;
 
-	if (ep->user_info->mode & FI_MSG_PREFIX) {
+	if (ep->base_ep.info->mode & FI_MSG_PREFIX) {
 		ofi_consume_iov_desc(txe->iov, txe->desc, &txe->iov_count, ep->msg_prefix_size);
 	}
 	txe->total_len = ofi_total_iov_len(txe->iov, txe->iov_count);

diff --git a/prov/efa/src/rdm/efa_rdm_peer.h b/prov/efa/src/rdm/efa_rdm_peer.h
@@ -60,8 +60,6 @@ struct efa_rdm_peer {
 	int rnr_queued_pkt_cnt;		/**< queued RNR packet count */
 	struct dlist_entry rnr_backoff_entry;	/**< linked to efa_domain->peer_backoff_list */
 	struct dlist_entry handshake_queued_entry; /**< linked with efa_domain->handshake_queued_peer_list */
-	struct dlist_entry rx_unexp_list; /**< a list of unexpected untagged rxe for this peer */
-	struct dlist_entry rx_unexp_tagged_list; /**< a list of unexpected tagged rxe for this peer */
 	struct dlist_entry txe_list; /**< a list of txe related to this peer */
 	struct dlist_entry rxe_list; /**< a list of rxe relased to this peer */
 	struct dlist_entry overflow_pke_list; /**< a list of out-of-order pke that overflow the current recvwin */

diff --git a/prov/efa/src/rdm/efa_rdm_protocol.h b/prov/efa/src/rdm/efa_rdm_protocol.h
@@ -41,7 +41,12 @@ struct efa_ep_addr {
 #define EFA_RDM_EXTRA_FEATURE_READ_NACK		BIT_ULL(6)
 #define EFA_RDM_EXTRA_FEATURE_REQUEST_USER_RECV_QP	BIT_ULL(7)
 #define EFA_RDM_NUM_EXTRA_FEATURE_OR_REQUEST		8
-#define EFA_RDM_MAX_NUM_EXINFO				(256)
+/*
+ * The length of the 64-bit extra_info array used in efa_rdm_ep
+ * and efa_rdm_peer.
+ * 4 means 64*4=256 bits of extra features or requests.
+ */
+#define EFA_RDM_MAX_NUM_EXINFO				(4)
 
 /*
  * Packet type ID of each packet type (section 1.3)

diff --git a/prov/efa/test/efa_unit_test_cq.c b/prov/efa/test/efa_unit_test_cq.c
@@ -153,7 +153,7 @@ static void test_rdm_cq_read_bad_send_status(struct efa_resource *resource,
 	will_return(efa_mock_ibv_end_poll_check_mock, NULL);
 	will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND);
 	will_return(efa_mock_ibv_read_vendor_err_return_mock, vendor_error);
-	will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
+	will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
 	ret = fi_cq_read(resource->cq, &cq_entry, 1);
 	/* fi_cq_read() called efa_mock_ibv_start_poll_use_saved_send_wr(), which pulled one send_wr from g_ibv_submitted_wr_idv=_vec */
 	assert_int_equal(g_ibv_submitted_wr_id_cnt, 0);
@@ -317,7 +317,7 @@ void test_ibv_cq_ex_read_bad_recv_status(struct efa_resource **state)
 	 * therefore use will_return_always()
 	 */
 	will_return_always(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
-	will_return_always(efa_mock_ibv_read_qp_num_return_mock, 0);
+	will_return_always(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
 	will_return(efa_mock_ibv_read_vendor_err_return_mock, EFA_IO_COMP_STATUS_LOCAL_ERROR_UNRESP_REMOTE);
 	/* the recv error will not populate to application cq because it's an EFA internal error and
 	 * and not related to any application recv. Currently we can only read the error from eq.
@@ -612,7 +612,7 @@ static void test_impl_ibv_cq_ex_read_unknow_peer_ah(struct efa_resource *resourc
 	will_return(efa_mock_ibv_read_slid_return_mock, 0xffff); // slid=0xffff(-1) indicates an unknown AH
 	will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size);
 	will_return_maybe(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
-	will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, 0);
+	will_return_maybe(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
 	will_return_maybe(efa_mock_ibv_read_wc_flags_return_mock, 0);
 	will_return_maybe(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn);
 

diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c
@@ -192,7 +192,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin
 	will_return(efa_mock_ibv_next_poll_check_function_called_and_return_mock, ENOENT);
 	will_return(efa_mock_ibv_read_byte_len_return_mock, pkt_entry->pkt_size);
 	will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_RECV);
-	will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
+	will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
 	will_return(efa_mock_ibv_read_wc_flags_return_mock, 0);
 	will_return(efa_mock_ibv_read_slid_return_mock, efa_rdm_ep_get_peer_ahn(efa_rdm_ep, peer_addr));
 	will_return(efa_mock_ibv_read_src_qp_return_mock, raw_addr.qpn);
@@ -204,7 +204,7 @@ void test_efa_rdm_ep_handshake_exchange_host_id(struct efa_resource **state, uin
 	 */
 	will_return(efa_mock_ibv_end_poll_check_mock, NULL);
 	will_return(efa_mock_ibv_read_opcode_return_mock, IBV_WC_SEND);
-	will_return(efa_mock_ibv_read_qp_num_return_mock, 0);
+	will_return(efa_mock_ibv_read_qp_num_return_mock, efa_rdm_ep->base_ep.qp->qp_num);
 	will_return(efa_mock_ibv_read_vendor_err_return_mock, FI_EFA_ERR_OTHER);
 	will_return(efa_mock_ibv_start_poll_return_mock, IBV_WC_SUCCESS);
 
@@ -742,7 +742,7 @@ void test_efa_rdm_ep_rma_without_caps(struct efa_resource **state)
 
 	/* ensure we don't have RMA capability. */
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-	assert_int_equal( efa_rdm_ep->user_info->caps & FI_RMA, 0);
+	assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_RMA, 0);
 
 	/* create a fake peer */
 	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);
@@ -793,7 +793,7 @@ void test_efa_rdm_ep_atomic_without_caps(struct efa_resource **state)
 
 	/* ensure we don't have ATOMIC capability. */
 	efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid);
-	assert_int_equal( efa_rdm_ep->user_info->caps & FI_ATOMIC, 0);
+	assert_int_equal( efa_rdm_ep->base_ep.info->caps & FI_ATOMIC, 0);
 
 	/* create a fake peer */
 	err = fi_getname(&resource->ep->fid, &raw_addr, &raw_addr_len);