diff --git a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c index 66482b4bc62..497c74cced1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvfrag.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvfrag.c @@ -468,7 +468,7 @@ void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, ); } - /* no need to check if complete we know we are.. */ + /* no need to check if complete we know we are. */ /* don't need a rmb as that is for checking */ recv_request_pml_complete(match); } diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index 70969415c49..1f594cacc10 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -255,6 +255,7 @@ int mca_pml_ob1_recv_request_ack_send_btl( static int mca_pml_ob1_recv_request_ack( mca_pml_ob1_recv_request_t* recvreq, + mca_btl_base_module_t* btl, mca_pml_ob1_rendezvous_hdr_t* hdr, size_t bytes_received) { @@ -315,12 +316,12 @@ static int mca_pml_ob1_recv_request_ack( /* let know to shedule function there is no need to put ACK flag. If not all message went over * RDMA then we cancel the GET protocol in order to switch back to send/recv. In this case send - * back the remote send request, the peer kept a poointer to the frag locally. In the future we + * back the remote send request, the peer kept a pointer to the frag locally. In the future we * might want to cancel the fragment itself, in which case we will have to send back the remote * fragment instead of the remote request. 
*/ recvreq->req_ack_sent = true; - return mca_pml_ob1_recv_request_ack_send(proc, hdr->hdr_src_req.lval, + return mca_pml_ob1_recv_request_ack_send(btl, proc, hdr->hdr_src_req.lval, recvreq, recvreq->req_send_offset, 0, recvreq->req_send_offset == bytes_received); } @@ -356,7 +357,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr } /* tell peer to fall back on send for this region */ - rc = mca_pml_ob1_recv_request_ack_send(proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, + rc = mca_pml_ob1_recv_request_ack_send(NULL, proc, frag->rdma_hdr.hdr_rget.hdr_rndv.hdr_src_req.lval, recvreq, frag->rdma_offset, frag->rdma_length, false); MCA_PML_OB1_RDMA_FRAG_RETURN(frag); return rc; @@ -672,7 +673,7 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) #endif /* OPAL_CUDA_SUPPORT */ { - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + mca_pml_ob1_recv_request_ack(recvreq, btl, &hdr->hdr_rndv, 0); return; } } @@ -815,7 +816,7 @@ void mca_pml_ob1_recv_request_progress_rndv( mca_pml_ob1_recv_request_t* recvreq recvreq->remote_req_send = hdr->hdr_rndv.hdr_src_req; recvreq->req_rdma_offset = bytes_received; MCA_PML_OB1_RECV_REQUEST_MATCHED(recvreq, &hdr->hdr_match); - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, bytes_received); + mca_pml_ob1_recv_request_ack(recvreq, btl, &hdr->hdr_rndv, bytes_received); /** * The PUT protocol do not attach any data to the original request. 
* Therefore, we might want to avoid unpacking if there is nothing to diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.h b/ompi/mca/pml/ob1/pml_ob1_recvreq.h index 0ced47e2915..64fafad250a 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.h +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.h @@ -428,9 +428,11 @@ int mca_pml_ob1_recv_request_ack_send_btl(ompi_proc_t* proc, mca_bml_base_btl_t* bml_btl, uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_rdma_offset, uint64_t size, bool nordma); -static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, - uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, - uint64_t size, bool nordma) +static inline int +mca_pml_ob1_recv_request_ack_send(mca_btl_base_module_t* btl, + ompi_proc_t* proc, + uint64_t hdr_src_req, void *hdr_dst_req, uint64_t hdr_send_offset, + uint64_t size, bool nordma) { size_t i; mca_bml_base_btl_t* bml_btl; @@ -438,11 +440,18 @@ static inline int mca_pml_ob1_recv_request_ack_send(ompi_proc_t* proc, assert (NULL != endpoint); + /** + * If a btl has been requested then send the ack using that specific device, otherwise + * we are free to pick one. We need to force the ack to go over a specific BTL, in order + * to prevent the establishment of new connections during the matching handshake. + */ for(i = 0; i < mca_bml_base_btl_array_get_size(&endpoint->btl_eager); i++) { bml_btl = mca_bml_base_btl_array_get_next(&endpoint->btl_eager); - if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, - hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS) - return OMPI_SUCCESS; + if( (NULL == btl) || (btl == bml_btl->btl) ) { + if(mca_pml_ob1_recv_request_ack_send_btl(proc, bml_btl, hdr_src_req, + hdr_dst_req, hdr_send_offset, size, nordma) == OMPI_SUCCESS) + return OMPI_SUCCESS; + } } MCA_PML_OB1_ADD_ACK_TO_PENDING(proc, hdr_src_req, hdr_dst_req,