Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions ompi/mca/osc/rdma/osc_rdma_active_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,6 +385,8 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "start group size %d", sync->num_peers);

sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

if (0 == ompi_group_size (group)) {
/* nothing more to do. this is an empty start epoch */
OPAL_THREAD_UNLOCK(&module->lock);
Expand All @@ -393,8 +395,6 @@ int ompi_osc_rdma_start_atomic (ompi_group_t *group, int mpi_assert, ompi_win_t

opal_atomic_wmb ();

sync->type = OMPI_OSC_RDMA_SYNC_TYPE_PSCW;

/* prevent us from entering a passive-target, fence, or another pscw access epoch until
* the matching complete is called */
sync->epoch_active = true;
Expand Down Expand Up @@ -466,17 +466,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
sync->type = OMPI_OSC_RDMA_SYNC_TYPE_NONE;
sync->epoch_active = false;

/* phase 2 cleanup group */
OBJ_RELEASE(group);

peers = sync->peer_list.peers;
if (NULL == peers) {
/* empty peer list */
OPAL_THREAD_UNLOCK(&(module->lock));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to release the lock after OBJ_RELEASE(group), right before the return statement? Any chance multiple threads would compete to decrement the group's reference count?

Maybe this is totally irrelevant, I do not know this codebase. In that case just ignore my comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that seems reasonable to me. I'll make the change. Thanks

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The user provided group is unnecessary at this point, releasing it as early as possible seems correct.

In fact, the user provided group is only necessary early on while building the peers array. As we refcount each peer proc explicitly, technically we don't even need to keep a pointer to the group, because this leads to double refcounting the peers. From a performance perspective, I would rather keep and refcount only the group than refcounting each individual procs (because the procs are already refcounted by the group). We could save 2*peers atomic operations per epoch.

OBJ_RELEASE(group);
if(MPI_GROUP_EMPTY != group) {
OBJ_RELEASE(group);
}
return OMPI_SUCCESS;
}

/* phase 2 cleanup group */
OBJ_RELEASE(group);

sync->peer_list.peers = NULL;

OPAL_THREAD_UNLOCK(&(module->lock));
Expand Down
25 changes: 10 additions & 15 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ static int ompi_osc_rdma_component_register (void)
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
free(description_str);

ompi_osc_rdma_btl_alternate_names = "sm,tcp";
ompi_osc_rdma_btl_alternate_names = "sm,self,tcp";
opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying "
"connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str,
Expand Down Expand Up @@ -581,7 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
int my_rank = ompi_comm_rank (module->comm);
int global_size = ompi_comm_size (module->comm);
ompi_osc_rdma_region_t *state_region;
struct _local_data *temp;
struct _local_data *temp = NULL;
char *data_file;
int page_size = opal_getpagesize();

Expand Down Expand Up @@ -624,13 +624,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
}

do {
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
break;
}
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

do {
temp[local_rank].rank = my_rank;
temp[local_rank].size = size;

Expand Down Expand Up @@ -788,10 +787,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
}
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
if (i > 0) {
peer->state_endpoint = local_leader->state_endpoint;
peer->state_btl_index = local_leader->state_btl_index;
}
peer->state_endpoint = local_leader->data_endpoint; // data_endpoint initialized in ompi_osc_rdma_new_peer();
peer->state_btl_index = local_leader->data_btl_index;
}

if (my_rank == peer_rank) {
Expand Down Expand Up @@ -914,10 +911,8 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
{
mca_btl_base_selected_module_t *item;
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
int btls_found = 0;

btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
if (NULL == btls_to_use) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
return OMPI_ERR_UNREACH;
Expand Down
7 changes: 0 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma_dynamic.c
Original file line number Diff line number Diff line change
Expand Up @@ -428,10 +428,6 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module,
}
peer->regions = temp;

/* lock the region */
ompi_osc_rdma_lock_acquire_shared (module, &peer->super, 1, offsetof (ompi_osc_rdma_state_t, regions_lock),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you say a word why you removed the lock? AFAICS, the owner of the region takes an exclusive lock in ompi_osc_rdma_attach and the shared lock here is needed to ensure the consistency of the region information.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm seeing a hang in this code path. From what I can see, because this process already has an exclusive lock on this peer, it can't acquire the shared lock (it thinks someone else owns it), and will loop indefinitely trying to acquire this lock. To me this looks like a double-lock scenario. @hjelmn can probably say if that is intended or not, perhaps there is a different fix needed for this case.

OMPI_OSC_RDMA_LOCK_EXCLUSIVE);

source_address = (uint64_t)(intptr_t) peer->super.state + offsetof (ompi_osc_rdma_state_t, regions);
ret = ompi_osc_get_data_blocking (module, peer->super.state_btl_index, peer->super.state_endpoint,
source_address, peer->super.state_handle, peer->regions, region_len);
Expand All @@ -440,9 +436,6 @@ static int ompi_osc_rdma_refresh_dynamic_region (ompi_osc_rdma_module_t *module,
return ret;
}

/* release the region lock */
ompi_osc_rdma_lock_release_shared (module, &peer->super, -1, offsetof (ompi_osc_rdma_state_t, regions_lock));

/* update cached region ids */
peer->region_id = region_id;
peer->region_count = region_count;
Expand Down
2 changes: 1 addition & 1 deletion opal/mca/btl/self/btl_self.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ static int mca_btl_self_send(struct mca_btl_base_module_t *btl,
if (btl_ownership) {
mca_btl_self_free(btl, des);
}
return 1;
return OPAL_SUCCESS;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is a valid reason to return 1 from this async function. It is to indicate that everything has been taken care of, and the message is already done despite the fact that we expect it to happen in an async way. Without this return the upper level cannot make assumptions about the code in the case where MCA_BTL_DES_SEND_ALWAYS_CALLBACK is not provided.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a byproduct of btl/base now calling btl_send() directly. Here it expects a return code of OPAL_SUCCESS, or else it does not remove the message from the queue. For btl/self this is problematic, and will result in a double free:

https://github.com/open-mpi/ompi/blob/master/opal/mca/btl/base/btl_base_am_rdma.c#L784

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure of the best way to address this, maybe it should always remove it from the queue?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code you pointed to is incorrect, it should not test for OMPI_SUCCESS but should use OPAL_UNLIKELY(rc >= 0) as everywhere else. I think the fix is simple, you can remove the item from the list as soon as the ret is OPAL_UNLIKELY(ret >= 0), as this means either that the message has been already sent, or that there will be a callback once the send completes (but in both cases the BTL will handle the rest of this message lifetime).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hjelmn any comment here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bosilca Yes. We should be checking if the return code is >= 0 not just success. Forgot about that case. 1 means the data is gone.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Making this change back to returning 1 for btl_self_send() seems to open up a whole can of worms for osc/rdma that need to get flushed out. Unfortunately it doesn't seem like a trivial fix - it just exposes other incompatibilities.

}

static int mca_btl_self_sendi(struct mca_btl_base_module_t *btl,
Expand Down
146 changes: 0 additions & 146 deletions opal/mca/btl/tcp/btl_tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ mca_btl_tcp_module_t mca_btl_tcp_module =
.btl_free = mca_btl_tcp_free,
.btl_prepare_src = mca_btl_tcp_prepare_src,
.btl_send = mca_btl_tcp_send,
.btl_put = mca_btl_tcp_put,
.btl_dump = mca_btl_base_dump,
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
},
Expand Down Expand Up @@ -330,151 +329,6 @@ int mca_btl_tcp_send(struct mca_btl_base_module_t *btl, struct mca_btl_base_endp
return mca_btl_tcp_endpoint_send(endpoint, frag);
}

/* Bridge a descriptor-style send completion to the RDMA completion callback
 * stored on the TCP fragment.  TCP emulates put/get via the regular send
 * path, so the real RDMA callback and its state are stashed in frag->cb by
 * mca_btl_tcp_put()/mca_btl_tcp_get() and invoked from here. */
static void fake_rdma_complete(mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
                               mca_btl_base_descriptor_t *descriptor, int status)
{
    mca_btl_tcp_frag_t *tcp_frag = (mca_btl_tcp_frag_t *) descriptor;
    void *local_address = tcp_frag->segments[0].seg_addr.pval;

    tcp_frag->cb.func(btl, endpoint, local_address, NULL, tcp_frag->cb.context,
                      tcp_frag->cb.data, status);
}

/**
 * Initiate an "asynchronous put" over TCP.
 *
 * TCP has no true RDMA, so the put is emulated as an active message: the
 * fragment carries the BTL header (MCA_BTL_TCP_HDR_TYPE_PUT), the remote
 * segment descriptor, and the local payload, all pushed through the regular
 * endpoint send path.  The caller's RDMA completion callback is stashed in
 * frag->cb and fired from fake_rdma_complete() via the descriptor callback.
 *
 * @param btl            TCP BTL module
 * @param endpoint       destination endpoint
 * @param local_address  local source buffer
 * @param remote_address destination address on the peer
 * @param local_handle   unused here (TCP needs no memory registration)
 * @param remote_handle  unused here
 * @param size           number of bytes to transfer
 * @param flags          unused here
 * @param order          unused here
 * @param cbfunc         RDMA completion callback
 * @param cbcontext      callback context
 * @param cbdata         callback data
 *
 * @return OPAL_SUCCESS if the fragment was sent or queued,
 *         OPAL_ERR_OUT_OF_RESOURCE if no fragment could be allocated, or
 *         the negative error code from mca_btl_tcp_endpoint_send().
 */
int mca_btl_tcp_put(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                    void *local_address, uint64_t remote_address,
                    mca_btl_base_registration_handle_t *local_handle,
                    mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                    int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
                    void *cbdata)
{
    mca_btl_tcp_module_t *tcp_btl = (mca_btl_tcp_module_t *) btl;
    mca_btl_tcp_frag_t *frag = NULL;
    int rc, i;

    MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    frag->base.des_segments = frag->segments;
    frag->base.des_segment_count = 1;
    frag->base.order = MCA_BTL_NO_ORDER;

    /* segment 0 describes the local source data */
    frag->segments[0].seg_addr.pval = local_address;
    frag->segments[0].seg_len = size;

    /* segment 1 describes the remote destination; it travels on the wire,
     * so convert it to network byte order for heterogeneous peers */
    frag->segments[1].seg_addr.lval = remote_address;
    frag->segments[1].seg_len = size;
    if (endpoint->endpoint_nbo) {
        MCA_BTL_BASE_SEGMENT_HTON(frag->segments[1]);
    }

    /* route the descriptor completion through fake_rdma_complete so the
     * caller's RDMA callback is always invoked */
    frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    frag->base.des_cbfunc = fake_rdma_complete;

    frag->cb.func = cbfunc;
    frag->cb.data = cbdata;
    frag->cb.context = cbcontext;

    frag->btl = tcp_btl;
    frag->endpoint = endpoint;
    frag->rc = 0;
    frag->iov_idx = 0;
    frag->hdr.size = 0;
    /* iovec layout: [0] BTL header, [1] remote segment descriptor,
     * [2..] local payload (one entry per local segment) */
    frag->iov_cnt = 2;
    frag->iov_ptr = frag->iov;
    frag->iov[0].iov_base = (IOVBASE_TYPE *) &frag->hdr;
    frag->iov[0].iov_len = sizeof(frag->hdr);
    frag->iov[1].iov_base = (IOVBASE_TYPE *) (frag->segments + 1);
    frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
    for (i = 0; i < (int) frag->base.des_segment_count; i++) {
        frag->hdr.size += frag->segments[i].seg_len;
        frag->iov[i + 2].iov_len = frag->segments[i].seg_len;
        frag->iov[i + 2].iov_base = (IOVBASE_TYPE *) frag->segments[i].seg_addr.pval;
        frag->iov_cnt++;
    }
    frag->hdr.base.tag = MCA_BTL_TAG_BTL;
    frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_PUT;
    frag->hdr.count = 1; /* one segment descriptor follows the header */
    if (endpoint->endpoint_nbo) {
        MCA_BTL_TCP_HDR_HTON(frag->hdr);
    }

    /* rc >= 0 means the data was sent or queued (completion callback will
     * follow); either way the put has been accepted */
    rc = mca_btl_tcp_endpoint_send(endpoint, frag);
    return (rc >= 0) ? OPAL_SUCCESS : rc;
}

/**
 * Initiate an "asynchronous get" over TCP.
 *
 * As with put, TCP has no real RDMA: the get is emulated by sending a
 * request message (MCA_BTL_TCP_HDR_TYPE_GET) containing the remote segment
 * descriptor; only the header and that descriptor go on the wire here (no
 * payload — the data comes back from the peer).  The caller's RDMA
 * completion callback is stashed in frag->cb and fired through
 * fake_rdma_complete() via the descriptor callback.
 *
 * @param btl            TCP BTL module
 * @param endpoint       peer endpoint to read from
 * @param local_address  local destination buffer
 * @param remote_address source address on the peer
 * @param local_handle   unused here (TCP needs no memory registration)
 * @param remote_handle  unused here
 * @param size           number of bytes to transfer
 * @param flags          unused here
 * @param order          unused here
 * @param cbfunc         RDMA completion callback
 * @param cbcontext      callback context
 * @param cbdata         callback data
 *
 * @return OPAL_SUCCESS if the request was sent or queued,
 *         OPAL_ERR_OUT_OF_RESOURCE if no fragment could be allocated, or
 *         the negative error code from mca_btl_tcp_endpoint_send().
 */
int mca_btl_tcp_get(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
                    void *local_address, uint64_t remote_address,
                    mca_btl_base_registration_handle_t *local_handle,
                    mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                    int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
                    void *cbdata)
{
    mca_btl_tcp_module_t *tcp_btl = (mca_btl_tcp_module_t *) btl;
    mca_btl_tcp_frag_t *frag = NULL;
    int rc;

    MCA_BTL_TCP_FRAG_ALLOC_USER(frag);
    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    frag->base.des_segments = frag->segments;
    frag->base.des_segment_count = 1;
    frag->base.order = MCA_BTL_NO_ORDER;

    /* segment 0 describes the local destination buffer */
    frag->segments[0].seg_addr.pval = local_address;
    frag->segments[0].seg_len = size;

    /* segment 1 describes the remote source to be read */
    frag->segments[1].seg_addr.lval = remote_address;
    frag->segments[1].seg_len = size;
    /* NOTE(review): unlike mca_btl_tcp_put(), segments[1] is not converted
     * with MCA_BTL_BASE_SEGMENT_HTON when endpoint->endpoint_nbo is set —
     * confirm the receive side handles byte order for GET requests */

    /* call the rdma callback through the descriptor callback. this is
     * tcp so the extra latency is not an issue */
    frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP | MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    frag->base.des_cbfunc = fake_rdma_complete;

    frag->cb.func = cbfunc;
    frag->cb.data = cbdata;
    frag->cb.context = cbcontext;

    frag->btl = tcp_btl;
    frag->endpoint = endpoint;
    frag->rc = 0;
    frag->iov_idx = 0;
    frag->hdr.size = 0;
    /* iovec layout: [0] BTL header, [1] remote segment descriptor; no
     * payload accompanies a get request */
    frag->iov_cnt = 2;
    frag->iov_ptr = frag->iov;
    frag->iov[0].iov_base = (IOVBASE_TYPE *) &frag->hdr;
    frag->iov[0].iov_len = sizeof(frag->hdr);
    frag->iov[1].iov_base = (IOVBASE_TYPE *) &frag->segments[1];
    frag->iov[1].iov_len = sizeof(mca_btl_base_segment_t);
    frag->hdr.base.tag = MCA_BTL_TAG_BTL;
    frag->hdr.type = MCA_BTL_TCP_HDR_TYPE_GET;
    frag->hdr.count = 1; /* one segment descriptor follows the header */
    if (endpoint->endpoint_nbo) {
        MCA_BTL_TCP_HDR_HTON(frag->hdr);
    }

    /* rc >= 0 means the request was sent or queued; either way the get has
     * been accepted and the callback will complete it */
    rc = mca_btl_tcp_endpoint_send(endpoint, frag);
    return (rc >= 0) ? OPAL_SUCCESS : rc;
}

/*
* Cleanup/release module resources.
*/
Expand Down
22 changes: 0 additions & 22 deletions opal/mca/btl/tcp/btl_tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,28 +251,6 @@ extern int mca_btl_tcp_send(struct mca_btl_base_module_t *btl,
struct mca_btl_base_endpoint_t *btl_peer,
struct mca_btl_base_descriptor_t *descriptor, mca_btl_base_tag_t tag);

/**
* Initiate an asynchronous put.
*/

int mca_btl_tcp_put(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
void *cbdata);

/**
* Initiate an asynchronous get.
*/

int mca_btl_tcp_get(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address,
mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
void *cbdata);

/**
* Allocate a descriptor with a segment of the requested size.
* Note that the BTL layer may choose to return a smaller size
Expand Down
19 changes: 0 additions & 19 deletions opal/mca/btl/tcp/btl_tcp_frag.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,25 +288,6 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t *frag, int sd)
goto repeat;
}
break;
case MCA_BTL_TCP_HDR_TYPE_PUT:
if (frag->iov_idx == 1) {
frag->iov[1].iov_base = (IOVBASE_TYPE *) frag->segments;
frag->iov[1].iov_len = frag->hdr.count * sizeof(mca_btl_base_segment_t);
frag->iov_cnt++;
goto repeat;
} else if (frag->iov_idx == 2) {
for (i = 0; i < frag->hdr.count; i++) {
if (btl_endpoint->endpoint_nbo) {
MCA_BTL_BASE_SEGMENT_NTOH(frag->segments[i]);
}
frag->iov[i + 2].iov_base = (IOVBASE_TYPE *) frag->segments[i].seg_addr.pval;
frag->iov[i + 2].iov_len = frag->segments[i].seg_len;
}
frag->iov_cnt += frag->hdr.count;
goto repeat;
}
break;
case MCA_BTL_TCP_HDR_TYPE_GET:
default:
break;
}
Expand Down
2 changes: 0 additions & 2 deletions opal/mca/btl/tcp/btl_tcp_hdr.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ BEGIN_C_DECLS
*/

#define MCA_BTL_TCP_HDR_TYPE_SEND 1
#define MCA_BTL_TCP_HDR_TYPE_PUT 2
#define MCA_BTL_TCP_HDR_TYPE_GET 3
#define MCA_BTL_TCP_HDR_TYPE_FIN 4
/* The MCA_BTL_TCP_HDR_TYPE_FIN is a special kind of message sent during normal
* connection closing. Before the endpoint closes the socket, it performs a
Expand Down