Skip to content

Commit 9bae131

Browse files
committed
btl/openib: fix message coalescing
There was a bug in the openib btl handling this valid sequence of calls: desc = btl_alloc (); btl_free (desc); When triggered the bug would cause either fragment loss or undefined behavior (SEGV, etc). The problem occured because btl_alloc contained the logic to modify the pending fragment (length, etc) and these changes were not corrected if the fragment was freed instead of sent. To fix this issue I 1) moved some of the coalescing logic to the btl_send function, and 2) retry the coalesced fragment on btl_free if it was never sent. This appears to completely address the issue.
1 parent 9aaac11 commit 9bae131

File tree

2 files changed

+46
-19
lines changed

2 files changed

+46
-19
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,15 +1133,16 @@ ib_frag_alloc(mca_btl_openib_module_t *btl, size_t size, uint8_t order,
11331133

11341134
/* check if pending fragment has enough space for coalescing */
11351135
static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
1136-
opal_mutex_t *lock, mca_btl_base_endpoint_t *ep, size_t size)
1136+
opal_mutex_t *lock, struct mca_btl_base_endpoint_t *ep, size_t size,
1137+
mca_btl_openib_coalesced_frag_t **cfrag)
11371138
{
11381139
mca_btl_openib_send_frag_t *frag = NULL;
11391140

1140-
if(opal_list_is_empty(frag_list))
1141+
if (opal_list_is_empty(frag_list))
11411142
return NULL;
11421143

11431144
OPAL_THREAD_LOCK(lock);
1144-
if(!opal_list_is_empty(frag_list)) {
1145+
if (!opal_list_is_empty(frag_list)) {
11451146
int qp;
11461147
size_t total_length;
11471148
opal_list_item_t *i = opal_list_get_first(frag_list);
@@ -1158,10 +1159,20 @@ static mca_btl_openib_send_frag_t *check_coalescing(opal_list_t *frag_list,
11581159

11591160
qp = to_base_frag(frag)->base.order;
11601161

1161-
if(total_length <= mca_btl_openib_component.qp_infos[qp].size)
1162-
opal_list_remove_first(frag_list);
1163-
else
1162+
if(total_length <= mca_btl_openib_component.qp_infos[qp].size) {
1163+
/* make sure we can allocate a coalescing frag before returning success */
1164+
*cfrag = alloc_coalesced_frag();
1165+
if (OPAL_LIKELY(NULL != cfrag)) {
1166+
(*cfrag)->send_frag = frag;
1167+
(*cfrag)->sent = false;
1168+
1169+
opal_list_remove_first(frag_list);
1170+
} else {
1171+
frag = NULL;
1172+
}
1173+
} else {
11641174
frag = NULL;
1175+
}
11651176
}
11661177
OPAL_THREAD_UNLOCK(lock);
11671178

@@ -1188,7 +1199,7 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
11881199
mca_btl_openib_module_t *obtl = (mca_btl_openib_module_t*)btl;
11891200
int qp = frag_size_to_order(obtl, size);
11901201
mca_btl_openib_send_frag_t *sfrag = NULL;
1191-
mca_btl_openib_coalesced_frag_t *cfrag;
1202+
mca_btl_openib_coalesced_frag_t *cfrag = NULL;
11921203

11931204
assert(qp != MCA_BTL_NO_ORDER);
11941205

@@ -1197,26 +1208,25 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
11971208
int prio = !(flags & MCA_BTL_DES_FLAGS_PRIORITY);
11981209

11991210
sfrag = check_coalescing(&ep->qps[qp].no_wqe_pending_frags[prio],
1200-
&ep->endpoint_lock, ep, size);
1211+
&ep->endpoint_lock, ep, size, &cfrag);
12011212

1202-
if(NULL == sfrag) {
1213+
if (NULL == sfrag) {
12031214
if(BTL_OPENIB_QP_TYPE_PP(qp)) {
12041215
sfrag = check_coalescing(&ep->qps[qp].no_credits_pending_frags[prio],
1205-
&ep->endpoint_lock, ep, size);
1216+
&ep->endpoint_lock, ep, size, &cfrag);
12061217
} else {
12071218
sfrag = check_coalescing(
12081219
&obtl->qps[qp].u.srq_qp.pending_frags[prio],
1209-
&obtl->ib_lock, ep, size);
1220+
&obtl->ib_lock, ep, size, &cfrag);
12101221
}
12111222
}
12121223
}
12131224

1214-
if(NULL == sfrag)
1225+
if (NULL == sfrag) {
12151226
return ib_frag_alloc((mca_btl_openib_module_t*)btl, size, order, flags);
1227+
}
12161228

12171229
/* begin coalescing message */
1218-
cfrag = alloc_coalesced_frag();
1219-
cfrag->send_frag = sfrag;
12201230

12211231
/* fix up new coalescing header if this is the first coalesced frag */
12221232
if(sfrag->hdr != sfrag->chdr) {
@@ -1250,10 +1260,9 @@ mca_btl_base_descriptor_t* mca_btl_openib_alloc(
12501260
to_base_frag(cfrag)->segment.base.seg_addr.pval = cfrag->hdr + 1;
12511261
to_base_frag(cfrag)->segment.base.seg_len = size;
12521262

1253-
/* save coalesced fragment on a main fragment; we will need it after send
1254-
* completion to free it and to call upper layer callback */
1255-
opal_list_append(&sfrag->coalesced_frags, (opal_list_item_t*)cfrag);
1256-
sfrag->coalesced_length += (size+sizeof(mca_btl_openib_header_coalesced_t));
1263+
/* NTH: there is no reason to append the coalesced fragment here. No more
1264+
* fragments will be added until either send or free has been called on
1265+
* the coalesced frag. */
12571266

12581267
to_base_frag(cfrag)->base.des_flags = flags;
12591268

@@ -1306,6 +1315,15 @@ int mca_btl_openib_free(
13061315
default:
13071316
break;
13081317
}
1318+
1319+
if (openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED && !to_coalesced_frag(des)->sent) {
1320+
mca_btl_openib_send_frag_t *sfrag = to_coalesced_frag(des)->send_frag;
1321+
1322+
/* the coalesced fragment would have sent the original fragment but that
1323+
* will not happen so send the fragment now */
1324+
mca_btl_openib_endpoint_send(to_com_frag(sfrag)->endpoint, sfrag);
1325+
}
1326+
13091327
MCA_BTL_IB_FRAG_RETURN(des);
13101328

13111329
return OPAL_SUCCESS;
@@ -1888,11 +1906,19 @@ int mca_btl_openib_send(
18881906
openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED);
18891907

18901908
if(openib_frag_type(des) == MCA_BTL_OPENIB_FRAG_COALESCED) {
1909+
frag = to_coalesced_frag(des)->send_frag;
1910+
1911+
/* save coalesced fragment on a main fragment; we will need it after send
1912+
* completion to free it and to call upper layer callback */
1913+
opal_list_append(&frag->coalesced_frags, (opal_list_item_t*) des);
1914+
frag->coalesced_length += to_coalesced_frag(des)->hdr->alloc_size +
1915+
sizeof(mca_btl_openib_header_coalesced_t);
1916+
1917+
to_coalesced_frag(des)->sent = true;
18911918
to_coalesced_frag(des)->hdr->tag = tag;
18921919
to_coalesced_frag(des)->hdr->size = des->des_local->seg_len;
18931920
if(ep->nbo)
18941921
BTL_OPENIB_HEADER_COALESCED_HTON(*to_coalesced_frag(des)->hdr);
1895-
frag = to_coalesced_frag(des)->send_frag;
18961922
} else {
18971923
frag = to_send_frag(des);
18981924
to_com_frag(des)->endpoint = ep;

opal/mca/btl/openib/btl_openib_frag.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,7 @@ typedef struct mca_btl_openib_coalesced_frag_t {
371371
mca_btl_openib_frag_t super;
372372
mca_btl_openib_send_frag_t *send_frag;
373373
mca_btl_openib_header_coalesced_t *hdr;
374+
bool sent;
374375
} mca_btl_openib_coalesced_frag_t;
375376
OBJ_CLASS_DECLARATION(mca_btl_openib_coalesced_frag_t);
376377

0 commit comments

Comments
 (0)