Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCP/WIREUP/IB: Fix error message when FLID is not available #10515

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 22 additions & 35 deletions src/ucp/wireup/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
const ucp_wireup_criteria_t *criteria, ucp_tl_bitmap_t tl_bitmap,
uint64_t remote_md_map, uint64_t local_dev_bitmap,
uint64_t remote_dev_bitmap, int show_error,
ucp_wireup_select_info_t *select_info, char *info_str,
size_t info_str_size)
ucp_wireup_select_info_t *select_info)
{
UCS_STRING_BUFFER_ONSTACK(missing_flags_str,
UCP_WIREUP_MAX_FLAGS_STRING_SIZE);
Expand All @@ -410,6 +409,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
ucp_rsc_index_t dev_index;
ucp_lane_index_t lane;
char tls_info[256];
char uct_info[256];
char *p, *endp;
uct_iface_attr_t *iface_attr;
uct_md_attr_v2_t *md_attr;
Expand Down Expand Up @@ -591,13 +591,15 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
UCS_STATIC_BITMAP_AND_INPLACE(&rsc_addr_index_map, addr_index_map);
}

/* ucp_wireup_is_reachable() can fail without filling uct_info string if
* none of the remote transports match the local one */
snprintf(uct_info, sizeof(uct_info), "not available");
is_reachable = 0;

UCS_STATIC_BITMAP_FOR_EACH_BIT(addr_index, &rsc_addr_index_map) {
ae = &address->address_list[addr_index];
if (!ucp_wireup_is_reachable(ep, select_params->ep_init_flags,
rsc_index, ae, info_str,
info_str_size)) {
rsc_index, ae, uct_info,
sizeof(uct_info))) {
/* Must be reachable device address, on same transport */
continue;
}
Expand All @@ -623,9 +625,8 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
/* If a local resource cannot reach any of the remote addresses,
* generate debug message. */
if (!is_reachable) {
snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT" - %s, ",
UCT_TL_RESOURCE_DESC_ARG(resource),
ucs_status_string(UCS_ERR_UNREACHABLE));
snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT " - %s, ",
UCT_TL_RESOURCE_DESC_ARG(resource), uct_info);
p += strlen(p);
}
}
Expand Down Expand Up @@ -920,7 +921,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_memaccess_lanes(
status = ucp_wireup_select_transport(select_ctx, select_params,
&mem_criteria, mem_type_tl_bitmap,
remote_md_map, UINT64_MAX, UINT64_MAX,
!allow_am, &select_info, NULL, 0);
!allow_am, &select_info);
if (status == UCS_OK) {
/* Add to the list of lanes */
status = ucp_wireup_add_lane(select_params, &select_info, lane_type,
Expand Down Expand Up @@ -966,8 +967,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_memaccess_lanes(
status = ucp_wireup_select_transport(select_ctx, select_params,
&mem_criteria, tl_bitmap,
remote_md_map, UINT64_MAX,
UINT64_MAX, 0, &select_info,
NULL, 0);
UINT64_MAX, 0, &select_info);
/* Break if: */
/* - transport selection wasn't OK */
if ((status != UCS_OK) ||
Expand Down Expand Up @@ -1419,8 +1419,7 @@ ucp_wireup_is_am_required(const ucp_wireup_select_params_t *select_params,
static ucs_status_t
ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params,
ucp_wireup_select_info_t *am_info,
ucp_wireup_select_context_t *select_ctx,
char *info_string, size_t info_string_length)
ucp_wireup_select_context_t *select_ctx)
{
ucp_worker_h worker = select_params->ep->worker;
ucp_tl_bitmap_t tl_bitmap = select_params->tl_bitmap;
Expand Down Expand Up @@ -1460,8 +1459,7 @@ ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params,
&criteria, tl_bitmap, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 1,
am_info, info_string,
info_string_length);
am_info);
if (status != UCS_OK) {
return status;
}
Expand Down Expand Up @@ -1615,8 +1613,7 @@ ucp_wireup_add_bw_lanes(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params,
&bw_info->criteria, tl_bitmap,
UINT64_MAX, local_dev_bitmap,
remote_dev_bitmap, 0, sinfo,
NULL, 0);
remote_dev_bitmap, 0, sinfo);
if (status != UCS_OK) {
ucs_array_pop_back(&sinfo_array);
break;
Expand Down Expand Up @@ -2027,7 +2024,7 @@ ucp_wireup_add_tag_lane(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 0,
&select_info, NULL, 0);
&select_info);
if ((status == UCS_OK) &&
(ucp_score_cmp(select_info.score,
am_info->score) >= 0)) {
Expand Down Expand Up @@ -2173,7 +2170,7 @@ ucp_wireup_add_keepalive_lane(const ucp_wireup_select_params_t *select_params,

status = ucp_wireup_select_transport(select_ctx, select_params, &criteria,
*tl_bitmap, UINT64_MAX, UINT64_MAX,
UINT64_MAX, 0, &select_info, NULL, 0);
UINT64_MAX, 0, &select_info);
if (status == UCS_OK) {
return ucp_wireup_add_lane(select_params, &select_info,
UCP_LANE_TYPE_KEEPALIVE, /* show error */ 1,
Expand All @@ -2195,8 +2192,7 @@ ucp_wireup_select_context_init(ucp_wireup_select_context_t *select_ctx)
static UCS_F_NOINLINE ucs_status_t
ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params,
ucp_err_handling_mode_t err_mode,
ucp_wireup_select_context_t *select_ctx,
char *info_string, size_t info_string_length)
ucp_wireup_select_context_t *select_ctx)
{
ucp_wireup_select_info_t am_info;
ucs_status_t status;
Expand All @@ -2222,8 +2218,7 @@ ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params,

/* Add AM lane only after RMA/AMO was selected to be aware
* about whether they need emulation over AM or not */
status = ucp_wireup_add_am_lane(select_params, &am_info, select_ctx,
info_string, info_string_length);
status = ucp_wireup_add_am_lane(select_params, &am_info, select_ctx);
if (status != UCS_OK) {
return status;
}
Expand Down Expand Up @@ -2482,7 +2477,6 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_tl_bitmap_t scalable_tl_bitmap = worker->scalable_tl_bitmap;
/* TODO: remove initialization after all ucp_wireup_add_X_lanes functions
will support specifying a reason */
char wireup_info[256] = {0};
ucp_wireup_select_context_t select_ctx;
ucp_wireup_select_params_t select_params;
ucs_status_t status;
Expand All @@ -2493,8 +2487,7 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_select_params_init(&select_params, ep, ep_init_flags,
remote_address, scalable_tl_bitmap, 0);
status = ucp_wireup_search_lanes(&select_params, key->err_mode,
&select_ctx, wireup_info,
sizeof(wireup_info));
&select_ctx);
if (status == UCS_OK) {
goto out;
}
Expand All @@ -2507,13 +2500,8 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_select_params_init(&select_params, ep, ep_init_flags,
remote_address, tl_bitmap, show_error);
status = ucp_wireup_search_lanes(&select_params, key->err_mode,
&select_ctx, wireup_info,
sizeof(wireup_info));
&select_ctx);
if (status != UCS_OK) {
if (wireup_info[0] != '\0') {
ucs_diag("destination is unreachable [%s]", wireup_info);
}

return status;
}

Expand Down Expand Up @@ -2554,7 +2542,7 @@ ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags,
status = ucp_wireup_select_transport(&select_ctx, &select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 0,
select_info, NULL, 0);
select_info);
if (status == UCS_OK) {
return UCS_OK;
}
Expand All @@ -2564,6 +2552,5 @@ ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_fill_aux_criteria(&criteria, ep_init_flags, 0);
return ucp_wireup_select_transport(&select_ctx, &select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 1, select_info,
NULL, 0);
UINT64_MAX, UINT64_MAX, 1, select_info);
}
97 changes: 56 additions & 41 deletions src/uct/ib/base/ib_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -516,8 +516,8 @@ void uct_ib_iface_address_pack(uct_ib_iface_t *iface, uct_ib_address_t *ib_addr)
uct_ib_address_pack(&params, ib_addr);
}

void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
uct_ib_address_pack_params_t *params_p)
ucs_status_t uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
uct_ib_address_pack_params_t *params_p)
{
const void *ptr = ib_addr + 1;
/* silence cppcheck warning */
Expand Down Expand Up @@ -554,14 +554,15 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
params.flags |= UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID;
}

if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) {
if (ucs_test_all_flags(ib_addr->flags,
UCT_IB_ADDRESS_FLAG_SUBNET16 |
UCT_IB_ADDRESS_FLAG_SUBNET64)) {
return UCS_ERR_INVALID_PARAM;
} else if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET16) {
site_local_subnet = *ucs_serialize_next(&ptr, const uint16_t);
params.gid.global.subnet_prefix = UCT_IB_SITE_LOCAL_PREFIX |
(site_local_subnet << 48);
ucs_assert(!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64));
}

if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) {
} else if (ib_addr->flags & UCT_IB_ADDRESS_FLAG_SUBNET64) {
params.gid.global.subnet_prefix =
*ucs_serialize_next(&ptr, const uint64_t);
params.flags |= UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX;
Expand All @@ -586,15 +587,20 @@ void uct_ib_address_unpack(const uct_ib_address_t *ib_addr,
params.flags |= UCT_IB_ADDRESS_PACK_FLAG_PKEY;

*params_p = params;
return UCS_OK;
}

const char *uct_ib_address_str(const uct_ib_address_t *ib_addr, char *buf,
size_t max)
{
uct_ib_address_pack_params_t params;
ucs_status_t status;
char *p, *endp;

uct_ib_address_unpack(ib_addr, &params);
status = uct_ib_address_unpack(ib_addr, &params);
if (status != UCS_OK) {
return "<invalid address>";
}

p = buf;
endp = buf + max;
Expand Down Expand Up @@ -810,8 +816,12 @@ int uct_ib_iface_is_same_device(const uct_ib_address_t *ib_addr, uint16_t dlid,
const union ibv_gid *dgid)
{
uct_ib_address_pack_params_t params;
ucs_status_t status;

uct_ib_address_unpack(ib_addr, &params);
status = uct_ib_address_unpack(ib_addr, &params);
if (status != UCS_OK) {
return 0;
}

if (!(params.flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) &&
(dlid != params.lid)) {
Expand Down Expand Up @@ -841,21 +851,20 @@ static int uct_ib_iface_gid_extract_flid(const union ibv_gid *gid)
return ntohs(*((uint16_t*)UCS_PTR_BYTE_OFFSET(gid->raw, 4)));
}

static int uct_ib_iface_is_flid_enabled(uct_ib_iface_t *iface)
{
return iface->config.flid_enabled &&
(uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) != 0);
}

static int uct_ib_iface_dev_addr_is_reachable(
uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
const uct_iface_is_reachable_params_t *is_reachable_params)
{
int is_local_eth = uct_ib_iface_is_roce(iface);
uct_ib_address_pack_params_t params;
const char *flid_info_str;
ucs_status_t status;

uct_ib_address_unpack(ib_addr, &params);
status = uct_ib_address_unpack(ib_addr, &params);
if (status != UCS_OK) {
return 0;
}

/* at least one PKEY has to be with full membership */
if (!((params.pkey | iface->pkey) & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
Expand Down Expand Up @@ -883,22 +892,22 @@ static int uct_ib_iface_dev_addr_is_reachable(
}

/* Check FLID route: is enabled locally, and remote GID has it */
if (!uct_ib_iface_is_flid_enabled(iface)) {
uct_iface_fill_info_str_buf(is_reachable_params,
"FLID routing is disabled");
return 0;
}

if (uct_ib_iface_gid_extract_flid(&params.gid) == 0) {
uct_iface_fill_info_str_buf(
is_reachable_params,
"IB subnet prefix differs 0x%"PRIx64" vs 0x%"PRIx64"",
be64toh(iface->gid_info.gid.global.subnet_prefix),
be64toh(params.gid.global.subnet_prefix));
return 0;
if (!iface->config.flid_enabled) {
flid_info_str = "disabled";
} else if ((uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) == 0) ||
(uct_ib_iface_gid_extract_flid(&params.gid) == 0)) {
flid_info_str = "not available";
} else {
return 1;
}

return 1;
uct_iface_fill_info_str_buf(
is_reachable_params,
"different subnet prefix 0x%" PRIx64 "/0x%" PRIx64
" and FLID is %s",
be64toh(iface->gid_info.gid.global.subnet_prefix),
be64toh(params.gid.global.subnet_prefix), flid_info_str);
return 0;
} else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
/* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH
* flag should be on. If reachable, the remote and local RoCE versions
Expand Down Expand Up @@ -938,7 +947,6 @@ int uct_ib_iface_is_reachable_v2(const uct_iface_h tl_iface,
}

if (!uct_ib_iface_dev_addr_is_reachable(iface, device_addr, params)) {
uct_iface_fill_info_str_buf(params, "unreachable IB device address");
return 0;
}

Expand Down Expand Up @@ -1020,33 +1028,39 @@ static uint16_t uct_ib_gid_site_local_subnet_prefix(const union ibv_gid *gid)
uint16_t uct_ib_iface_resolve_remote_flid(uct_ib_iface_t *iface,
const union ibv_gid *gid)
{
if (!uct_ib_iface_is_flid_enabled(iface)) {
if (uct_ib_gid_site_local_subnet_prefix(gid) ==
uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid)) {
/* On the same subnet, no need to use FLID */
return 0;
}

if (uct_ib_gid_site_local_subnet_prefix(gid) ==
uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid)) {
/* On the same subnet, no need to use FLID*/
if (!iface->config.flid_enabled ||
(uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) == 0)) {
return 0;
}

return uct_ib_iface_gid_extract_flid(gid);
}

void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
unsigned path_index,
struct ibv_ah_attr *ah_attr,
enum ibv_mtu *path_mtu)
ucs_status_t
uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
unsigned path_index,
struct ibv_ah_attr *ah_attr,
enum ibv_mtu *path_mtu)
{
union ibv_gid *gid = NULL;
uint16_t lid, flid = 0;
uct_ib_address_pack_params_t params;
ucs_status_t status;

ucs_assert(!uct_ib_iface_is_roce(iface) ==
!(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH));

uct_ib_address_unpack(ib_addr, &params);
status = uct_ib_address_unpack(ib_addr, &params);
if (status != UCS_OK) {
return status;
}

if (params.flags & UCT_IB_ADDRESS_PACK_FLAG_PATH_MTU) {
ucs_assert(params.path_mtu != UCT_IB_ADDRESS_INVALID_PATH_MTU);
Expand All @@ -1072,6 +1086,7 @@ void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
lid = (flid == 0) ? params.lid : flid;
uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, gid, params.gid_index,
path_index, ah_attr);
return UCS_OK;
}

static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
Expand Down
Loading
Loading