Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCP/WIREUP/IB: Fix error message when FLID is not available #10515

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 22 additions & 35 deletions src/ucp/wireup/select.c
Original file line number Diff line number Diff line change
Expand Up @@ -386,8 +386,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
const ucp_wireup_criteria_t *criteria, ucp_tl_bitmap_t tl_bitmap,
uint64_t remote_md_map, uint64_t local_dev_bitmap,
uint64_t remote_dev_bitmap, int show_error,
ucp_wireup_select_info_t *select_info, char *info_str,
size_t info_str_size)
ucp_wireup_select_info_t *select_info)
{
UCS_STRING_BUFFER_ONSTACK(missing_flags_str,
UCP_WIREUP_MAX_FLAGS_STRING_SIZE);
Expand All @@ -410,6 +409,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
ucp_rsc_index_t dev_index;
ucp_lane_index_t lane;
char tls_info[256];
char uct_info[256];
char *p, *endp;
uct_iface_attr_t *iface_attr;
uct_md_attr_v2_t *md_attr;
Expand Down Expand Up @@ -591,13 +591,15 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
UCS_STATIC_BITMAP_AND_INPLACE(&rsc_addr_index_map, addr_index_map);
}

/* ucp_wireup_is_reachable() can fail without filling uct_info string if
* none of the remote transports match the local one */
snprintf(uct_info, sizeof(uct_info), "not available");
is_reachable = 0;

UCS_STATIC_BITMAP_FOR_EACH_BIT(addr_index, &rsc_addr_index_map) {
ae = &address->address_list[addr_index];
if (!ucp_wireup_is_reachable(ep, select_params->ep_init_flags,
rsc_index, ae, info_str,
info_str_size)) {
rsc_index, ae, uct_info,
sizeof(uct_info))) {
/* Must be reachable device address, on same transport */
continue;
}
Expand All @@ -623,9 +625,8 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_select_transport(
/* If a local resource cannot reach any of the remote addresses,
* generate debug message. */
if (!is_reachable) {
snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT" - %s, ",
UCT_TL_RESOURCE_DESC_ARG(resource),
ucs_status_string(UCS_ERR_UNREACHABLE));
snprintf(p, endp - p, UCT_TL_RESOURCE_DESC_FMT " - %s, ",
UCT_TL_RESOURCE_DESC_ARG(resource), uct_info);
p += strlen(p);
}
}
Expand Down Expand Up @@ -920,7 +921,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_memaccess_lanes(
status = ucp_wireup_select_transport(select_ctx, select_params,
&mem_criteria, mem_type_tl_bitmap,
remote_md_map, UINT64_MAX, UINT64_MAX,
!allow_am, &select_info, NULL, 0);
!allow_am, &select_info);
if (status == UCS_OK) {
/* Add to the list of lanes */
status = ucp_wireup_add_lane(select_params, &select_info, lane_type,
Expand Down Expand Up @@ -966,8 +967,7 @@ static UCS_F_NOINLINE ucs_status_t ucp_wireup_add_memaccess_lanes(
status = ucp_wireup_select_transport(select_ctx, select_params,
&mem_criteria, tl_bitmap,
remote_md_map, UINT64_MAX,
UINT64_MAX, 0, &select_info,
NULL, 0);
UINT64_MAX, 0, &select_info);
/* Break if: */
/* - transport selection wasn't OK */
if ((status != UCS_OK) ||
Expand Down Expand Up @@ -1419,8 +1419,7 @@ ucp_wireup_is_am_required(const ucp_wireup_select_params_t *select_params,
static ucs_status_t
ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params,
ucp_wireup_select_info_t *am_info,
ucp_wireup_select_context_t *select_ctx,
char *info_string, size_t info_string_length)
ucp_wireup_select_context_t *select_ctx)
{
ucp_worker_h worker = select_params->ep->worker;
ucp_tl_bitmap_t tl_bitmap = select_params->tl_bitmap;
Expand Down Expand Up @@ -1460,8 +1459,7 @@ ucp_wireup_add_am_lane(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params,
&criteria, tl_bitmap, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 1,
am_info, info_string,
info_string_length);
am_info);
if (status != UCS_OK) {
return status;
}
Expand Down Expand Up @@ -1615,8 +1613,7 @@ ucp_wireup_add_bw_lanes(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params,
&bw_info->criteria, tl_bitmap,
UINT64_MAX, local_dev_bitmap,
remote_dev_bitmap, 0, sinfo,
NULL, 0);
remote_dev_bitmap, 0, sinfo);
if (status != UCS_OK) {
ucs_array_pop_back(&sinfo_array);
break;
Expand Down Expand Up @@ -2027,7 +2024,7 @@ ucp_wireup_add_tag_lane(const ucp_wireup_select_params_t *select_params,
status = ucp_wireup_select_transport(select_ctx, select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 0,
&select_info, NULL, 0);
&select_info);
if ((status == UCS_OK) &&
(ucp_score_cmp(select_info.score,
am_info->score) >= 0)) {
Expand Down Expand Up @@ -2173,7 +2170,7 @@ ucp_wireup_add_keepalive_lane(const ucp_wireup_select_params_t *select_params,

status = ucp_wireup_select_transport(select_ctx, select_params, &criteria,
*tl_bitmap, UINT64_MAX, UINT64_MAX,
UINT64_MAX, 0, &select_info, NULL, 0);
UINT64_MAX, 0, &select_info);
if (status == UCS_OK) {
return ucp_wireup_add_lane(select_params, &select_info,
UCP_LANE_TYPE_KEEPALIVE, /* show error */ 1,
Expand All @@ -2195,8 +2192,7 @@ ucp_wireup_select_context_init(ucp_wireup_select_context_t *select_ctx)
static UCS_F_NOINLINE ucs_status_t
ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params,
ucp_err_handling_mode_t err_mode,
ucp_wireup_select_context_t *select_ctx,
char *info_string, size_t info_string_length)
ucp_wireup_select_context_t *select_ctx)
{
ucp_wireup_select_info_t am_info;
ucs_status_t status;
Expand All @@ -2222,8 +2218,7 @@ ucp_wireup_search_lanes(const ucp_wireup_select_params_t *select_params,

/* Add AM lane only after RMA/AMO was selected to be aware
* about whether they need emulation over AM or not */
status = ucp_wireup_add_am_lane(select_params, &am_info, select_ctx,
info_string, info_string_length);
status = ucp_wireup_add_am_lane(select_params, &am_info, select_ctx);
if (status != UCS_OK) {
return status;
}
Expand Down Expand Up @@ -2482,7 +2477,6 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_tl_bitmap_t scalable_tl_bitmap = worker->scalable_tl_bitmap;
/* TODO: remove initialization after all ucp_wireup_add_X_lanes functions
will support specifying a reason */
char wireup_info[256] = {0};
ucp_wireup_select_context_t select_ctx;
ucp_wireup_select_params_t select_params;
ucs_status_t status;
Expand All @@ -2493,8 +2487,7 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_select_params_init(&select_params, ep, ep_init_flags,
remote_address, scalable_tl_bitmap, 0);
status = ucp_wireup_search_lanes(&select_params, key->err_mode,
&select_ctx, wireup_info,
sizeof(wireup_info));
&select_ctx);
if (status == UCS_OK) {
goto out;
}
Expand All @@ -2507,13 +2500,8 @@ ucp_wireup_select_lanes(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_select_params_init(&select_params, ep, ep_init_flags,
remote_address, tl_bitmap, show_error);
status = ucp_wireup_search_lanes(&select_params, key->err_mode,
&select_ctx, wireup_info,
sizeof(wireup_info));
&select_ctx);
if (status != UCS_OK) {
if (wireup_info[0] != '\0') {
ucs_diag("destination is unreachable [%s]", wireup_info);
}

return status;
}

Expand Down Expand Up @@ -2554,7 +2542,7 @@ ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags,
status = ucp_wireup_select_transport(&select_ctx, &select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 0,
select_info, NULL, 0);
select_info);
if (status == UCS_OK) {
return UCS_OK;
}
Expand All @@ -2564,6 +2552,5 @@ ucp_wireup_select_aux_transport(ucp_ep_h ep, unsigned ep_init_flags,
ucp_wireup_fill_aux_criteria(&criteria, ep_init_flags, 0);
return ucp_wireup_select_transport(&select_ctx, &select_params, &criteria,
ucp_tl_bitmap_max, UINT64_MAX,
UINT64_MAX, UINT64_MAX, 1, select_info,
NULL, 0);
UINT64_MAX, UINT64_MAX, 1, select_info);
}
45 changes: 20 additions & 25 deletions src/uct/ib/base/ib_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -841,19 +841,14 @@ static int uct_ib_iface_gid_extract_flid(const union ibv_gid *gid)
return ntohs(*((uint16_t*)UCS_PTR_BYTE_OFFSET(gid->raw, 4)));
}

static int uct_ib_iface_is_flid_enabled(uct_ib_iface_t *iface)
{
return iface->config.flid_enabled &&
(uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) != 0);
}

static int uct_ib_iface_dev_addr_is_reachable(
uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
const uct_iface_is_reachable_params_t *is_reachable_params)
{
int is_local_eth = uct_ib_iface_is_roce(iface);
uct_ib_address_pack_params_t params;
const char *flid_info_str;

uct_ib_address_unpack(ib_addr, &params);

Expand Down Expand Up @@ -883,22 +878,22 @@ static int uct_ib_iface_dev_addr_is_reachable(
}

/* Check FLID route: is enabled locally, and remote GID has it */
if (!uct_ib_iface_is_flid_enabled(iface)) {
uct_iface_fill_info_str_buf(is_reachable_params,
"FLID routing is disabled");
return 0;
}

if (uct_ib_iface_gid_extract_flid(&params.gid) == 0) {
uct_iface_fill_info_str_buf(
is_reachable_params,
"IB subnet prefix differs 0x%"PRIx64" vs 0x%"PRIx64"",
be64toh(iface->gid_info.gid.global.subnet_prefix),
be64toh(params.gid.global.subnet_prefix));
return 0;
if (!iface->config.flid_enabled) {
flid_info_str = "disabled";
} else if ((uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) == 0) ||
(uct_ib_iface_gid_extract_flid(&params.gid) == 0)) {
flid_info_str = "not available";
} else {
return 1;
}

return 1;
uct_iface_fill_info_str_buf(
is_reachable_params,
"different subnet prefix 0x%" PRIx64 "/0x%" PRIx64
" and FLID is %s",
be64toh(iface->gid_info.gid.global.subnet_prefix),
be64toh(params.gid.global.subnet_prefix), flid_info_str);
return 0;
} else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
/* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH
* flag should be on. If reachable, the remote and local RoCE versions
Expand Down Expand Up @@ -938,7 +933,6 @@ int uct_ib_iface_is_reachable_v2(const uct_iface_h tl_iface,
}

if (!uct_ib_iface_dev_addr_is_reachable(iface, device_addr, params)) {
uct_iface_fill_info_str_buf(params, "unreachable IB device address");
return 0;
}

Expand Down Expand Up @@ -1020,13 +1014,14 @@ static uint16_t uct_ib_gid_site_local_subnet_prefix(const union ibv_gid *gid)
uint16_t uct_ib_iface_resolve_remote_flid(uct_ib_iface_t *iface,
const union ibv_gid *gid)
{
if (!uct_ib_iface_is_flid_enabled(iface)) {
if (uct_ib_gid_site_local_subnet_prefix(gid) ==
uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid)) {
/* On the same subnet, no need to use FLID */
return 0;
}

if (uct_ib_gid_site_local_subnet_prefix(gid) ==
uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid)) {
/* On the same subnet, no need to use FLID*/
if (!iface->config.flid_enabled ||
(uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) == 0)) {
return 0;
}

Expand Down
21 changes: 16 additions & 5 deletions src/uct/ib/mlx5/rc/rc_mlx5_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,17 @@ static uint8_t uct_rc_mlx5_iface_get_address_type(uct_iface_h tl_iface)
UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC;
}

static const char *uct_rc_mlx5_iface_tm_type_str(uint8_t tm_type)
{
if (tm_type == UCT_RC_MLX5_IFACE_ADDR_TYPE_BASIC) {
return "basic";
} else if (tm_type == UCT_RC_MLX5_IFACE_ADDR_TYPE_TM) {
return "hw offload";
} else {
return "unknown";
}
}

static ucs_status_t uct_rc_mlx5_iface_get_address(uct_iface_h tl_iface,
uct_iface_addr_t *addr)
{
Expand All @@ -657,7 +668,6 @@ static int
uct_rc_mlx5_iface_is_reachable_v2(const uct_iface_h tl_iface,
const uct_iface_is_reachable_params_t *params)
{
static const char *tm_type_to_str[] = {"basic", "tag matching"};
uint8_t my_type = uct_rc_mlx5_iface_get_address_type(tl_iface);
uint8_t remote_type;
const uct_iface_addr_t *iface_addr;
Expand All @@ -668,10 +678,11 @@ uct_rc_mlx5_iface_is_reachable_v2(const uct_iface_h tl_iface,
/* Check hardware tag matching compatibility */
if ((iface_addr != NULL) &&
((remote_type = *(uint8_t*)iface_addr) != my_type)) {
uct_iface_fill_info_str_buf(
params, "incompatible hardware tag matching. "
"%s (local) vs %s (remote)",
tm_type_to_str[my_type], tm_type_to_str[remote_type]);
uct_iface_fill_info_str_buf(params,
"incompatible hardware tag matching. "
"%s (local) vs %s (remote)",
uct_rc_mlx5_iface_tm_type_str(my_type),
uct_rc_mlx5_iface_tm_type_str(remote_type));
return 0;
}

Expand Down
1 change: 1 addition & 0 deletions test/gtest/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ gtest_SOURCES = \
uct/test_pending.cc \
uct/test_progress.cc \
uct/test_uct_ep.cc \
uct/test_uct_iface.cc \
uct/test_uct_perf.cc \
uct/v2/test_uct_query.cc \
uct/test_zcopy_comp.cc \
Expand Down
Loading
Loading