Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/IB: Skip SMI devices using netlink query #10507

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion buildlib/tools/check_tls_perf_caps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,14 @@ for tl_name in $(grep Transport ${legacy_info_file} | awk '{print $3}')
do
old_tl_caps=$(grep -A 8 "Transport: $tl_name" ${legacy_info_file} || true)
new_tl_caps=$(grep -A 8 "Transport: $tl_name" ${pr_info_file} || true)
for device in $(echo "${old_tl_caps}" | grep Device | awk '{print $3}')
for device in $(echo "${old_tl_caps}" | grep Device | awk '{print $3}')
do
# Ignore SMI devices
if [[ "$device" =~ ^smi[0-9]*:.$ ]]
then
continue;
fi

old_caps=$(echo "$old_tl_caps" | grep -A 7 "Device: $device" || true)
new_caps=$(echo "$new_tl_caps" | grep -A 7 "Device: $device" || true)
for cap in bandwidth latency overhead
Expand Down
1 change: 1 addition & 0 deletions buildlib/tools/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ get_ib_devices() {
set +x
for ibdev in $device_list
do
# Skip SMI devices. The loop variable is $ibdev; it is passed to
# "ibv_devinfo -d" below as a bare device name, so no ":<port>" suffix is
# expected in the pattern.
[[ "$ibdev" =~ ^smi[0-9]*$ ]] && continue
num_ports=$(ibv_devinfo -d $ibdev| awk '/phys_port_cnt:/ {print $2}')
for port in $(seq 1 $num_ports)
do
Expand Down
37 changes: 11 additions & 26 deletions src/ucs/sys/netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#include <ucs/debug/memtrack_int.h>

#include <errno.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <unistd.h>
Expand All @@ -30,19 +29,6 @@ typedef struct {
} ucs_netlink_route_info_t;


/**
* Callback function for parsing individual netlink messages.
*
* @param [in] nlh Pointer to the netlink message header.
* @param [in] nl_msg Pointer to the netlink message payload.
* @param [in] arg User-provided argument passed through from the caller.
*
* @return UCS_OK if parsing is complete, UCS_INPROGRESS if there are more
* messages to be parsed, or error code otherwise.
*/
typedef ucs_status_t (*ucs_netlink_parse_cb_t)(const struct nlmsghdr *nlh,
const void *nl_msg, void *arg);

static ucs_status_t ucs_netlink_socket_init(int *fd_p, int protocol)
{
struct sockaddr_nl sa = {.nl_family = AF_NETLINK};
Expand Down Expand Up @@ -85,15 +71,16 @@ ucs_netlink_parse_msg(const void *msg, size_t msg_len,
return UCS_ERR_IO_ERROR;
}

status = parse_cb(nlh, NLMSG_DATA(nlh), arg);
status = parse_cb(nlh, arg);
nlh = NLMSG_NEXT(nlh, msg_len);
}

return UCS_OK;
}

static ucs_status_t
ucs_status_t
ucs_netlink_send_request(int protocol, unsigned short nlmsg_type,
unsigned short nlmsg_flags,
const void *protocol_header, size_t header_length,
ucs_netlink_parse_cb_t parse_cb, void *arg)
{
Expand All @@ -112,7 +99,7 @@ ucs_netlink_send_request(int protocol, unsigned short nlmsg_type,

nlh.nlmsg_len = NLMSG_LENGTH(header_length);
nlh.nlmsg_type = nlmsg_type;
nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
nlh.nlmsg_flags = NLM_F_REQUEST | nlmsg_flags;
iov[0].iov_base = &nlh;
iov[0].iov_len = sizeof(nlh);
iov[1].iov_base = (void *)protocol_header;
Expand Down Expand Up @@ -183,16 +170,15 @@ ucs_netlink_get_route_info(const struct rtattr *rta, int len, int *if_index_p,
}

static ucs_status_t
ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, const void *nl_msg,
void *arg)
ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, void *arg)
{
ucs_netlink_route_info_t *info = (ucs_netlink_route_info_t *)arg;
const struct rtmsg *rt_msg = NLMSG_DATA(nlh);
int rule_iface;
const void *dst_in_addr;

if (ucs_netlink_get_route_info(RTM_RTA((const struct rtmsg *)nl_msg),
RTM_PAYLOAD(nlh), &rule_iface,
&dst_in_addr) != UCS_OK) {
if (ucs_netlink_get_route_info(RTM_RTA(rt_msg), RTM_PAYLOAD(nlh),
&rule_iface, &dst_in_addr) != UCS_OK) {
return UCS_INPROGRESS;
}

Expand All @@ -201,8 +187,7 @@ ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, const void *nl_msg,
}

if (ucs_bitwise_is_equal(ucs_sockaddr_get_inet_addr(info->sa_remote),
dst_in_addr,
((const struct rtmsg *)nl_msg)->rtm_dst_len)) {
dst_in_addr, rt_msg->rtm_dst_len)) {
info->found = 1;
return UCS_OK;
}
Expand All @@ -229,8 +214,8 @@ int ucs_netlink_route_exists(const char *if_name,
info.if_index = iface_index;
info.sa_remote = sa_remote;

ucs_netlink_send_request(NETLINK_ROUTE, RTM_GETROUTE, &rtm, sizeof(rtm),
ucs_netlink_parse_rt_entry_cb, &info);
ucs_netlink_send_request(NETLINK_ROUTE, RTM_GETROUTE, NLM_F_DUMP, &rtm,
sizeof(rtm), ucs_netlink_parse_rt_entry_cb, &info);

out:
return info.found;
Expand Down
30 changes: 30 additions & 0 deletions src/ucs/sys/netlink.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,40 @@

#include <ucs/type/status.h>

#include <linux/netlink.h>
#include <netinet/in.h>

BEGIN_C_DECLS

/**
 * Callback function for parsing individual netlink messages.
 *
 * The callback receives only the message header; the message payload can be
 * obtained from it with NLMSG_DATA(nlh).
 *
 * @param [in]  nlh  Pointer to the netlink message header.
 * @param [in]  arg  User-provided argument passed through from the caller.
 *
 * @return UCS_OK if parsing is complete, UCS_INPROGRESS if there are more
 *         messages to be parsed, or error code otherwise.
 */
typedef ucs_status_t (*ucs_netlink_parse_cb_t)(const struct nlmsghdr *nlh,
void *arg);

/**
 * Send a netlink request and parse the response.
 *
 * @param [in]  protocol         Netlink protocol (e.g. NETLINK_ROUTE).
 * @param [in]  nlmsg_type       Protocol message type (e.g. RTM_GETROUTE).
 * @param [in]  nlmsg_flags      Flags for message header (e.g. NLM_F_DUMP).
 *                               NLM_F_REQUEST is always added by the
 *                               implementation and need not be passed.
 * @param [in]  protocol_header  Netlink protocol header.
 * @param [in]  header_length    Netlink protocol header length.
 * @param [in]  parse_cb         Callback function to parse the response.
 * @param [in]  arg              User-provided argument for the parse callback.
 *
 * @return UCS_OK on success, or error code otherwise.
 *         NOTE(review): the exact error codes are not visible from this
 *         header - confirm against the implementation.
 */
ucs_status_t
ucs_netlink_send_request(int protocol, unsigned short nlmsg_type,
unsigned short nlmsg_flags,
const void *protocol_header, size_t header_length,
ucs_netlink_parse_cb_t parse_cb, void *arg);


/**
* Check whether a routing table rule exists for a given network
Expand Down
58 changes: 58 additions & 0 deletions src/uct/ib/base/ib_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,17 @@
#include <ucs/async/async.h>
#include <ucs/sys/compiler.h>
#include <ucs/sys/string.h>
#include <ucs/sys/netlink.h>
#include <ucs/sys/sock.h>
#include <ucs/sys/sys.h>
#include <sys/poll.h>
#include <libgen.h>
#include <sched.h>

/* AC_CHECK_DECLS always defines HAVE_DECL_<symbol> (to 1 or 0), so the value
 * must be tested with #if; #ifdef would be true even when the declaration is
 * missing. */
#if HAVE_DECL_RDMA_NL_NLDEV
#include <rdma/rdma_netlink.h>
#endif


/* This table is according to "Encoding for RNR NAK Timer Field"
* in IBTA specification */
Expand Down Expand Up @@ -1518,3 +1523,56 @@ const char* uct_ib_ah_attr_str(char *buf, size_t max,
return buf;
}

/* AC_CHECK_DECLS always defines HAVE_DECL_<symbol> (to 1 or 0), so the value
 * must be tested with #if; #ifdef would select the netlink implementation even
 * when RDMA_NL_NLDEV is not declared. */
#if HAVE_DECL_RDMA_NL_NLDEV
/**
 * Netlink parse callback: walk the attributes of an RDMA nldev reply and
 * report whether the device type attribute marks the device as SMI.
 *
 * @param [in]  nlh  Netlink message header; attributes are read starting at
 *                   NLMSG_DATA(nlh) and up to nlh->nlmsg_len.
 * @param [out] arg  Pointer to an int, set to 1 if an SMI device type is
 *                   found. It is left untouched otherwise.
 *
 * @return UCS_OK if an SMI device type attribute was found, UCS_INPROGRESS
 *         otherwise (including when a malformed attribute stops the scan).
 */
static ucs_status_t
uct_ib_device_is_smi_cb(const struct nlmsghdr *nlh, void *arg)
{
    const size_t hdr_len      = NLA_HDRLEN;
    int *is_smi_p             = (int*)arg;
    const struct nlattr *attr = NLMSG_DATA(nlh);
    size_t offset, remaining;
    uint8_t dev_type;

    for (;;) {
        offset = UCS_PTR_BYTE_DIFF(nlh, attr);
        if (offset >= nlh->nlmsg_len) {
            break;
        }

        /* Stop on a truncated or malformed attribute: a length smaller than
         * the attribute header would never advance the cursor, and a length
         * larger than the remaining bytes would read past the message. */
        remaining = nlh->nlmsg_len - offset;
        if ((remaining < hdr_len) || (attr->nla_len < hdr_len) ||
            (attr->nla_len > remaining)) {
            break;
        }

        /* Read the payload only if the attribute is large enough to hold it */
        if ((attr->nla_type == RDMA_NLDEV_ATTR_DEV_TYPE /* 99 */) &&
            (attr->nla_len >= (hdr_len + sizeof(dev_type)))) {
            dev_type = *(const uint8_t*)UCS_PTR_BYTE_OFFSET(attr, NLA_HDRLEN);
            if (dev_type == RDMA_DEVICE_TYPE_SMI /* 1 */) {
                *is_smi_p = 1;
                return UCS_OK;
            }
        }

        attr = UCS_PTR_BYTE_OFFSET(attr, NLA_ALIGN(attr->nla_len));
    }

    return UCS_INPROGRESS;
}

/**
 * Check whether an IB device is an SMI device, by sending an RDMA netlink
 * RDMA_NLDEV_CMD_GET request for the device index.
 *
 * @param [in]  ibv_device  Device to check.
 *
 * @return 1 if the device reported an SMI device type, 0 otherwise. 0 is also
 *         returned when the device index is unavailable or the netlink
 *         request fails.
 */
int uct_ib_device_is_smi(struct ibv_device *ibv_device)
{
    struct nlattr *attr;
    uint32_t *dev_index_attr;
    size_t header_length;
    ucs_status_t status;
    int dev_index;
    int is_smi;

    /* A negative index means there is no device index to query by; do not
     * send it to the kernel reinterpreted as a huge unsigned value. */
    dev_index = ibv_get_device_index(ibv_device);
    if (dev_index < 0) {
        return 0;
    }

    /* Build a single RDMA_NLDEV_ATTR_DEV_INDEX attribute: header + uint32_t */
    header_length   = NLA_HDRLEN + sizeof(*dev_index_attr);
    attr            = ucs_alloca(header_length);
    dev_index_attr  = (uint32_t*)UCS_PTR_BYTE_OFFSET(attr, NLA_HDRLEN);
    attr->nla_type  = RDMA_NLDEV_ATTR_DEV_INDEX;
    attr->nla_len   = header_length;
    *dev_index_attr = dev_index;

    is_smi = 0;
    status = ucs_netlink_send_request(
            NETLINK_RDMA, RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_GET),
            0, attr, header_length, uct_ib_device_is_smi_cb, &is_smi);
    if (status != UCS_OK) {
        return 0;
    }

    return is_smi;
}
#else
/**
 * Fallback used when RDMA_NL_NLDEV is not declared at build time: the device
 * type cannot be queried, so no device is reported as SMI.
 *
 * @param [in]  ibv_device  Device to check (unused).
 *
 * @return Always 0.
 */
int uct_ib_device_is_smi(struct ibv_device *ibv_device)
{
    return 0;
}
#endif
2 changes: 2 additions & 0 deletions src/uct/ib/base/ib_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,4 +502,6 @@ static inline void uct_ib_destroy_cq(struct ibv_cq *cq, const char *desc)

void uct_ib_handle_async_event(uct_ib_device_t *dev, uct_ib_async_event_t *event);

int uct_ib_device_is_smi(struct ibv_device *ibv_device);

#endif
26 changes: 15 additions & 11 deletions src/uct/ib/base/ib_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -787,17 +787,21 @@ static const char *uct_ib_device_transport_type_name(struct ibv_device *device)
/**
 * Check whether a device can be used by the IB transports.
 *
 * A device is supported if its transport type is IB or, when the verbs
 * headers declare it, IBV_TRANSPORT_UNSPECIFIED, and it is not an SMI device.
 *
 * @param [in]  device  Device to check.
 *
 * @return 1 if the device is supported, 0 otherwise.
 */
static int uct_ib_device_is_supported(struct ibv_device *device)
{
    /* TODO: enable additional transport types when ready */
    /* Reject only devices which are neither "unspecified" nor IB. The
     * negation is required: without it, unspecified devices would be rejected
     * and every other non-IB transport type would be accepted. */
    if (!IBV_DEVICE_TRANSPORT_UNSPECIFIED(device) &&
        (device->transport_type != IBV_TRANSPORT_IB)) {
        ucs_debug("%s: unsupported transport type %s",
                  ibv_get_device_name(device),
                  uct_ib_device_transport_type_name(device));
        return 0;
    }

    if (uct_ib_device_is_smi(device)) {
        ucs_debug("%s: smi device is not supported",
                  ibv_get_device_name(device));
        return 0;
    }

    return 1;
}

int uct_ib_device_is_accessible(struct ibv_device *device)
Expand Down Expand Up @@ -1315,7 +1319,7 @@ ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,

/* Check for GPU-direct support */
if (md_config->enable_gpudirect_rdma != UCS_NO) {
/* Check peer memory driver is loaded, different driver versions use
/* Check peer memory driver is loaded, different driver versions use
* different paths */
uct_ib_check_gpudirect_driver(
md, "/sys/kernel/mm/memory_peers/nv_mem/version",
Expand All @@ -1326,7 +1330,7 @@ ucs_status_t uct_ib_md_open_common(uct_ib_md_t *md,
uct_ib_check_gpudirect_driver(
md, "/sys/module/nv_peer_mem/version",
UCS_MEMORY_TYPE_CUDA);


/* check if ROCM KFD driver is loaded */
uct_ib_check_gpudirect_driver(md, "/dev/kfd", UCS_MEMORY_TYPE_ROCM);
Expand Down
6 changes: 6 additions & 0 deletions src/uct/ib/base/ib_verbs.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,12 @@ static inline ucs_status_t uct_ib_query_device(struct ibv_context *ctx,
# define uct_ib_grh_required(_attr) 0
#endif

/* Evaluates to nonzero if the device transport type is
 * IBV_TRANSPORT_UNSPECIFIED. When the verbs headers do not declare that
 * transport type, it always evaluates to 0. */
#if HAVE_DECL_IBV_TRANSPORT_UNSPECIFIED
# define IBV_DEVICE_TRANSPORT_UNSPECIFIED(_dev) ((_dev)->transport_type == IBV_TRANSPORT_UNSPECIFIED)
#else
# define IBV_DEVICE_TRANSPORT_UNSPECIFIED(_dev) 0
#endif

/* Dummy structure declaration, when not present in verbs.h */
#if !HAVE_IBV_DM
struct ibv_dm;
Expand Down
17 changes: 16 additions & 1 deletion src/uct/ib/configure.m4
Original file line number Diff line number Diff line change
Expand Up @@ -282,13 +282,28 @@ AS_IF([test "x$with_ib" = "xyes"],
AC_CHECK_DECLS([ibv_alloc_dm],
[AC_DEFINE([HAVE_IBV_DM], 1, [Device Memory support])],
[], [[#include <infiniband/verbs.h>]])])

# DDP support
AS_IF([test "x$have_mlx5" = xyes], [
AC_CHECK_DECLS([MLX5DV_CONTEXT_MASK_OOO_RECV_WRS],
[AC_DEFINE([HAVE_OOO_RECV_WRS], 1, [Have DDP support])],
[], [[#include <infiniband/mlx5dv.h>]])])

# RDMA netlink support
AC_CHECK_DECLS([RDMA_NL_NLDEV],
               [
                # Define replacement constants if not present in header files
                # Each check below passes the rdma_netlink.h include list as
                # its own argument so the lookup uses that header
                AC_CHECK_DECL([RDMA_NLDEV_ATTR_DEV_TYPE], [],
                              [AC_DEFINE([RDMA_NLDEV_ATTR_DEV_TYPE], 99,
                                         [RDMA netlink device type attribute])],
                              [[#include <rdma/rdma_netlink.h>]])
                AC_CHECK_DECL([RDMA_DEVICE_TYPE_SMI], [],
                              [AC_DEFINE([RDMA_DEVICE_TYPE_SMI], 1,
                                         [RDMA netlink SMI device type])],
                              [[#include <rdma/rdma_netlink.h>]])
               ],
               [], [[#include <rdma/rdma_netlink.h>]])

mlnx_valg_libdir=$with_verbs/lib${libsuff}/mlnx_ofed/valgrind
AC_MSG_NOTICE([Checking OFED valgrind libs $mlnx_valg_libdir])

Expand Down
2 changes: 1 addition & 1 deletion test/apps/test_ucx_tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ def test_tls_allow_list(ucx_info):
else:
rc_max_num_eps = 0

status, output = exec_cmd("ibv_devinfo -l | tail -n +2 | sed -e 's/^[ \t]*//' | head -n -1 ")
status, output = exec_cmd("ibv_devinfo -l | tail -n+2 | head -n-1 | sed -e 's/^[ \t]*//' | grep -v '^smi[0-9]*$'")
dev_list = output.splitlines()
port = "1"

Expand Down
5 changes: 5 additions & 0 deletions test/gtest/common/test_helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,11 @@ static std::map<std::string, std::string> get_all_rdmacm_net_devices()

if (!ib_devices.empty()) {
std::string ib_device = ib_devices.front();
if (ib_device.compare(0, 3, "smi") == 0) {
/* Skip SMI device */
continue;
}

std::string ports_dir = infiniband_dir + "/" + ib_device +
"/ports";
std::string ib_port = read_dir(ports_dir).front();
Expand Down
Loading