prov/efa: Refactor dmabuf reg
Introduce a boolean dmabuf_supported in efa_hmem_info that records whether dmabuf is supported for each hmem iface.

When dmabuf is supported, retrieve the dmabuf fd and use ibv_reg_dmabuf_mr to register memory. Otherwise, fall back to ibv_reg_mr.
Always use ibv_reg_dmabuf_mr when FI_MR_DMABUF is set.

Remove the macro blocks in efa_mr_reg_ibv_mr and consolidate the duplicated logic shared by the different hmem ifaces.

Signed-off-by: Jessie Yang <[email protected]>
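
For illustration, the per-iface init paths in this change share one probe-and-fallback shape: ask ofi_hmem_get_dmabuf_fd for a dmabuf fd, attempt dmabuf registration, record the result in dmabuf_supported, and otherwise fall back to ibv_reg_mr. A minimal sketch of that shape (the helper name probe_dmabuf_reg and its exact signature are illustrative, not part of the commit; logging is trimmed):

static struct ibv_mr *probe_dmabuf_reg(struct efa_domain *efa_domain,
				       enum fi_hmem_iface iface, void *ptr,
				       size_t len, int access,
				       bool *dmabuf_supported)
{
	struct ibv_mr *ibv_mr = NULL;
	uint64_t offset;
	int fd;

	*dmabuf_supported = false;
	/* Probe: retrieve a dmabuf fd for this iface, then try dmabuf registration. */
	if (ofi_hmem_get_dmabuf_fd(iface, ptr, len, &fd, &offset) == FI_SUCCESS) {
		ibv_mr = efa_mr_reg_ibv_dmabuf_mr(efa_domain->ibv_pd, offset, len,
						  (uint64_t) ptr, fd, access);
		if (ibv_mr)
			*dmabuf_supported = true;
	}
	/* Fall back when no dmabuf fd is available or dmabuf registration failed. */
	if (!ibv_mr)
		ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, access);
	return ibv_mr;
}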
jiaxiyan authored and shijin-aws committed Jun 6, 2024
1 parent 43ca6ac commit 6aa6708
Showing 4 changed files with 81 additions and 91 deletions.
prov/efa/src/efa_hmem.c (43 changes: 25 additions & 18 deletions)
@@ -97,6 +97,8 @@ static int efa_domain_hmem_info_init_system(struct efa_domain *efa_domain)
 	info->p2p_disabled_by_user = false;
 	info->p2p_required_by_impl = false;
 	info->p2p_supported_by_device = true;
+	info->dmabuf_supported = false;
+
 	efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYSTEM);
 	return 0;
 }
@@ -137,6 +139,7 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)
 
 	info->initialized = true;
 	info->p2p_disabled_by_user = false;
+	info->dmabuf_supported = false;
 
 	/* If user is using libfabric API 1.18 or later, by default EFA provider is permitted to
 	 * use CUDA library to support CUDA memory, therefore p2p is not required.
@@ -146,26 +149,24 @@ static int efa_domain_hmem_info_init_cuda(struct efa_domain *efa_domain)
 	else
 		info->p2p_required_by_impl = true;
 
-#if HAVE_EFA_DMABUF_MR
-	ret = cuda_get_dmabuf_fd(ptr, len, &dmabuf_fd, &dmabuf_offset);
+	ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_CUDA, ptr, len, &dmabuf_fd, &dmabuf_offset);
 	if (ret == FI_SUCCESS) {
-		ibv_mr = ibv_reg_dmabuf_mr(g_device_list[0].ibv_pd, dmabuf_offset,
+		ibv_mr = efa_mr_reg_ibv_dmabuf_mr(efa_domain->ibv_pd, dmabuf_offset,
 					   len, (uint64_t)ptr, dmabuf_fd, ibv_access);
 		if (!ibv_mr) {
 			EFA_INFO(FI_LOG_DOMAIN,
 				 "Unable to register CUDA device buffer via dmabuf: %s. "
 				 "Fall back to ibv_reg_mr\n", fi_strerror(-errno));
-			ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
+			ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
+		} else {
+			info->dmabuf_supported = true;
 		}
 	} else {
 		EFA_INFO(FI_LOG_DOMAIN,
 			 "Unable to retrieve dmabuf fd of CUDA device buffer: %d. "
 			 "Fall back to ibv_reg_mr\n", ret);
-		ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
+		ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
 	}
-#else
-	ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
-#endif
 
 	if (!ibv_mr) {
 		info->p2p_supported_by_device = false;
@@ -247,22 +248,27 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
 	info->p2p_disabled_by_user = false;
 	/* Neuron currently requires P2P */
 	info->p2p_required_by_impl = true;
+	info->dmabuf_supported = false;
 
-#if HAVE_EFA_DMABUF_MR
-	ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
+	ret = ofi_hmem_get_dmabuf_fd(FI_HMEM_NEURON, ptr, (uint64_t)len, &dmabuf_fd, &offset);
 	if (ret == FI_SUCCESS) {
-		ibv_mr = ibv_reg_dmabuf_mr(
-				g_device_list[0].ibv_pd, offset,
+		ibv_mr = efa_mr_reg_ibv_dmabuf_mr(
+				efa_domain->ibv_pd, offset,
 				len, (uint64_t)ptr, dmabuf_fd, ibv_access);
-	} else if (ret == -FI_ENOPROTOOPT) {
-		EFA_INFO(FI_LOG_MR,
+		if (!ibv_mr) {
+			EFA_INFO(FI_LOG_DOMAIN,
+				 "Unable to register neuron device buffer via dmabuf: %s. "
+				 "Fall back to ibv_reg_mr\n", fi_strerror(-errno));
+			ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
+		} else {
+			info->dmabuf_supported = true;
+		}
+	} else {
+		EFA_INFO(FI_LOG_DOMAIN,
 			 "Unable to retrieve dmabuf fd of Neuron device buffer, "
 			 "Fall back to ibv_reg_mr\n");
-		ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
+		ibv_mr = ibv_reg_mr(efa_domain->ibv_pd, ptr, len, ibv_access);
 	}
-#else
-	ibv_mr = ibv_reg_mr(g_device_list[0].ibv_pd, ptr, len, ibv_access);
-#endif
 
 	if (!ibv_mr) {
 		info->p2p_supported_by_device = false;
@@ -325,6 +331,7 @@ static int efa_domain_hmem_info_init_synapseai(struct efa_domain *efa_domain)
 	/* SynapseAI currently requires P2P */
 	info->p2p_required_by_impl = true;
 	info->p2p_supported_by_device = true;
+	info->dmabuf_supported = true;
 	efa_domain_hmem_info_init_protocol_thresholds(efa_domain, FI_HMEM_SYNAPSEAI);
 
 	/* Only the long read protocol is supported */
prov/efa/src/efa_hmem.h (1 change: 1 addition & 0 deletions)
@@ -26,6 +26,7 @@ struct efa_hmem_info {
 	bool p2p_disabled_by_user;    /* Did the user disable p2p via FI_OPT_FI_HMEM_P2P? */
 	bool p2p_required_by_impl;    /* Is p2p required for this interface? */
 	bool p2p_supported_by_device; /* do we support p2p with this device */
+	bool dmabuf_supported;
 
 	size_t max_intra_eager_size; /* Maximum message size to use eager protocol for intra-node */
 	size_t max_medium_msg_size;
prov/efa/src/efa_mr.c (100 changes: 27 additions & 73 deletions)
@@ -475,30 +475,6 @@ struct fi_ops efa_mr_ops = {
 	.ops_open = fi_no_ops_open,
 };
 
-#if HAVE_EFA_DMABUF_MR
-
-static inline
-struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
-					size_t len, uint64_t iova, int fd, int access)
-{
-	return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access);
-}
-
-#else
-
-static inline
-struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
-					size_t len, uint64_t iova, int fd, int access)
-{
-	EFA_WARN(FI_LOG_MR,
-		 "ibv_reg_dmabuf_mr is required for memory"
-		 " registration with FI_MR_DMABUF flags, but "
-		 " not available in the current rdma-core library."
-		 " please build libfabric with rdma-core >= 34.0\n");
-	return NULL;
-}
-
-#endif
 /**
  * @brief Register a memory buffer with rdma-core api.
  *
@@ -511,7 +487,20 @@ struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
 static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr,
 					int access, const uint64_t flags)
 {
-	if (flags & FI_MR_DMABUF)
+	int dmabuf_fd;
+	uint64_t offset;
+	int ret;
+
+	assert(efa_mr->domain->hmem_info[mr_attr->iface].p2p_supported_by_device);
+
+	if (flags & FI_MR_DMABUF) {
+		if (OFI_UNLIKELY(!efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported)) {
+			EFA_WARN(FI_LOG_MR, "Requested FI_MR_DMABUF, but dmabuf is not supported.\n");
+			return NULL;
+		}
+
+		EFA_INFO(FI_LOG_MR, "FI_MR_DMABUF is set. Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n",
+			 mr_attr->dmabuf->fd, mr_attr->dmabuf->offset, mr_attr->dmabuf->len);
 		return efa_mr_reg_ibv_dmabuf_mr(
 			efa_mr->domain->ibv_pd,
 			mr_attr->dmabuf->offset,
@@ -520,64 +509,29 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
 			mr_attr->dmabuf->fd,
 			access
 		);
+	}
 
-	/*
-	 * TODO: remove the synapseai and neuron blocks by onboarding the
-	 * ofi_hmem_get_dmabuf_fd API.
-	 */
-#if HAVE_SYNAPSEAI
-	if (efa_mr_is_synapseai(efa_mr)) {
-		int dmabuf_fd;
-		uint64_t offset;
-		int ret;
-
-		ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base,
-					      (uint64_t) mr_attr->mr_iov->iov_len,
-					      &dmabuf_fd, &offset);
+	if (efa_mr->domain->hmem_info[mr_attr->iface].dmabuf_supported) {
+		ret = ofi_hmem_get_dmabuf_fd(
+			mr_attr->iface,
+			mr_attr->mr_iov->iov_base,
+			(uint64_t) mr_attr->mr_iov->iov_len,
+			&dmabuf_fd, &offset);
 		if (ret != FI_SUCCESS) {
-			EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
+			EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for device buffer. errno: %d, err_msg: %s\n",
+				 ret, fi_strerror(-ret));
 			return NULL;
 		}
+		EFA_INFO(FI_LOG_MR, "Registering dmabuf mr with fd: %d, offset: %lu, len: %zu\n",
+			 dmabuf_fd, offset, mr_attr->mr_iov->iov_len);
 		return efa_mr_reg_ibv_dmabuf_mr(efa_mr->domain->ibv_pd, offset,
 						mr_attr->mr_iov->iov_len,
 						(uint64_t)mr_attr->mr_iov->iov_base,
 						dmabuf_fd, access);
 	}
-#endif
-
-#if HAVE_NEURON
-	if (efa_mr_is_neuron(efa_mr)) {
-		int dmabuf_fd;
-		uint64_t offset;
-		int ret;
-
-		ret = neuron_get_dmabuf_fd(
-			mr_attr->mr_iov->iov_base,
-			mr_attr->mr_iov->iov_len,
-			&dmabuf_fd,
-			&offset);
-
-		if (ret == FI_SUCCESS) {
-			/* Success => invoke ibv_reg_dmabuf_mr */
-			return efa_mr_reg_ibv_dmabuf_mr(
-				efa_mr->domain->ibv_pd, 0,
-				mr_attr->mr_iov->iov_len,
-				(uint64_t)mr_attr->mr_iov->iov_base,
-				dmabuf_fd, access);
-		} else if (ret == -FI_ENOPROTOOPT) {
-			/* Protocol not availabe => fallback */
-			EFA_INFO(FI_LOG_MR,
-				"Unable to get dmabuf fd for Neuron device buffer, "
-				"Fall back to ibv_reg_mr\n");
-			return ibv_reg_mr(
-				efa_mr->domain->ibv_pd,
-				(void *)mr_attr->mr_iov->iov_base,
-				mr_attr->mr_iov->iov_len, access);
-		}
-		return NULL;
-	}
-#endif
 
+	EFA_INFO(FI_LOG_MR, "Dmabuf is not supported. Registering memory via ibv_reg_mr with addr: %lu, len: %zu\n",
+		 (uint64_t)mr_attr->mr_iov->iov_base, mr_attr->mr_iov->iov_len);
 	return ibv_reg_mr(efa_mr->domain->ibv_pd,
 			  (void *)mr_attr->mr_iov->iov_base,
 			  mr_attr->mr_iov->iov_len, access);
prov/efa/src/efa_mr.h (28 changes: 28 additions & 0 deletions)
@@ -6,6 +6,9 @@
 
 #include <stdbool.h>
 #include <ofi_mr.h>
+#include <infiniband/verbs.h>
+
+#include "efa_prov.h"
 
 /*
  * Descriptor returned for FI_HMEM peer memory registrations
@@ -35,6 +38,31 @@ struct efa_mr {
 	bool needs_sync;
 };
 
+#if HAVE_EFA_DMABUF_MR
+
+static inline
+struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
+					size_t len, uint64_t iova, int fd, int access)
+{
+	return ibv_reg_dmabuf_mr(pd, offset, len, iova, fd, access);
+}
+
+#else
+
+static inline
+struct ibv_mr *efa_mr_reg_ibv_dmabuf_mr(struct ibv_pd *pd, uint64_t offset,
+					size_t len, uint64_t iova, int fd, int access)
+{
+	EFA_WARN(FI_LOG_MR,
+		 "ibv_reg_dmabuf_mr is required for memory"
+		 " registration with FI_MR_DMABUF flags, but "
+		 " not available in the current rdma-core library."
+		 " please build libfabric with rdma-core >= 34.0\n");
+	return NULL;
+}
+
+#endif
+
 extern int efa_mr_cache_enable;
 extern size_t efa_mr_max_cached_count;
 extern size_t efa_mr_max_cached_size;
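
For context, an application reaches the FI_MR_DMABUF branch in efa_mr_reg_ibv_mr through fi_mr_regattr. A caller-side sketch, assuming the dmabuf fd has already been exported for the device buffer; register_dmabuf, the FI_HMEM_CUDA iface, and the access flags are illustrative choices, not part of this commit:

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

static int register_dmabuf(struct fid_domain *domain, int fd, void *base,
			   size_t len, struct fid_mr **mr)
{
	/* Describe the already-exported dmabuf region; with FI_MR_DMABUF,
	 * the dmabuf field is used in place of mr_iov. */
	struct fi_mr_dmabuf dmabuf = {
		.fd = fd,
		.offset = 0,
		.len = len,
		.base_addr = base,
	};
	struct fi_mr_attr attr = {
		.dmabuf = &dmabuf,
		.iov_count = 1,
		.access = FI_SEND | FI_RECV,
		.iface = FI_HMEM_CUDA,
	};

	/* With FI_MR_DMABUF set, the EFA provider always takes the
	 * ibv_reg_dmabuf_mr path, or fails if dmabuf is unsupported. */
	return fi_mr_regattr(domain, &attr, FI_MR_DMABUF, mr);
}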
