diff --git a/config/opal_check_ofi.m4 b/config/opal_check_ofi.m4 index d3d4185ae9e..b7761e262f0 100644 --- a/config/opal_check_ofi.m4 +++ b/config/opal_check_ofi.m4 @@ -126,19 +126,26 @@ AC_DEFUN([_OPAL_CHECK_OFI],[ CPPFLAGS="$CPPFLAGS $opal_ofi_CPPFLAGS" AS_IF([test $opal_ofi_happy = yes], - [AC_CHECK_MEMBER([struct fi_info.nic], + [AC_CHECK_HEADERS([rdma/fi_ext.h]) + + AC_CHECK_MEMBER([struct fi_info.nic], [opal_check_fi_info_pci=1], [opal_check_fi_info_pci=0], - [[#include ]])]) + [[#include ]]) + + AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE], + [$opal_check_fi_info_pci], + [check if pci data is available in ofi]) - AC_DEFINE_UNQUOTED([OPAL_OFI_PCI_DATA_AVAILABLE], - [$opal_check_fi_info_pci], - [check if pci data is available in ofi]) + AC_CHECK_DECLS([PMIX_PACKAGE_RANK], + [], + [], + [#include ]) - AC_CHECK_DECLS([PMIX_PACKAGE_RANK], - [], - [], - [#include ]) + AC_CHECK_TYPES([struct fi_ops_mem_monitor], [], [], + [#ifdef HAVE_RDMA_FI_EXT_H +#include +#endif])]) CPPFLAGS=$opal_check_ofi_save_CPPFLAGS LDFLAGS=$opal_check_ofi_save_LDFLAGS @@ -157,18 +164,6 @@ AC_DEFUN([_OPAL_CHECK_OFI],[ [AC_MSG_WARN([OFI libfabric support requested (via --with-ofi or --with-libfabric), but not found.]) AC_MSG_ERROR([Cannot continue.])]) ]) - opal_ofi_import_monitor=no - AS_IF([test $opal_ofi_happy = "yes"], - [OPAL_CHECK_OFI_VERSION_GE([1,14], - [opal_ofi_import_monitor=yes], - [opal_ofi_import_monitor=no])]) - - -if test "$opal_ofi_import_monitor" = "yes"; then - AC_DEFINE_UNQUOTED([OPAL_OFI_IMPORT_MONITOR_SUPPORT],1, - [Whether libfabric supports monitor import]) -fi - ])dnl diff --git a/ompi/mca/mtl/ofi/mtl_ofi_component.c b/ompi/mca/mtl/ofi/mtl_ofi_component.c index bbe32213bd2..4f93c86879c 100644 --- a/ompi/mca/mtl/ofi/mtl_ofi_component.c +++ b/ompi/mca/mtl/ofi/mtl_ofi_component.c @@ -254,9 +254,7 @@ ompi_mtl_ofi_component_register(void) MCA_BASE_VAR_SCOPE_READONLY, &ompi_mtl_ofi.num_ofi_contexts); - 
opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version); - - return OMPI_SUCCESS; + return opal_common_ofi_mca_register(&mca_mtl_ofi_component.super.mtl_version); } @@ -285,7 +283,7 @@ ompi_mtl_ofi_component_open(void) "provider_exclude")) { return OMPI_ERR_NOT_AVAILABLE; } - return opal_common_ofi_init(); + return opal_common_ofi_open(); } static int @@ -302,9 +300,7 @@ ompi_mtl_ofi_component_close(void) #if OPAL_CUDA_SUPPORT mca_common_cuda_fini(); #endif - opal_common_ofi_mca_deregister(); - opal_common_ofi_fini(); - return OMPI_SUCCESS; + return opal_common_ofi_close(); } int @@ -582,8 +578,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, int universe_size; char *univ_size_str; - opal_common_ofi_mca_register(); - opal_output_verbose(1, opal_common_ofi.output, "%s:%d: mtl:ofi:provider_include = \"%s\"\n", __FILE__, __LINE__, *opal_common_ofi.prov_include); @@ -893,6 +887,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads, } } + /* this must be called during single threaded part of the code and + * before Libfabric configures its memory monitors. Easiest to do + * that before domain open. Silently ignore not-supported errors, + * as they are not critical to program correctness, but only + * indicate that Libfabric will have to pick a different, possibly + * less optimal, monitor. 
*/ + ret = opal_common_ofi_export_memory_monitor(); + if (0 != ret && -FI_ENOSYS != ret) { + opal_output_verbose(1, opal_common_ofi.output, + "Failed to inject Libfabric memory monitor: %s", + fi_strerror(-ret)); + } + + /** * Open fabric * The getinfo struct returns a fabric attribute struct that can be used to diff --git a/opal/mca/btl/ofi/btl_ofi_component.c b/opal/mca/btl/ofi/btl_ofi_component.c index cd1816b273a..e2f5512ae93 100644 --- a/opal/mca/btl/ofi/btl_ofi_component.c +++ b/opal/mca/btl/ofi/btl_ofi_component.c @@ -123,6 +123,7 @@ static int validate_info(struct fi_info *info, uint64_t required_caps, char **in /* Register the MCA parameters */ static int mca_btl_ofi_component_register(void) { + int ret; char *msg; mca_btl_ofi_module_t *module = &mca_btl_ofi_module_template; @@ -191,7 +192,10 @@ static int mca_btl_ofi_component_register(void) /* for now we want this component to lose to the MTL. */ module->super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_HIGH - 50; - opal_common_ofi_register_mca_variables(&mca_btl_ofi_component.super.btl_version); + ret = opal_common_ofi_mca_register(&mca_btl_ofi_component.super.btl_version); + if (OPAL_SUCCESS != ret) { + return ret; + } return mca_btl_base_param_register(&mca_btl_ofi_component.super.btl_version, &module->super); } @@ -199,7 +203,7 @@ static int mca_btl_ofi_component_register(void) static int mca_btl_ofi_component_open(void) { mca_btl_ofi_component.module_count = 0; - return opal_common_ofi_init(); + return opal_common_ofi_open(); } /* @@ -207,11 +211,11 @@ static int mca_btl_ofi_component_open(void) */ static int mca_btl_ofi_component_close(void) { - opal_common_ofi_mca_deregister(); - opal_common_ofi_fini(); + int ret; + ret = opal_common_ofi_close(); /* If we don't sleep, sockets provider freaks out. 
Ummm this is a scary comment */ sleep(1); - return OPAL_SUCCESS; + return ret; } void mca_btl_ofi_exit(void) @@ -259,8 +263,6 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules, struct fi_domain_attr domain_attr = {0}; uint64_t required_caps; - opal_common_ofi_mca_register(); - switch (mca_btl_ofi_component.mode) { case MCA_BTL_OFI_MODE_TWO_SIDED: @@ -444,6 +446,19 @@ static int mca_btl_ofi_init_device(struct fi_info *info) * to prevent races. */ mca_btl_ofi_rcache_init(module); + /* for similar reasons to the rcache call, this must be called + * during single threaded part of the code and before Libfabric + * configures its memory monitors. Easiest to do that before + * domain open. Silently ignore not-supported errors, as they + * are not critical to program correctness, but only indicate + * that Libfabric will have to pick a different, possibly less + * optimal, monitor. */ + rc = opal_common_ofi_export_memory_monitor(); + if (0 != rc && -FI_ENOSYS != rc) { + BTL_VERBOSE(("Failed to inject Libfabric memory monitor: %s", + fi_strerror(-rc))); + } + linux_device_name = info->domain_attr->name; BTL_VERBOSE( ("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name)); diff --git a/opal/mca/common/ofi/common_ofi.c b/opal/mca/common/ofi/common_ofi.c index 1a5d0fb72f5..a9880151e25 100644 --- a/opal/mca/common/ofi/common_ofi.c +++ b/opal/mca/common/ofi/common_ofi.c @@ -15,57 +15,78 @@ * $HEADER$ */ + +#include "opal_config.h" + #include #include +#include +#ifdef HAVE_RDMA_FI_EXT_H +#include +#endif -#include "opal_config.h" #include "common_ofi.h" #include "opal/constants.h" #include "opal/mca/base/mca_base_framework.h" #include "opal/mca/base/mca_base_var.h" #include "opal/mca/hwloc/base/base.h" +#include "opal/mca/memory/base/base.h" #include "opal/mca/pmix/base/base.h" #include "opal/util/argv.h" #include "opal/util/show_help.h" -OPAL_DECLSPEC opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, 
- .prov_exclude = NULL, - .registered = 0, - .verbose = 0}; - +opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL, + .prov_exclude = NULL, + .output = -1}; static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic"; static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT; -static bool opal_common_ofi_initialized = false; static int opal_common_ofi_init_ref_cnt = 0; +static bool opal_common_ofi_installed_memory_monitor = false; -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR +/* + * Monitor object to export into Libfabric to provide memory release + * notifications using our own memory hooks framework. Monitors may + * use the subscribe/unsubscribe notifications to reduce unnecessary + * notifications, but are not required to do so. Because patcher + * notifies about all releases, it is cheaper for us to not filter and + * this monitor can safely ignore subscribe/unsubscribe notifications. + * + * Libfabric requires the object to be fully defined. Unlike most of + * Open MPI, it does not have NULL function pointer checks in calling + * code. 
+ */ static int opal_common_ofi_monitor_start(struct fid_mem_monitor *monitor) { return 0; } + static void opal_common_ofi_monitor_stop(struct fid_mem_monitor *monitor) { return; } + static int opal_common_ofi_monitor_subscribe(struct fid_mem_monitor *monitor, const void *addr, size_t len) { return 0; } + static void opal_common_ofi_monitor_unsubscribe(struct fid_mem_monitor *monitor, const void *addr, size_t len) { return; } + static bool opal_common_ofi_monitor_valid(struct fid_mem_monitor *monitor, const void *addr, size_t len) { return true; } -static struct fid_mem_monitor *opal_common_ofi_monitor; -static struct fid *opal_common_ofi_cache_fid; +static struct fid_mem_monitor *opal_common_ofi_monitor = NULL; +static struct fid *opal_common_ofi_cache_fid = NULL; static struct fi_ops_mem_monitor opal_common_ofi_export_ops = { .size = sizeof(struct fi_ops_mem_monitor), .start = opal_common_ofi_monitor_start, @@ -75,82 +96,152 @@ static struct fi_ops_mem_monitor opal_common_ofi_export_ops = { .valid = opal_common_ofi_monitor_valid, }; -OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length, - void *cbdata, bool from_alloc) +/** + * Callback function from Open MPI memory monitor + * + * Translation function between the callback function from Open MPI's + * memory notifier to the Libfabric memory monitor. 
+ */ +static void opal_common_ofi_mem_release_cb(void *buf, size_t length, + void *cbdata, bool from_alloc) { opal_common_ofi_monitor->import_ops->notify(opal_common_ofi_monitor, buf, length); } -#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */ -OPAL_DECLSPEC int opal_common_ofi_init(void) +#endif /* HAVE_STRUCT_FI_OPS_MEM_MONITOR */ + +int opal_common_ofi_export_memory_monitor(void) { - int ret; + int ret = -FI_ENOSYS; - opal_common_ofi_init_ref_cnt++; - if (opal_common_ofi_initialized) { - return OPAL_SUCCESS; +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR + OPAL_THREAD_LOCK(&opal_common_ofi_mutex); + + if (NULL != opal_common_ofi_cache_fid) { + /* already exported by an earlier caller; leave through the + * common exit so the mutex taken above is released */ + ret = 0; + goto err; } -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT - mca_base_framework_open(&opal_memory_base_framework, 0); + /* + * While the memory import functionality was introduced in 1.13, + * some deadlock bugs exist in the 1.13 series. Require version + * 1.14 before this code is activated. Not activating the code + * should not break any functionality directly, but may lead to + * sub-optimal memory monitors being used in Libfabric, as Open + * MPI will almost certainly install a patcher first. 
+ */ + if (FI_VERSION_LT(fi_version(), FI_VERSION(1, 14))) { + ret = -FI_ENOSYS; + goto err; + } + + ret = mca_base_framework_open(&opal_memory_base_framework, 0); + if (OPAL_SUCCESS != ret) { + ret = -FI_ENOSYS; + goto err; + } if ((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT) != (((OPAL_MEMORY_FREE_SUPPORT | OPAL_MEMORY_MUNMAP_SUPPORT)) & opal_mem_hooks_support_level())) { - return OPAL_SUCCESS; + ret = -FI_ENOSYS; + goto err; } - ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, &opal_common_ofi_cache_fid, NULL); - if (ret) { + /* + * The monitor import object has the well known name "mr_cache" + * and was introduced in Libfabric 1.13 + */ + ret = fi_open(FI_VERSION(1,13), "mr_cache", NULL, 0, 0, + &opal_common_ofi_cache_fid, NULL); + if (0 != ret) { goto err; } opal_common_ofi_monitor = calloc(1, sizeof(*opal_common_ofi_monitor)); - if (!opal_common_ofi_monitor) { + if (NULL == opal_common_ofi_monitor) { + ret = -FI_ENOMEM; goto err; } opal_common_ofi_monitor->fid.fclass = FI_CLASS_MEM_MONITOR; opal_common_ofi_monitor->export_ops = &opal_common_ofi_export_ops; - ret = fi_import_fid(opal_common_ofi_cache_fid, &opal_common_ofi_monitor->fid, 0); - if (ret) { + ret = fi_import_fid(opal_common_ofi_cache_fid, + &opal_common_ofi_monitor->fid, 0); + if (0 != ret) { goto err; } opal_mem_hooks_register_release(opal_common_ofi_mem_release_cb, NULL); - opal_common_ofi_initialized = true; + opal_common_ofi_installed_memory_monitor = true; + + ret = 0; - return OPAL_SUCCESS; err: - if (opal_common_ofi_cache_fid) { - fi_close(opal_common_ofi_cache_fid); - } - if (opal_common_ofi_monitor) { - free(opal_common_ofi_monitor); + if (0 != ret) { + /* failure only: release whatever was acquired and clear the + * handles so a later call does not mistake them for a + * completed export */ + if (NULL != opal_common_ofi_cache_fid) { + fi_close(opal_common_ofi_cache_fid); + opal_common_ofi_cache_fid = NULL; + } + if (NULL != opal_common_ofi_monitor) { + free(opal_common_ofi_monitor); + opal_common_ofi_monitor = NULL; + } + opal_common_ofi_installed_memory_monitor = false; } - return OPAL_ERROR; -#else - opal_common_ofi_initialized = true; - return OPAL_SUCCESS; + + 
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); #endif + + return ret; } -OPAL_DECLSPEC int opal_common_ofi_fini(void) +static int opal_common_ofi_remove_memory_monitor(void) { - if (opal_common_ofi_initialized && !--opal_common_ofi_init_ref_cnt) { -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT +#ifdef HAVE_STRUCT_FI_OPS_MEM_MONITOR + if (opal_common_ofi_installed_memory_monitor) { opal_mem_hooks_unregister_release(opal_common_ofi_mem_release_cb); fi_close(opal_common_ofi_cache_fid); fi_close(&opal_common_ofi_monitor->fid); free(opal_common_ofi_monitor); + /* clear the released handles so a later export does not see a stale non-NULL fid */ + opal_common_ofi_cache_fid = NULL; + opal_common_ofi_monitor = NULL; + opal_common_ofi_installed_memory_monitor = false; + } #endif - opal_common_ofi_initialized = false; + + return OPAL_SUCCESS; +} + +int opal_common_ofi_open(void) +{ + if ((opal_common_ofi_init_ref_cnt++) > 0) { + return OPAL_SUCCESS; } return OPAL_SUCCESS; } -OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item) +int opal_common_ofi_close(void) +{ + int ret; + + if ((--opal_common_ofi_init_ref_cnt) > 0) { + return OPAL_SUCCESS; + } + + ret = opal_common_ofi_remove_memory_monitor(); + if (OPAL_SUCCESS != ret) { + return ret; + } + + if (-1 != opal_common_ofi.output) { + opal_output_close(opal_common_ofi.output); + opal_common_ofi.output = -1; + if (OPAL_SUCCESS != ret) { + return ret; + } + } + + return OPAL_SUCCESS; +} + +int opal_common_ofi_is_in_list(char **list, char *item) { int i = 0; @@ -169,11 +260,12 @@ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item) return 0; } -OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component) +int opal_common_ofi_mca_register(const mca_base_component_t *component) { static int include_index; static int exclude_index; static int verbose_index; + /* static: this address is passed as the MCA variable's storage and must outlive the call */ + static int verbose; int param; if (fi_version() < FI_VERSION(1, 0)) { @@ -231,7 +323,7 @@ OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_componen MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL, - 
&opal_common_ofi.verbose); + &verbose); } else { verbose_index = param; } @@ -248,32 +340,14 @@ OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_componen "verbose", 0); } - OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); - - return OPAL_SUCCESS; -} - -OPAL_DECLSPEC void opal_common_ofi_mca_register(void) -{ - opal_common_ofi.registered++; - if (opal_common_ofi.registered > 1) { - opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose); - return; + if (opal_common_ofi.output == -1) { + opal_common_ofi.output = opal_output_open(NULL); + opal_output_set_verbosity(opal_common_ofi.output, verbose); } - opal_common_ofi.output = opal_output_open(NULL); - opal_output_set_verbosity(opal_common_ofi.output, opal_common_ofi.verbose); -} + OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex); -OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void) -{ - /* unregister only on last deregister */ - opal_common_ofi.registered--; - assert(opal_common_ofi.registered >= 0); - if (opal_common_ofi.registered) { - return; - } - opal_output_close(opal_common_ofi.output); + return OPAL_SUCCESS; } /* check that the tx attributes match */ @@ -484,61 +558,6 @@ static uint32_t get_package_rank(opal_process_info_t *process_info) return (uint32_t) package_ranks[process_info->my_local_rank]; } -/* Selects a NIC based on hardware locality between process cpuset and device BDF. - * - * Initializes opal_hwloc_topology to access hardware topology if not previously - * initialized - * - * There are 3 main cases that this covers: - * - * 1. If the first provider passed into this function is the only valid - * provider, this provider is returned. - * - * 2. If there is more than 1 provider that matches the type of the first - * provider in the list, and the BDF data - * is available then a provider is selected based on locality of device - * cpuset and process cpuset and tries to ensure that processes are distributed - * evenly across NICs. 
This has two separate cases: - * - * i. There is one or more provider local to the process: - * - * (local rank % number of providers of the same type that share the process cpuset) - * is used to select one of these providers. - * - * ii. There is no provider that is local to the process: - * - * (local rank % number of providers of the same type) - * is used to select one of these providers - * - * 3. If there is more than 1 providers of the same type in the list, and the BDF data - * is not available (the ofi version does not support fi_info.nic or the - * provider does not support BDF) then (local rank % number of providers of the same type) - * is used to select one of these providers - * - * @param provider_list (IN) struct fi_info* An initially selected - * provider NIC. The provider name and - * attributes are used to restrict NIC - * selection. This provider is returned if the - * NIC selection fails. - * - * @param package_rank (IN) uint32_t The rank of the process. Used to - * select one valid NIC if there is a case - * where more than one can be selected. This - * could occur when more than one provider - * shares the same cpuset as the process. - * This could either be a package_rank if one is - * successfully calculated, or the process id. - * - * @param provider (OUT) struct fi_info* object with the selected - * provider if the selection succeeds - * if the selection fails, returns the fi_info - * object that was initially provided. - * - * All errors should be recoverable and will return the initially provided - * provider. However, if an error occurs we can no longer guarantee - * that the provider returned is local to the process or that the processes will - * balance across available NICs. 
- */ struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, opal_process_info_t *process_info) { diff --git a/opal/mca/common/ofi/common_ofi.h b/opal/mca/common/ofi/common_ofi.h index d2e17c57ebe..ec21fd732b6 100644 --- a/opal/mca/common/ofi/common_ofi.h +++ b/opal/mca/common/ofi/common_ofi.h @@ -18,34 +18,74 @@ #ifndef OPAL_MCA_COMMON_OFI_H #define OPAL_MCA_COMMON_OFI_H -#include "opal_config.h" -#include "opal/mca/base/mca_base_framework.h" -#include "opal/mca/base/mca_base_var.h" #include "opal/util/proc.h" #include "opal/memoryhooks/memory.h" -#include -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT -#include -#endif BEGIN_C_DECLS typedef struct opal_common_ofi_module { char **prov_include; char **prov_exclude; - int verbose; - int registered; int output; } opal_common_ofi_module_t; extern opal_common_ofi_module_t opal_common_ofi; -extern mca_base_framework_t opal_memory_base_framework; -OPAL_DECLSPEC int opal_common_ofi_register_mca_variables(const mca_base_component_t *component); -OPAL_DECLSPEC void opal_common_ofi_mca_register(void); -OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); +/** + * Common MCA registration + * + * Common MCA registration handling. After calling this function, + * \c opal_common_ofi.output will be properly initialized. + * + * @param component (IN) OFI component being initialized + * + * @returns OPAL_SUCCESS on success, OPAL error code on failure + */ +OPAL_DECLSPEC int opal_common_ofi_mca_register(const mca_base_component_t *component); -/* +/** + * Initializes common objects for libfabric + * + * Initialize common libfabric interface. This should be called from + * any other OFI component's component_open() call. + * + * @note This function is not thread safe and must be called in a + * serial portion of the code. + */ +OPAL_DECLSPEC int opal_common_ofi_open(void); + +/** + * Cleans up common objects for libfabric + * + * Clean up common libfabric interface. 
This should be called from + * any other OFI component's component_close() call. Resource cleanup + * is reference counted: each successful opal_common_ofi_open() + * must be matched by one call to this function. + * + * @note This function is not thread safe and must be called in a + * serial portion of the code. + */ +OPAL_DECLSPEC int opal_common_ofi_close(void); + +/** + * Export our memory hooks into Libfabric monitor + * + * Use Open MPI's memory hooks to provide monitor notifications to + * Libfabric via the external mr_cache facility. This must be called + * before any domain is initialized (i.e., before any Libfabric memory + * monitor is configured). + * + * @returns 0 on success, a negative libfabric error code on error + */ +OPAL_DECLSPEC int opal_common_ofi_export_memory_monitor(void); + +/** + * Search function for provider names + * + * This function will take a provider name string and a list of lower + * provider name strings as inputs. It will return true if the lower + * provider in the item string matches a lower provider in the list. + * * @param list (IN) List of strings corresponding to lower providers. * @param item (IN) Single string corresponding to a provider. * @@ -54,44 +94,63 @@ OPAL_DECLSPEC void opal_common_ofi_mca_deregister(void); * @return 1 The lower provider of the item string matches * a string in the item list. * - * This function will take a provider name string and a list of lower - * provider name strings as inputs. It will return true if the lower - * provider in the item string matches a lower provider in the list. - * */ OPAL_DECLSPEC int opal_common_ofi_is_in_list(char **list, char *item); -#if OPAL_OFI_IMPORT_MONITOR_SUPPORT -/* - * @param buf (IN) Pointer to the start of the allocation - * @param length (IN) Length of the allocation - * @param cbdata (IN) Data passed to memory hooks when callback - * was registered - * @param from_alloc (IN) True if the callback is caused by a call to the - * general allocation routines (malloc, calloc, free, - * etc.) 
or directly from the user (mmap, munmap, etc.) - * - * Callback function triggered when memory is about to be freed. - * is about to be freed. The callback will be triggered according to - * the note in opal_mem_hooks_register_release(). +/** + * Selects NIC (provider) based on hardware locality + * + * In multi-nic situations, use hardware topology to pick the "best" + * of the selected NICs. + * There are 3 main cases that this covers: + * + * 1. If the first provider passed into this function is the only valid + * provider, this provider is returned. + * + * 2. If there is more than 1 provider that matches the type of the first + * provider in the list, and the BDF data + * is available then a provider is selected based on locality of device + * cpuset and process cpuset and tries to ensure that processes + * are distributed evenly across NICs. This has two separate + * cases: + * + * i. There is one or more provider local to the process: + * + * (local rank % number of providers of the same type + * that share the process cpuset) is used to select one + * of these providers. + * + * ii. There is no provider that is local to the process: + * + * (local rank % number of providers of the same type) + * is used to select one of these providers + * + * 3. If there is more than 1 providers of the same type in the + * list, and the BDF data is not available (the ofi version does + * not support fi_info.nic or the provider does not support BDF) + * then (local rank % number of providers of the same type) is + * used to select one of these providers + * + * @param provider_list (IN) struct fi_info* An initially selected + * provider NIC. The provider name and + * attributes are used to restrict NIC + * selection. This provider is returned if the + * NIC selection fails. + * + * @param provider (OUT) struct fi_info* object with the selected + * provider if the selection succeeds + * if the selection fails, returns the fi_info + * object that was initially provided. 
+ * + * All errors should be recoverable and will return the initially provided + * provider. However, if an error occurs we can no longer guarantee + * that the provider returned is local to the process or that the processes will + * balance across available NICs. * */ -OPAL_DECLSPEC void opal_common_ofi_mem_release_cb(void *buf, size_t length, void *cbdata, bool from_alloc); -#endif /* OPAL_OFI_IMPORT_MONITOR_SUPPORT */ - -/* - * Initializes common objects for libfabric - */ -OPAL_DECLSPEC int opal_common_ofi_init(void); - -/* - * Cleans up common objects for libfabric - */ -OPAL_DECLSPEC int opal_common_ofi_fini(void); +OPAL_DECLSPEC struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, + opal_process_info_t *process_info); END_C_DECLS -struct fi_info *opal_mca_common_ofi_select_provider(struct fi_info *provider_list, - opal_process_info_t *process_info); - #endif /* OPAL_MCA_COMMON_OFI_H */