diff --git a/config/opal_configure_options.m4 b/config/opal_configure_options.m4 index c20e2eb4e4e..d5be1630aa9 100644 --- a/config/opal_configure_options.m4 +++ b/config/opal_configure_options.m4 @@ -10,7 +10,7 @@ dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, dnl University of Stuttgart. All rights reserved. dnl Copyright (c) 2004-2005 The Regents of the University of California. dnl All rights reserved. -dnl Copyright (c) 2006-2020 Cisco Systems, Inc. All rights reserved +dnl Copyright (c) 2006-2022 Cisco Systems, Inc. All rights reserved dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved. dnl Copyright (c) 2009 IBM Corporation. All rights reserved. dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights @@ -327,25 +327,24 @@ fi # AC_MSG_CHECKING([for default value of mca_base_component_show_load_errors]) -AC_ARG_ENABLE([show-load-errors-by-default], - [AS_HELP_STRING([--enable-show-load-errors-by-default], - [Set the default value for the MCA parameter - mca_base_component_show_load_errors (but can be - overridden at run time by the usual - MCA-variable-setting mechansism). This MCA variable - controls whether warnings are displayed when an MCA - component fails to load at run time due to an error. - (default: enabled, meaning that - mca_base_component_show_load_errors is enabled - by default])]) -if test "$enable_show_load_errors_by_default" = "no" ; then - OPAL_SHOW_LOAD_ERRORS_DEFAULT=0 - AC_MSG_RESULT([disabled by default]) -else - OPAL_SHOW_LOAD_ERRORS_DEFAULT=1 - AC_MSG_RESULT([enabled by default]) -fi -AC_DEFINE_UNQUOTED(OPAL_SHOW_LOAD_ERRORS_DEFAULT, $OPAL_SHOW_LOAD_ERRORS_DEFAULT, +AC_ARG_WITH([show-load-errors], + [AS_HELP_STRING([--with-show-load-errors], + [Set the default value for the MCA + parameter + mca_base_component_show_load_errors (but + can be overridden at run time by the usual + MCA-variable-setting mechansism). 
+ (default: "all")])]) + +AS_IF([test -z "$with_show_load_errors" -o "$with_show_load_errors" = "yes"], + [with_show_load_errors=all + AC_MSG_RESULT([enabled for all])], + [AS_IF([test "$with_show_load_errors" = "no"], + [with_show_load_errors=none + AC_MSG_RESULT([disabled for all])], + [AC_MSG_RESULT([$with_show_load_errors])])]) + +AC_DEFINE_UNQUOTED(OPAL_SHOW_LOAD_ERRORS_DEFAULT, ["$with_show_load_errors"], [Default value for mca_base_component_show_load_errors MCA variable]) diff --git a/docs/running-apps/tuning.rst b/docs/running-apps/tuning.rst index 88d2a782f90..ec95bc0d5f4 100644 --- a/docs/running-apps/tuning.rst +++ b/docs/running-apps/tuning.rst @@ -445,3 +445,66 @@ presented here so that they can easily be found via internet searches: .. important:: You can only use the "include" *or* the "exclude" parameter |mdash| they are mutually exclusive from each other. +* ``opal_mca_base_component_show_load_errors``: By default, Open MPI + emits a warning message if it fails to open a DSO component at run + time. This typically happens when a shared library that the DSO + requires is not available. + + .. admonition:: Rationale + :class: tip + + In prior versions of Open MPI, components defaulted to building + as DSOs (vs. being included in their parent libraries, such as + ``libmpi.so``). On misconfigured systems, sometimes network + acceleration libraries would not be present, meaning that + HPC-class networking components failed to open at run time. As + such, Open MPI would typically fall back to TCP as a network + transport, which usually led to poor performance of end-user + applications. + + Having Open MPI warn about such failures to load was useful + because it alerted users to the misconfiguration. + + .. 
note:: By default, Open MPI |ompi_ver| includes all components in + its base libraries (e.g., on Linux, ``libmpi.so`` includes + all the components that were built with Open MPI, and + therefore no component needs to be opened dynamically), and + does not build its components as DSOs. + + This MCA parameter *only* affects the behavior when a + component DSO fails to open. + + This MCA parameter can take four general values: + + #. ``yes`` or a boolean "true" value (e.g., ``1``): Open MPI will + emit a warning about every component DSO that fails to load. + + #. ``no`` or a boolean "false" value (e.g., ``0``): Open MPI will + never emit warnings about component DSOs that fail to load. + + #. A comma-delimited list of frameworks and/or components: Open MPI + will emit a warning about any dynamic component that fails to + open and matches a token in the list. "Match" is defined as: + + * If a token in the list is only a framework name, then any + component in that framework will match. + * If a token in the list specifies both a framework name and a + component name (in the form ``framework/component``), then + only the specified component in the specified framework will + match. + + For example, if the value of this MCA parameter is + ``accelerator,btl/uct``, then Open MPI will warn if any component in + the accelerator framework, or the UCT BTL itself, fails to load at run + time. + + #. The value can also be a ``^`` character followed by a + comma-delimited list of ``framework[/component]`` values: This + is similar to the comma-delimited list of tokens, except it will + only emit warnings about dynamic components that fail to load + and do *not* match a token in the list. + + For example, if the value of this MCA parameter is + ``^accelerator,btl/uct``, then Open MPI will only warn about the + failure to load DSOs that are neither in the accelerator + framework nor the UCT BTL.
diff --git a/opal/mca/base/base.h b/opal/mca/base/base.h index 45887a99a95..789cbc52030 100644 --- a/opal/mca/base/base.h +++ b/opal/mca/base/base.h @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2022 Cisco Systems, Inc. All rights reserved * Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -69,7 +69,7 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_component_priority_list_item_t); * Public variables */ OPAL_DECLSPEC extern char *mca_base_component_path; -OPAL_DECLSPEC extern bool mca_base_component_show_load_errors; +OPAL_DECLSPEC extern char *mca_base_component_show_load_errors; OPAL_DECLSPEC extern bool mca_base_component_track_load_errors; OPAL_DECLSPEC extern bool mca_base_component_disable_dlopen; OPAL_DECLSPEC extern char *mca_base_system_default_path; @@ -214,6 +214,10 @@ OPAL_DECLSPEC int mca_base_framework_components_register(struct mca_base_framewo mca_base_register_flag_t flags); /* mca_base_components_open.c */ +OPAL_DECLSPEC int mca_base_show_load_errors_init(void); +OPAL_DECLSPEC int mca_base_show_load_errors_finalize(void); +OPAL_DECLSPEC bool mca_base_show_load_errors(const char *framework_name, + const char *component_name); OPAL_DECLSPEC int mca_base_framework_components_open(struct mca_base_framework_t *framework, mca_base_open_flag_t flags); diff --git a/opal/mca/base/help-mca-base.txt b/opal/mca/base/help-mca-base.txt index 543b5c51f25..2637fcce1fa 100644 --- a/opal/mca/base/help-mca-base.txt +++ b/opal/mca/base/help-mca-base.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2008-2014 Cisco Systems, Inc. 
All rights reserved. +# Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved # $COPYRIGHT$ # # Additional copyrights may follow @@ -59,3 +59,23 @@ all components *except* a and b", while "c,d" specifies the inclusive behavior and means "use *only* components c and d." You cannot mix inclusive and exclusive behavior. +# +[internal error during init] +An internal error has occurred during the startup of Open MPI. This +is highly unusual and shouldn't happen. Open MPI will now abort your +job. + +The following message may provide additional insight into the error: + + Failure at: %s (%s:%d) + Error: %d (%s) +# +[show_load_errors: too many /] +The opal_mca_base_component_show_load_errors MCA variable cannot +contain a token that has more than one "/" character in it. + +The opal_mca_base_component_show_load_errors MCA variable can only +contain the values: all, none, or a comma-delimited list of tokens in +the form of "framework" or "framework/component". + + Erroneous value: %s diff --git a/opal/mca/base/mca_base_close.c b/opal/mca/base/mca_base_close.c index 2ffd5ae6568..0f2f0fa9ff2 100644 --- a/opal/mca/base/mca_base_close.c +++ b/opal/mca/base/mca_base_close.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2022 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. 
* $COPYRIGHT$ @@ -61,6 +61,9 @@ void mca_base_close(void) /* Shut down the dynamic component finder */ mca_base_component_find_finalize(); + /* Shut down the show_load_errors processing */ + mca_base_show_load_errors_finalize(); + /* Close opal output stream 0 */ opal_output_close(0); } diff --git a/opal/mca/base/mca_base_component_repository.c b/opal/mca/base/mca_base_component_repository.c index 34c50d639af..169188f12d1 100644 --- a/opal/mca/base/mca_base_component_repository.c +++ b/opal/mca/base/mca_base_component_repository.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -372,7 +372,8 @@ int mca_base_component_repository_open(mca_base_framework_t *framework, "%s MCA component \"%s\" at path %s", ri->ri_type, ri->ri_name, ri->ri_path); - vl = mca_base_component_show_load_errors ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_INFO; + vl = mca_base_show_load_errors(ri->ri_type, + ri->ri_name) ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_INFO; /* Ensure that this component is not already loaded (should only happen if it was statically loaded). It's an error if it's already diff --git a/opal/mca/base/mca_base_components_open.c b/opal/mca/base/mca_base_components_open.c index 4947929fc45..7963f973396 100644 --- a/opal/mca/base/mca_base_components_open.c +++ b/opal/mca/base/mca_base_components_open.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2022 Cisco Systems, Inc. 
All rights reserved
  * Copyright (c) 2011-2015 Los Alamos National Security, LLC.
  *                         All rights reserved.
  * Copyright (c) 2014      Hochschule Esslingen.  All rights reserved.
@@ -28,11 +28,13 @@
 #include <string.h>
 
 #include "opal/class/opal_list.h"
+#include "opal/class/opal_cstring.h"
 #include "opal/constants.h"
 #include "opal/mca/base/base.h"
 #include "opal/mca/mca.h"
 #include "opal/util/argv.h"
 #include "opal/util/output.h"
+#include "opal/util/show_help.h"
 
 /*
  * Local functions
@@ -46,6 +46,282 @@ struct mca_base_dummy_framework_list_item_t {
     mca_base_framework_t framework;
 };
 
+/*
+ * One "framework[/component]" entry of the show_load_errors list.
+ * component_name is NULL when the entry names a whole framework.
+ */
+typedef struct fc_pair {
+    opal_list_item_t li;
+    char *framework_name;
+    char *component_name;
+} fc_pair_t;
+
+OBJ_CLASS_DECLARATION(fc_pair_t);
+
+/* How the show_load_errors value is interpreted at query time */
+typedef enum {
+    SHOW_LOAD_ERRORS_ALL,
+    SHOW_LOAD_ERRORS_INCLUDE,
+    SHOW_LOAD_ERRORS_EXCLUDE,
+    SHOW_LOAD_ERRORS_NONE,
+} show_load_type_t;
+
+static show_load_type_t show_load_errors = SHOW_LOAD_ERRORS_ALL;
+static opal_list_t show_load_errors_include = {0};
+static opal_list_t show_load_errors_exclude = {0};
+
+
+static void fc_pair_constructor(struct fc_pair *obj)
+{
+    obj->framework_name = NULL;
+    obj->component_name = NULL;
+}
+
+static void fc_pair_destructor(struct fc_pair *obj)
+{
+    free(obj->framework_name);
+    obj->framework_name = NULL;
+    free(obj->component_name);
+    obj->component_name = NULL;
+}
+
+OBJ_CLASS_INSTANCE(fc_pair_t, opal_list_item_t,
+                   fc_pair_constructor,
+                   fc_pair_destructor);
+
+/*
+ * Parse a single "framework[/component]" token and append the
+ * resulting fc_pair_t to the given list.  The names are copied, so
+ * the caller keeps ownership of the token.  A token whose framework
+ * name is empty is silently skipped.
+ *
+ * Returns OPAL_SUCCESS, or an OPAL error code after a help message
+ * has been shown.
+ */
+static int show_load_errors_add_token(const char *token, opal_list_t *list)
+{
+    int ret = OPAL_SUCCESS;
+    fc_pair_t *fcp = NULL;
+
+    char **split = opal_argv_split(token, '/');
+    if (NULL == split) {
+        ret = OPAL_ERROR;
+        opal_show_help("help-mca-base.txt",
+                       "internal error during init", true,
+                       __func__, __FILE__, __LINE__,
+                       ret,
+                       "Failed to argv split opal_mca_base_component_show_load_errors value");
+        return ret;
+    }
+
+    int argc = opal_argv_count(split);
+    if (0 == argc) {
+        // This should never happen
+        ret = OPAL_ERROR;
+        opal_show_help("help-mca-base.txt",
+                       "internal error during init", true,
+                       __func__, __FILE__, __LINE__,
+                       ret,
+                       "Argv split resulted in 0 tokens");
+        goto out;
+    }
+
+    if ('\0' == split[0][0]) {
+        // Empty entry (e.g., consecutive commas); silently
+        // skip it
+        goto out;
+    }
+
+    if (argc > 2) {
+        ret = OPAL_ERR_BAD_PARAM;
+        opal_show_help("help-mca-base.txt",
+                       "show_load_errors: too many /", true,
+                       token);
+        goto out;
+    }
+
+    fcp = OBJ_NEW(fc_pair_t);
+    if (NULL != fcp) {
+        // Copy the names so that the whole argv can be freed below
+        fcp->framework_name = strdup(split[0]);
+        if (2 == argc) {
+            fcp->component_name = strdup(split[1]);
+        }
+    }
+    if (NULL == fcp || NULL == fcp->framework_name
+        || (2 == argc && NULL == fcp->component_name)) {
+        ret = OPAL_ERR_OUT_OF_RESOURCE;
+        opal_show_help("help-mca-base.txt",
+                       "internal error during init", true,
+                       __func__, __FILE__, __LINE__,
+                       ret,
+                       "Failed to alloc new fc_pair_t");
+        if (NULL != fcp) {
+            // The destructor frees whichever name was allocated
+            OBJ_RELEASE(fcp);
+        }
+        goto out;
+    }
+
+    opal_list_append(list, &fcp->li);
+
+out:
+    opal_argv_free(split);
+    return ret;
+}
+
+/*
+ * Parse the content of the show_load_errors value
+ *
+ * Valid values:
+ * - a boolean value (true means "all", false means "none")
+ * - "all"
+ * - "none"
+ * - comma-delimited list of items, each of which is
+ *   "framework[/component]"
+ *
+ * The comma-delimited list may be prefixed with a "^".
+ *
+ * Returns OPAL_SUCCESS, or an OPAL error code after a help message
+ * has been shown.
+ */
+int mca_base_show_load_errors_init(void)
+{
+    OBJ_CONSTRUCT(&show_load_errors_include, opal_list_t);
+    OBJ_CONSTRUCT(&show_load_errors_exclude, opal_list_t);
+
+    // Check to see if mca_base_component_show_load_errors is a
+    // boolean value
+    opal_cstring_t *cstr = opal_cstring_create(mca_base_component_show_load_errors);
+    if (NULL == cstr) {
+        int ret = OPAL_ERROR;
+        opal_show_help("help-mca-base.txt",
+                       "internal error during init", true,
+                       __func__, __FILE__, __LINE__,
+                       ret,
+                       "Failed to create opal_cstring");
+        return ret;
+    }
+    bool value;
+    int ret = opal_cstring_to_bool(cstr, &value);
+    OBJ_RELEASE(cstr);
+
+    if (OPAL_SUCCESS == ret) {
+        // Treat true values as a synonym for "all", and false values
+        // as a synonym for "none".  This is mainly for backwards
+        // compatibility with Open MPI <= v4.x, where
+        // mca_base_component_show_load_errors was a boolean value.
+        show_load_errors = value ? SHOW_LOAD_ERRORS_ALL : SHOW_LOAD_ERRORS_NONE;
+        return OPAL_SUCCESS;
+    }
+    if (strcasecmp(mca_base_component_show_load_errors, "all") == 0) {
+        show_load_errors = SHOW_LOAD_ERRORS_ALL;
+        return OPAL_SUCCESS;
+    }
+    if (strcasecmp(mca_base_component_show_load_errors, "none") == 0) {
+        show_load_errors = SHOW_LOAD_ERRORS_NONE;
+        return OPAL_SUCCESS;
+    }
+
+    // We have a comma-delimited list of values.  Is it
+    // "include"-style, or "exclude" style?
+    size_t pos = 0;
+    opal_list_t *list = &show_load_errors_include;
+    show_load_errors = SHOW_LOAD_ERRORS_INCLUDE;
+    if (mca_base_component_show_load_errors[0] == '^') {
+        pos = 1;
+        list = &show_load_errors_exclude;
+        show_load_errors = SHOW_LOAD_ERRORS_EXCLUDE;
+    }
+
+    // Examine each of the values in the comma-delimited list.
+    // Each value can be of the form "framework" or
+    // "framework/component".
+    char **values = opal_argv_split(mca_base_component_show_load_errors + pos,
+                                    ',');
+    if (NULL == values) {
+        ret = OPAL_ERROR;
+        opal_show_help("help-mca-base.txt",
+                       "internal error during init", true,
+                       __func__, __FILE__, __LINE__,
+                       ret,
+                       "Failed to argv split opal_mca_base_component_show_load_errors");
+        return ret;
+    }
+
+    ret = OPAL_SUCCESS;
+    for (int i = 0; OPAL_SUCCESS == ret && NULL != values[i]; ++i) {
+        ret = show_load_errors_add_token(values[i], list);
+    }
+
+    // Free the token array on the success and error paths alike
+    opal_argv_free(values);
+    return ret;
+}
+
+
+/*
+ * Report whether a load failure of the given framework/component
+ * should be shown, according to the value parsed by
+ * mca_base_show_load_errors_init().
+ */
+bool mca_base_show_load_errors(const char *framework_name,
+                               const char *component_name)
+{
+    if (SHOW_LOAD_ERRORS_ALL == show_load_errors) {
+        return true;
+    } else if (SHOW_LOAD_ERRORS_NONE == show_load_errors) {
+        return false;
+    }
+
+    // If we get here, it means we have an include or exclude list.
+    // Setup for what to do based on whether it's an include or
+    // exclude list.
+    opal_list_t *list;
+    bool value_if_match_found;
+
+    if (SHOW_LOAD_ERRORS_INCLUDE == show_load_errors) {
+        list = &show_load_errors_include;
+        value_if_match_found = true;
+    } else {
+        list = &show_load_errors_exclude;
+        value_if_match_found = false;
+    }
+
+    // See if the framework_name/component_name pair is found in the
+    // active list.
+    fc_pair_t *item;
+    OPAL_LIST_FOREACH(item, list, fc_pair_t) {
+        if (strcmp(framework_name, item->framework_name) == 0) {
+            if (NULL == item->component_name) {
+                // If there's no component name, then we're matching
+                // all components in this framework.
+                return value_if_match_found;
+            } else if (strcmp(component_name, item->component_name) == 0) {
+                // We matched both the framework *and* component name.
+                return value_if_match_found;
+            }
+        }
+    }
+
+    // We didn't find a match.
+    return !value_if_match_found;
+}
+
+/*
+ * Release the parsed include / exclude lists, including the
+ * fc_pair_t entries that they hold.
+ */
+int mca_base_show_load_errors_finalize(void)
+{
+    // OPAL_LIST_DESTRUCT releases every item still on the list
+    // before it destructs the list itself.
+    OPAL_LIST_DESTRUCT(&show_load_errors_include);
+    OPAL_LIST_DESTRUCT(&show_load_errors_exclude);
+
+    return OPAL_SUCCESS;
+}
+
 /**
  * Function for finding and opening either all MCA components, or the
  * one that was specifically requested via a MCA parameter.
@@ -134,7 +412,8 @@ static int open_components(mca_base_framework_t *framework)
                        display the error in the stream where it was
                        expected. */
 
-                    if (mca_base_component_show_load_errors) {
+                    if (mca_base_show_load_errors(component->mca_type_name,
+                                                  component->mca_component_name)) {
                         opal_output_verbose(MCA_BASE_VERBOSE_ERROR, output_id,
                                             "mca: base: components_open: component %s "
                                             "/ %s open function failed",
diff --git a/opal/mca/base/mca_base_components_register.c b/opal/mca/base/mca_base_components_register.c
index eb40c7e5197..18bf74ccec5 100644
--- a/opal/mca/base/mca_base_components_register.c
+++ b/opal/mca/base/mca_base_components_register.c
@@ -10,7 +10,7 @@
  *                         University of Stuttgart.  All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2012 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved * Copyright (c) 2011-2015 Los Alamos National Security, LLC. * All rights reserved. * $COPYRIGHT$ @@ -115,7 +115,8 @@ static int register_components(mca_base_framework_t *framework) display the error in the stream where it was expected. */ - if (mca_base_component_show_load_errors) { + if (mca_base_show_load_errors(component->mca_type_name, + component->mca_component_name)) { opal_output_verbose(MCA_BASE_VERBOSE_ERROR, output_id, "mca: base: components_register: component %s " "/ %s register function failed", diff --git a/opal/mca/base/mca_base_open.c b/opal/mca/base/mca_base_open.c index 8389ba76c60..32f7890a781 100644 --- a/opal/mca/base/mca_base_open.c +++ b/opal/mca/base/mca_base_open.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2011-2017 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2011-2022 Cisco Systems, Inc. All rights reserved * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. 
@@ -52,7 +52,7 @@ char *mca_base_component_path = NULL; int mca_base_opened = 0; char *mca_base_system_default_path = NULL; char *mca_base_user_default_path = NULL; -bool mca_base_component_show_load_errors = (bool) OPAL_SHOW_LOAD_ERRORS_DEFAULT; +char *mca_base_component_show_load_errors = NULL; bool mca_base_component_track_load_errors = false; bool mca_base_component_disable_dlopen = false; @@ -104,15 +104,24 @@ int mca_base_open(void) MCA_BASE_VAR_SYN_FLAG_DEPRECATED); free(value); - mca_base_component_show_load_errors = (bool) OPAL_SHOW_LOAD_ERRORS_DEFAULT; + mca_base_component_show_load_errors = OPAL_SHOW_LOAD_ERRORS_DEFAULT; var_id - = mca_base_var_register("opal", "mca", "base", "component_show_load_errors", - "Whether to show errors for components that failed to load or not", - MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &mca_base_component_show_load_errors); + = mca_base_var_register("opal", "mca", "base", + "component_show_load_errors", + "Whether to show warnings for components that fail to load or not. Valid values are \"all\" (meaning: all load failures are reported), \"none\" (no load failures are reported), or a comma-delimited list of items, each of which can be a framework/component pair or a framework name (only load failures from the specifically-listed items are reported). 
If the comma-delimited list is prefixed with \"^\", then orientation of the list is negated: warn about all load failures *except* for the listed items.", + MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_base_component_show_load_errors); (void) mca_base_var_register_synonym(var_id, "opal", "mca", NULL, "component_show_load_errors", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); + // Parse the mca_base_component_show_load_errors value + int ret = mca_base_show_load_errors_init(); + if (OPAL_SUCCESS != ret) { + return ret; + } + mca_base_component_track_load_errors = false; var_id = mca_base_var_register("opal", "mca", "base", "component_track_load_errors",