From dd54af94508dc9ccee3e589276a9ede62fc8e409 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 16 Nov 2020 20:24:16 +0100 Subject: [PATCH 1/4] coll/base: Fix collective module selection preference treatment The selectable list is sorted with lowest to highest priority so the user-defined preferences should be appended to the list. The preference treatment should also maintain the order provided by the user (first item has highest priority) so switch the loop order. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/base/coll_base_comm_select.c | 30 ++++++++++------------ 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/ompi/mca/coll/base/coll_base_comm_select.c b/ompi/mca/coll/base/coll_base_comm_select.c index 906e34c47ef..38574bdbfc8 100644 --- a/ompi/mca/coll/base/coll_base_comm_select.c +++ b/ompi/mca/coll/base/coll_base_comm_select.c @@ -337,6 +337,7 @@ static opal_list_t *check_components(opal_list_t * components, ompi_communicator_t * comm) { int priority, flag; + int count_include = 0; const mca_base_component_t *component; mca_base_component_list_item_t *cli; mca_coll_base_module_2_3_0_t *module; @@ -363,7 +364,8 @@ static opal_list_t *check_components(opal_list_t * components, if(NULL == coll_argv) { goto proceed_to_select; } - int idx2, count_include = opal_argv_count(coll_argv); + int idx2; + count_include = opal_argv_count(coll_argv); /* Allocate the coll_include argv */ coll_include = (char**)malloc((count_include + 1) * sizeof(char*)); coll_include[count_include] = NULL; /* NULL terminated array */ @@ -385,15 +387,6 @@ static opal_list_t *check_components(opal_list_t * components, } coll_include[idx] = coll_argv[idx]; } - /* Reverse the order of the coll_inclide argv to faciliate the ordering of - * the selected components reverse. - */ - for( idx2 = 0; idx2 < (count_include - 1); idx2++ ) { - char* temp = coll_include[idx2]; - coll_include[idx2] = coll_include[count_include - 1]; - coll_include[count_include - 1] = temp; - count_include--; - } } proceed_to_select: /* Make a list of the components that query successfully */ @@ -453,14 +446,17 @@ static opal_list_t *check_components(opal_list_t * components, /* For all valid component reorder them not on their provided priorities but on * the order requested in the info key. As at this point the coll_include is - * already ordered backward we can simply prepend the components. + * already ordered backward we can simply append the components. + * Note that the last element in selectable will have the highest priorty. */ - mca_coll_base_avail_coll_t *item, *item_next; - OPAL_LIST_FOREACH_SAFE(item, item_next, - selectable, mca_coll_base_avail_coll_t) { - if( component_in_argv(coll_include, item->ac_component_name) ) { - opal_list_remove_item(selectable, &item->super); - opal_list_prepend(selectable, &item->super); + for (int idx = count_include-1; idx >= 0; --idx) { + mca_coll_base_avail_coll_t *item; + OPAL_LIST_FOREACH(item, selectable, mca_coll_base_avail_coll_t) { + if (0 == strcmp(item->ac_component_name, coll_include[idx])) { + opal_list_remove_item(selectable, &item->super); + opal_list_append(selectable, &item->super); + break; + } } } From 09c2f4af9437accd747e823c591927481c2103ad Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 19 Nov 2020 19:10:09 +0100 Subject: [PATCH 2/4] coll/[sm|han|adapt]: don't disqualify on priority 0 Signed-off-by: Joseph Schuchart --- ompi/mca/coll/adapt/coll_adapt_module.c | 6 +++--- ompi/mca/coll/han/coll_han_module.c | 2 +- ompi/mca/coll/sm/coll_sm_module.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ompi/mca/coll/adapt/coll_adapt_module.c b/ompi/mca/coll/adapt/coll_adapt_module.c index fd8c448f20c..54d295294ae 100644 --- a/ompi/mca/coll/adapt/coll_adapt_module.c +++ b/ompi/mca/coll/adapt/coll_adapt_module.c @@ -3,9 +3,9 @@ * of Tennessee Research Foundation. All rights * reserved. * $COPYRIGHT$ - * + * * Additional copyrights may follow - * + * * $HEADER$ */ @@ -146,7 +146,7 @@ mca_coll_base_module_t *ompi_coll_adapt_comm_query(struct ompi_communicator_t * /* Get the priority level attached to this module. If priority is less than or equal to 0, then the module is unavailable. */ *priority = mca_coll_adapt_component.adapt_priority; - if (mca_coll_adapt_component.adapt_priority <= 0) { + if (mca_coll_adapt_component.adapt_priority < 0) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:adapt:comm_query (%d/%s): priority too low; " "disqualifying myself", diff --git a/ompi/mca/coll/han/coll_han_module.c b/ompi/mca/coll/han/coll_han_module.c index 1a3a7e5c667..441ef58ad4b 100644 --- a/ompi/mca/coll/han/coll_han_module.c +++ b/ompi/mca/coll/han/coll_han_module.c @@ -188,7 +188,7 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority) /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. */ *priority = mca_coll_han_component.han_priority; - if (mca_coll_han_component.han_priority <= 0) { + if (mca_coll_han_component.han_priority < 0) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:han:comm_query (%d/%s): priority too low; disqualifying myself", comm->c_contextid, comm->c_name); diff --git a/ompi/mca/coll/sm/coll_sm_module.c b/ompi/mca/coll/sm/coll_sm_module.c index 25e9c779467..b8d02b7a4e8 100644 --- a/ompi/mca/coll/sm/coll_sm_module.c +++ b/ompi/mca/coll/sm/coll_sm_module.c @@ -182,10 +182,10 @@ mca_coll_sm_comm_query(struct ompi_communicator_t *comm, int *priority) /* Get the priority level attached to this module. If priority is less * than or equal to 0, then the module is unavailable. */ *priority = mca_coll_sm_component.sm_priority; - if (mca_coll_sm_component.sm_priority <= 0) { + if (mca_coll_sm_component.sm_priority < 0) { opal_output_verbose(10, ompi_coll_base_framework.framework_output, "coll:sm:comm_query (%d/%s): priority too low; disqualifying myself", comm->c_contextid, comm->c_name); - return NULL; + return NULL; } sm_module = OBJ_NEW(mca_coll_sm_module_t); From 971d58c52454a6edecdbb1a44ebd037a86e69a69 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 19 Nov 2020 19:22:06 +0100 Subject: [PATCH 3/4] coll/han: remove references to experimental solo and shared collective components Also make coll/tuned the default for shared memory communication as coll/sm has shown performance issues that need investigation. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/han/coll_han_component.c | 13 ++++++------- ompi/mca/coll/han/coll_han_dynamic.h | 1 - ompi/mca/coll/han/coll_han_subcomms.c | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index ef55a6ac99d..44f46a8874c 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -39,7 +39,6 @@ ompi_coll_han_components available_components[COMPONENTS_COUNT] = { { LIBNBC, "libnbc", NULL }, { TUNED, "tuned", NULL }, { SM, "sm", NULL }, - { SHARED, "shared", NULL }, { ADAPT, "adapt", NULL }, { HAN, "han", NULL } }; @@ -179,7 +178,7 @@ static int han_register(void) cs->han_bcast_low_module = 0; (void) mca_base_component_var_register(c, "bcast_low_module", - "low level module for bcast, 0 sm, 1 solo", + "low level module for bcast, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_low_module); @@ -200,7 +199,7 @@ static int han_register(void) cs->han_reduce_low_module = 0; (void) mca_base_component_var_register(c, "reduce_low_module", - "low level module for allreduce, 0 sm, 1 shared", + "low level module for allreduce, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); @@ -220,7 +219,7 @@ static int han_register(void) cs->han_allreduce_low_module = 0; (void) mca_base_component_var_register(c, "allreduce_low_module", - "low level module for allreduce, 0 sm, 1 shared", + "low level module for allreduce, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allreduce_low_module); @@ -234,7 +233,7 @@ static int han_register(void) cs->han_allgather_low_module = 0; (void) mca_base_component_var_register(c, "allgather_low_module", - "low level module for allgather, 0 sm, 1 shared", + "low level module for allgather, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_allgather_low_module); @@ -248,7 +247,7 @@ static int han_register(void) cs->han_gather_low_module = 0; (void) mca_base_component_var_register(c, "gather_low_module", - "low level module for gather, 0 sm, 1 shared", + "low level module for gather, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_gather_low_module); @@ -262,7 +261,7 @@ static int han_register(void) cs->han_scatter_low_module = 0; (void) mca_base_component_var_register(c, "scatter_low_module", - "low level module for scatter, 0 sm, 1 shared", + "low level module for scatter, 0 tuned, 1 sm", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_scatter_low_module); diff --git a/ompi/mca/coll/han/coll_han_dynamic.h b/ompi/mca/coll/han/coll_han_dynamic.h index 0ccecb63ba3..f95fdae6180 100644 --- a/ompi/mca/coll/han/coll_han_dynamic.h +++ b/ompi/mca/coll/han/coll_han_dynamic.h @@ -102,7 +102,6 @@ typedef enum COMPONENTS { LIBNBC, TUNED, SM, - SHARED, ADAPT, HAN, COMPONENTS_COUNT diff --git a/ompi/mca/coll/han/coll_han_subcomms.c b/ompi/mca/coll/han/coll_han_subcomms.c index bf5b4df523b..65a444757ef 100644 --- a/ompi/mca/coll/han/coll_han_subcomms.c +++ b/ompi/mca/coll/han/coll_han_subcomms.c @@ -258,7 +258,7 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade sm module priority to set up low_comms[0] with sm module * This sub-communicator contains the ranks that share my node. */ - opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "tuned,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[0])); @@ -272,7 +272,7 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm, * Upgrade shared module priority to set up low_comms[1] with shared module * This sub-communicator contains the ranks that share my node. */ - opal_info_set(&comm_info, "ompi_comm_coll_preference", "shared,^han"); + opal_info_set(&comm_info, "ompi_comm_coll_preference", "sm,^han"); ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, &comm_info, &(low_comms[1])); From 1cdc85564ed6c771f301c63d6bc6d8c1c8cf4a4c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 19 Nov 2020 19:23:32 +0100 Subject: [PATCH 4/4] coll/han: reduce default segment size for reduce/allreduce to 64k This has shown to be more effective in achieving overlap of inter- and intra-node communication and reduces the inital delay before hitting the network. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/han/coll_han_component.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/coll/han/coll_han_component.c b/ompi/mca/coll/han/coll_han_component.c index 44f46a8874c..f45f4de0441 100644 --- a/ompi/mca/coll/han/coll_han_component.c +++ b/ompi/mca/coll/han/coll_han_component.c @@ -183,7 +183,7 @@ static int han_register(void) OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_bcast_low_module); - cs->han_reduce_segsize = 524288; + cs->han_reduce_segsize = 65536; (void) mca_base_component_var_register(c, "reduce_segsize", "segment size for reduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, @@ -203,7 +203,7 @@ static int han_register(void) MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &cs->han_reduce_low_module); - cs->han_allreduce_segsize = 524288; + cs->han_allreduce_segsize = 65536; (void) mca_base_component_var_register(c, "allreduce_segsize", "segment size for allreduce", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,