Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 38 additions & 16 deletions ompi/instance/instance.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* reserved.
* Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2026 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -586,11 +587,16 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
fence_release,
(void*)&active))) {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active);
if (PMIX_SUCCESS != rc) {
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
}
}
}
} else {
Expand All @@ -602,12 +608,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if( PMIX_SUCCESS != rc) {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence() failed", ret);
if (PMIX_SUCCESS != rc) {
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence() failed", ret);
}
} else {
/* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
/* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

Expand Down Expand Up @@ -748,7 +761,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
if (active) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
} else if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
Expand All @@ -757,12 +772,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
fence_release, (void*)&active))) {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence_nb() failed", ret);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if (PMIX_SUCCESS != rc) {
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
return ompi_instance_print_error ("PMIx_Fence() failed", ret);
}
} else {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

Expand Down
20 changes: 16 additions & 4 deletions ompi/runtime/ompi_mpi_finalize.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* reserved.
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* Copyright (c) 2026 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -281,14 +282,25 @@ int ompi_mpi_finalize(void)
* communications/actions to complete. See
* https://github.com/open-mpi/ompi/issues/1576 for the
* original bug report. */
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active))) {
ret = opal_pmix_convert_status(rc);
OMPI_ERROR_LOG(ret);
rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_cbfunc, (void*)&active);
if (PMIX_SUCCESS != rc) {
/* Reset the active flag to false, to avoid waiting for
* completion when the fence was failed. */
active = false;
// can return operation_succeeded if atomically completed
if (PMIX_OPERATION_SUCCEEDED == rc) {
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
OMPI_ERROR_LOG(ret);
}
} else {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
/* NOTE: we lose the fence return status here. This can be
* a problem as the fence CAN fail. Might consider retrieving
* the returned status so you can respond if it doesn't
* successfully complete? */
}
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}

ompi_mpi_instance_finalize (&ompi_mpi_instance_default);
Expand Down
59 changes: 40 additions & 19 deletions ompi/runtime/ompi_mpi_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
* Copyright (c) 2018 FUJITSU LIMITED. All rights reserved.
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
* All Rights reserved.
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2026 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Expand Down Expand Up @@ -464,12 +464,17 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
fence_release,
(void*)&active))) {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
rc = PMIx_Fence_nb(NULL, 0, NULL, 0, fence_release, (void*)&active);
if (PMIX_SUCCESS != rc) {
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
}
}
}
} else {
Expand All @@ -482,12 +487,19 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if( PMIX_SUCCESS != rc) {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence() failed";
goto error;
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
}
} else {
/* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
/* cannot just wait on thread as we need to call opal_progress */
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

Expand Down Expand Up @@ -537,7 +549,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
* we have to wait here for it to complete. However, there
* is no reason to do two barriers! */
if (background_fence) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
if (active) {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
} else if (!ompi_async_mpi_init) {
/* wait for everyone to reach this point - this is a hard
* barrier requirement at this time, though we hope to relax
Expand All @@ -546,13 +560,20 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
active = true;
OPAL_POST_OBJECT(&active);
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
fence_release, (void*)&active))) {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
if (PMIX_SUCCESS != rc) {
active = false;
if (PMIX_OPERATION_SUCCEEDED == rc) {
// can return operation_succeeded if atomically completed
ret = MPI_SUCCESS;
} else {
ret = opal_pmix_convert_status(rc);
error = "PMIx_Fence_nb() failed";
goto error;
}
} else {
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
}
}

Expand Down