diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index dc5a8181825..6901a46127e 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -104,44 +104,58 @@ AC_DEFUN([OMPI_CHECK_UCX],[ old_CPPFLAGS="$CPPFLAGS" AS_IF([test -n "$ompi_check_ucx_dir"], [CPPFLAGS="$CPPFLAGS -I$ompi_check_ucx_dir/include"]) - AC_CHECK_DECLS([ucp_tag_send_nbr], - [AC_DEFINE([HAVE_UCP_TAG_SEND_NBR],[1], - [have ucp_tag_send_nbr()])], [], - [#include ]) - AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb, - ucp_request_check_status, ucp_put_nb, ucp_get_nb, - ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx], - [], [], - [#include ]) - AC_CHECK_DECLS([ucm_test_events, - ucm_test_external_events], - [], [], - [#include ]) - AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND, - UCP_ATOMIC_POST_OP_OR, - UCP_ATOMIC_POST_OP_XOR, - UCP_ATOMIC_FETCH_OP_FAND, - UCP_ATOMIC_FETCH_OP_FOR, - UCP_ATOMIC_FETCH_OP_FXOR, - UCP_PARAM_FIELD_ESTIMATED_NUM_PPN], - [], [], - [#include ]) - AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], - [AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1], - [have worker address attribute])], [], - [#include ]) - AC_CHECK_DECLS([UCP_ATTR_FIELD_MEMORY_TYPES], - [AC_DEFINE([HAVE_UCP_ATTR_MEMORY_TYPES], [1], - [have memory types attribute])], [], - [#include ]) - AC_CHECK_DECLS([ucp_tag_send_nbx, - ucp_tag_send_sync_nbx, - ucp_tag_recv_nbx], - [], [], - [#include ]) - AC_CHECK_TYPES([ucp_request_param_t], - [], [], - [[#include ]]) + # Turn off UCX version v1.8 due to issue #8321 + AC_MSG_CHECKING([UCX version]) + AC_PREPROC_IFELSE([AC_LANG_PROGRAM([#include + #if (UCP_API_MAJOR == 1) && (UCP_API_MINOR == 8) + #error "Invalid version" + #endif], [])], + [AC_MSG_RESULT([ok (not 1.8.x)])], + [AC_MSG_RESULT([bad (1.8.x)]) + AC_MSG_WARN([UCX support skipped because version 1.8.x was found, which has a known catastrophic issue.]) + AC_MSG_WARN([Please upgrade to UCX version 1.9 or higher.]) + ompi_check_ucx_happy=no]) + AS_IF([test "$ompi_check_ucx_happy" = yes], + [ + AC_CHECK_DECLS([ucp_tag_send_nbr], + [AC_DEFINE([HAVE_UCP_TAG_SEND_NBR],[1], + [have ucp_tag_send_nbr()])], [], + [#include ]) + AC_CHECK_DECLS([ucp_ep_flush_nb, ucp_worker_flush_nb, + ucp_request_check_status, ucp_put_nb, ucp_get_nb, + ucp_put_nbx, ucp_get_nbx, ucp_atomic_op_nbx], + [], [], + [#include ]) + AC_CHECK_DECLS([ucm_test_events, + ucm_test_external_events], + [], [], + [#include ]) + AC_CHECK_DECLS([UCP_ATOMIC_POST_OP_AND, + UCP_ATOMIC_POST_OP_OR, + UCP_ATOMIC_POST_OP_XOR, + UCP_ATOMIC_FETCH_OP_FAND, + UCP_ATOMIC_FETCH_OP_FOR, + UCP_ATOMIC_FETCH_OP_FXOR, + UCP_PARAM_FIELD_ESTIMATED_NUM_PPN], + [], [], + [#include ]) + AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], + [AC_DEFINE([HAVE_UCP_WORKER_ADDRESS_FLAGS], [1], + [have worker address attribute])], [], + [#include ]) + AC_CHECK_DECLS([UCP_ATTR_FIELD_MEMORY_TYPES], + [AC_DEFINE([HAVE_UCP_ATTR_MEMORY_TYPES], [1], + [have memory types attribute])], [], + [#include ]) + AC_CHECK_DECLS([ucp_tag_send_nbx, + ucp_tag_send_sync_nbx, + ucp_tag_recv_nbx], + [], [], + [#include ]) + AC_CHECK_TYPES([ucp_request_param_t], + [], [], + [[#include ]]) + ]) CPPFLAGS=$old_CPPFLAGS OPAL_SUMMARY_ADD([[Transports]],[[Open UCX]],[$1],[$ompi_check_ucx_happy])])]) diff --git a/ompi/mca/pml/ucx/pml_ucx.c b/ompi/mca/pml/ucx/pml_ucx.c index 6be18579d99..fc46995d940 100644 --- a/ompi/mca/pml/ucx/pml_ucx.c +++ b/ompi/mca/pml/ucx/pml_ucx.c @@ -186,12 +186,23 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc, int mca_pml_ucx_open(void) { + unsigned major_version, minor_version, release_number; ucp_context_attr_t attr; ucp_params_t params; ucp_config_t *config; ucs_status_t status; - PML_UCX_VERBOSE(1, "mca_pml_ucx_open"); + /* Check version */ + ucp_get_version(&major_version, &minor_version, &release_number); + PML_UCX_VERBOSE(1, "mca_pml_ucx_open: UCX version %u.%u.%u", + major_version, minor_version, release_number); + + if ((major_version == 1) && (minor_version == 8)) { + /* disabled due to issue #8321 */ + PML_UCX_VERBOSE(1, "UCX PML is disabled because the run-time UCX version " + "is 1.8, which has a known catastrophic issue"); + return OMPI_ERROR; + } /* Read options */ status = ucp_config_read("MPI", NULL, &config); @@ -694,7 +705,7 @@ int mca_pml_ucx_isend_init(const void *buf, size_t count, ompi_datatype_t *datat } static ucs_status_ptr_t -mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count, +mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count, ompi_datatype_t *datatype, uint64_t pml_tag) { ompi_request_t *req; @@ -717,7 +728,7 @@ mca_pml_ucx_bsend(ucp_ep_h ep, const void *buf, size_t count, PML_UCX_ERROR("bsend: failed to allocate buffer"); return UCS_STATUS_PTR(OMPI_ERROR); } - + iov_count = 1; iov.iov_base = packed_data; iov.iov_len = packed_length; @@ -805,8 +816,8 @@ int mca_pml_ucx_isend(const void *buf, size_t count, ompi_datatype_t *datatype, ompi_request_t *req; ucp_ep_h ep; - PML_UCX_TRACE_SEND("i%ssend request *%p", - buf, count, datatype, dst, tag, mode, comm, + PML_UCX_TRACE_SEND("i%ssend request *%p", + buf, count, datatype, dst, tag, mode, comm, mode == MCA_PML_BASE_SEND_BUFFERED ? "b" : "", (void*)request)