openucx · vspetrov · Jun 21, 2022 · May 31, 2022 · May 31, 2022 · Jun 7, 2022
diff --git a/config/m4/cuda.m4 b/config/m4/cuda.m4
@@ -84,6 +84,8 @@ AS_IF([test "x$cuda_checked" != "xyes"],
                                            [])],
                              [],
                              [[#include <nvml.h>]])])
+        AC_CHECK_SIZEOF(cuFloatComplex,,[#include <cuComplex.h>])
+        AC_CHECK_SIZEOF(cuDoubleComplex,,[#include <cuComplex.h>])
 
          # Check for NVCC
          AC_ARG_VAR(NVCC, [NVCC compiler command])

diff --git a/configure.ac b/configure.ac
@@ -1,5 +1,5 @@
 #
-# Copyright (C) Mellanox Technologies Ltd. 2001-2021.  ALL RIGHTS RESERVED.
+# Copyright (C) Mellanox Technologies Ltd. 2001-2022.  ALL RIGHTS RESERVED.
 # This software product is a proprietary product of Mellanox Technologies Ltd.
 # (the "Company") and all right, title, and interest and to the software product,
 # including all associated intellectual property rights, are and shall
@@ -77,6 +77,12 @@ AC_PROG_LIBTOOL
 AC_HEADER_STDC
 CFLAGS="$CFLAGS_save"
 
+AC_CHECK_SIZEOF(float)
+AC_CHECK_SIZEOF(double)
+AC_CHECK_SIZEOF(long double)
+AC_CHECK_SIZEOF(float _Complex)
+AC_CHECK_SIZEOF(double _Complex)
+AC_CHECK_SIZEOF(long double _Complex)
 #
 # Check if 'ln' supports creating relative links
 #

diff --git a/src/components/mc/cpu/Makefile.am b/src/components/mc/cpu/Makefile.am
@@ -1,25 +1,33 @@
 #
-# Copyright (C) Mellanox Technologies Ltd. 2020-2021.  ALL RIGHTS RESERVED.
+# Copyright (C) Mellanox Technologies Ltd. 2020-2022.  ALL RIGHTS RESERVED.
 #
 
-sources =                                           \
-	mc_cpu.h                                    \
-	mc_cpu.c                                    \
-	reduce/mc_cpu_reduce.h                      \
-	reduce/mc_cpu_reduce_int8.c                 \
-	reduce/mc_cpu_reduce_int16.c                \
-	reduce/mc_cpu_reduce_int32.c                \
-	reduce/mc_cpu_reduce_int64.c                \
-	reduce/mc_cpu_reduce_uint8.c                \
-	reduce/mc_cpu_reduce_uint16.c               \
-	reduce/mc_cpu_reduce_uint32.c               \
-	reduce/mc_cpu_reduce_uint64.c               \
-	reduce/mc_cpu_reduce_float.c                \
-	reduce/mc_cpu_reduce_bfloat16.c             \
-	reduce/mc_cpu_reduce_double.c               \
-	reduce_alpha/mc_cpu_reduce_alpha_float.c    \
-	reduce_alpha/mc_cpu_reduce_alpha_bfloat16.c \
-	reduce_alpha/mc_cpu_reduce_alpha_double.c
+sources =                                              \
+	mc_cpu.h                                           \
+	mc_cpu.c                                           \
+	reduce/mc_cpu_reduce.h                             \
+	reduce/mc_cpu_reduce_int8.c                        \
+	reduce/mc_cpu_reduce_int16.c                       \
+	reduce/mc_cpu_reduce_int32.c                       \
+	reduce/mc_cpu_reduce_int64.c                       \
+	reduce/mc_cpu_reduce_uint8.c                       \
+	reduce/mc_cpu_reduce_uint16.c                      \
+	reduce/mc_cpu_reduce_uint32.c                      \
+	reduce/mc_cpu_reduce_uint64.c                      \
+	reduce/mc_cpu_reduce_float.c                       \
+	reduce/mc_cpu_reduce_bfloat16.c                    \
+	reduce/mc_cpu_reduce_double.c                      \
+	reduce/mc_cpu_reduce_long_double.c                 \
+	reduce/mc_cpu_reduce_float_complex.c               \
+	reduce/mc_cpu_reduce_double_complex.c              \
+	reduce/mc_cpu_reduce_long_double_complex.c         \
+	reduce_alpha/mc_cpu_reduce_alpha_float.c           \
+	reduce_alpha/mc_cpu_reduce_alpha_bfloat16.c        \
+	reduce_alpha/mc_cpu_reduce_alpha_double.c          \
+	reduce_alpha/mc_cpu_reduce_alpha_long.c            \
+	reduce_alpha/mc_cpu_reduce_alpha_float_complex.c   \
+	reduce_alpha/mc_cpu_reduce_alpha_double_complex.c  \
+	reduce_alpha/mc_cpu_reduce_alpha_long_complex.c
 
 module_LTLIBRARIES        = libucc_mc_cpu.la
 libucc_mc_cpu_la_SOURCES  = $(sources)

diff --git a/src/components/mc/cpu/mc_cpu.c b/src/components/mc/cpu/mc_cpu.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (C) Mellanox Technologies Ltd. 2020-2021.  ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2020-2022.  ALL RIGHTS RESERVED.
  *
  * See file LICENSE for terms.
  */
@@ -189,16 +189,50 @@ static ucc_status_t ucc_mc_cpu_reduce_multi(const void *src1, const void *src2,
         return ucc_mc_cpu_reduce_multi_uint64(src1, src2, dst, n_vectors,
                                               count, stride, op);
     case UCC_DT_FLOAT32:
-        ucc_assert(4 == sizeof(float));
-        return ucc_mc_cpu_reduce_multi_float(src1, src2, dst, n_vectors,
-                                             count, stride, op);
+#if SIZEOF_FLOAT == 4
+        return ucc_mc_cpu_reduce_multi_float(src1, src2, dst, n_vectors, count,
+                                             stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     case UCC_DT_FLOAT64:
-        ucc_assert(8 == sizeof(double));
+#if SIZEOF_DOUBLE == 8
         return ucc_mc_cpu_reduce_multi_double(src1, src2, dst, n_vectors,
                                               count, stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT128:
+#if SIZEOF_LONG_DOUBLE == 16
+        return ucc_mc_cpu_reduce_multi_long_double(src1, src2, dst, n_vectors,
+                                                   count, stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     case UCC_DT_BFLOAT16:
         return ucc_mc_cpu_reduce_multi_bfloat16(src1, src2, dst, n_vectors,
                                                 count, stride, op);
+    case UCC_DT_FLOAT32_COMPLEX:
+#if SIZEOF_FLOAT__COMPLEX == 8
+        return ucc_mc_cpu_reduce_multi_float_complex(src1, src2, dst, n_vectors,
+                                                     count, stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT64_COMPLEX:
+#if SIZEOF_DOUBLE__COMPLEX == 16
+        return ucc_mc_cpu_reduce_multi_double_complex(
+            src1, src2, dst, n_vectors, count, stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT128_COMPLEX:
+#if SIZEOF_LONG_DOUBLE__COMPLEX == 32
+        return ucc_mc_cpu_reduce_multi_long_double_complex(
+            src1, src2, dst, n_vectors, count, stride, op);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     default:
         mc_error(&ucc_mc_cpu.super, "unsupported reduction type (%s)",
                  ucc_datatype_str(dt));
@@ -222,19 +256,57 @@ ucc_mc_cpu_reduce_multi_alpha(const void *src1, const void *src2, void *dst,
 {
     switch (dt) {
     case UCC_DT_FLOAT32:
-        ucc_assert(4 == sizeof(float));
+#if SIZEOF_FLOAT == 4
         return ucc_mc_cpu_reduce_multi_alpha_float(src1, src2, dst, n_vectors,
                                                    count, stride, reduce_op,
                                                    vector_op, (float)alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     case UCC_DT_FLOAT64:
-        ucc_assert(8 == sizeof(double));
+#if SIZEOF_DOUBLE == 8
         return ucc_mc_cpu_reduce_multi_alpha_double(src1, src2, dst, n_vectors,
                                                     count, stride, reduce_op,
                                                     vector_op, alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT128:
+#if SIZEOF_LONG_DOUBLE == 16
+        return ucc_mc_cpu_reduce_multi_alpha_long(
+            src1, src2, dst, n_vectors, count, stride, reduce_op, vector_op,
+            (long double)alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     case UCC_DT_BFLOAT16:
         return ucc_mc_cpu_reduce_multi_alpha_bfloat16(src1, src2, dst, n_vectors,
                                                       count, stride, reduce_op,
                                                       vector_op, (float)alpha);
+    case UCC_DT_FLOAT32_COMPLEX:
+#if SIZEOF_FLOAT__COMPLEX == 8
+        return ucc_mc_cpu_reduce_multi_alpha_float_complex(
+            src1, src2, dst, n_vectors, count, stride, reduce_op, vector_op,
+            (float)alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT64_COMPLEX:
+#if SIZEOF_DOUBLE__COMPLEX == 16
+        return ucc_mc_cpu_reduce_multi_alpha_double_complex(
+            src1, src2, dst, n_vectors, count, stride, reduce_op, vector_op,
+            (double)alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
+    case UCC_DT_FLOAT128_COMPLEX:
+#if SIZEOF_LONG_DOUBLE__COMPLEX == 32
+        return ucc_mc_cpu_reduce_multi_alpha_long_complex(
+            src1, src2, dst, n_vectors, count, stride, reduce_op, vector_op,
+            (long double)alpha);
+#else
+        return UCC_ERR_NOT_SUPPORTED;
+#endif
     default:
         mc_error(&ucc_mc_cpu.super, "unsupported reduction type (%s)",
                  ucc_datatype_str(dt));

diff --git a/src/components/mc/cpu/reduce/mc_cpu_reduce.h b/src/components/mc/cpu/reduce/mc_cpu_reduce.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (C) Mellanox Technologies Ltd. 2020-2021.  ALL RIGHTS RESERVED.
+ * Copyright (C) Mellanox Technologies Ltd. 2020-2022.  ALL RIGHTS RESERVED.
  *
  * See file LICENSE for terms.
  */
@@ -8,6 +8,8 @@
 #define UCC_MC_CPU_REDUCE_H_
 
 #include "utils/ucc_math.h"
+#include <complex.h>
+
 #define OP_1(_s1, _s2, _i, _sc, _OP) _OP(_s1[_i], _s2[_i])
 #define OP_2(_s1, _s2, _i, _sc, _OP)                                           \
     _OP((OP_1(_s1, _s2, _i, _sc, _OP)), _s2[_i + 1 * _sc])
@@ -163,6 +165,32 @@
         }                                                                      \
     } while (0)
 
+#define DO_DT_REDUCE_FLOAT_COMPLEX(type, reduce_op, src1_p, src2_p, dest_p,    \
+                                   size, count, stride)                        \
+    do { /* dispatches SUM, AVG and PROD; any other op returns an error */     \
+        const type *restrict s1 = (const type *restrict)src1_p;                \
+        const type *restrict s2 = (const type *restrict)src2_p;                \
+        type *restrict       d  = (type * restrict) dest_p;                    \
+        ucc_assert((ptrdiff_t)d <= (ptrdiff_t)src2_p || /* d vs src2 range */  \
+                   (ptrdiff_t)d > (ptrdiff_t)src2_p + (size - 1) * stride +    \
+                                      count * sizeof(type));                   \
+        switch (reduce_op) {                                                   \
+        case UCC_OP_SUM:                                                       \
+        case UCC_OP_AVG: /* AVG is reduced exactly like SUM in this macro */   \
+            DO_DT_REDUCE_WITH_OP(s1, s2, d, size, count, stride, DO_OP_SUM);   \
+            break;                                                             \
+        case UCC_OP_PROD:                                                      \
+            DO_DT_REDUCE_WITH_OP(s1, s2, d, size, count, stride, DO_OP_PROD);  \
+            break;                                                             \
+        default:                                                               \
+            mc_error(&ucc_mc_cpu.super,                                        \
+                     "float complex dtype does not support "                   \
+                     "requested reduce op: %s",                                \
+                     ucc_reduction_op_str(reduce_op));                         \
+            return UCC_ERR_NOT_SUPPORTED; /* leaves the enclosing function */  \
+        }                                                                      \
+    } while (0)
+
 #define VEC_OP(_d, OP)                                                         \
     do {                                                                       \
         size_t _i;                                                             \
@@ -201,18 +229,23 @@ REDUCE_FN_DECLARE(uint32);
 REDUCE_FN_DECLARE(uint64);
 REDUCE_FN_DECLARE(float);
 REDUCE_FN_DECLARE(double);
+REDUCE_FN_DECLARE(long_double);
 REDUCE_FN_DECLARE(bfloat16);
+REDUCE_FN_DECLARE(float_complex);
+REDUCE_FN_DECLARE(double_complex);
+REDUCE_FN_DECLARE(long_double_complex);
 
-#define REDUCE_ALPHA_FN_DECLARE(_type)                                         \
+#define REDUCE_ALPHA_FN_DECLARE(_type, alpha_dt)                               \
     ucc_status_t ucc_mc_cpu_reduce_multi_alpha_##_type(                        \
         const void *src1, const void *src2, void *dst, size_t n_vectors,       \
         size_t count, size_t stride, ucc_reduction_op_t reduce_op,             \
-        ucc_reduction_op_t vector_op, _type alpha)
-REDUCE_ALPHA_FN_DECLARE(float);
-REDUCE_ALPHA_FN_DECLARE(double);
-ucc_status_t
-ucc_mc_cpu_reduce_multi_alpha_bfloat16(const void *src1, const void *src2,
-                                       void *dst, size_t n_vectors, size_t count,
-                                       size_t stride, ucc_reduction_op_t reduce_op,
-                                       ucc_reduction_op_t vector_op, float alpha);
+        ucc_reduction_op_t vector_op, alpha_dt alpha)
+REDUCE_ALPHA_FN_DECLARE(float, float);
+REDUCE_ALPHA_FN_DECLARE(double, double);
+REDUCE_ALPHA_FN_DECLARE(long, long double);
+REDUCE_ALPHA_FN_DECLARE(bfloat16, float);
+REDUCE_ALPHA_FN_DECLARE(float_complex, float);
+REDUCE_ALPHA_FN_DECLARE(double_complex, double);
+REDUCE_ALPHA_FN_DECLARE(long_complex, long double);
+
 #endif
diff --git a/src/components/mc/cpu/reduce/mc_cpu_reduce_double_complex.c b/src/components/mc/cpu/reduce/mc_cpu_reduce_double_complex.c
@@ -0,0 +1,19 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2022.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "mc_cpu.h"
+#include "reduce/mc_cpu_reduce.h"
+/* Reduces src1/src2 into dst, treating every buffer as double complex. */
+ucc_status_t ucc_mc_cpu_reduce_multi_double_complex(const void *src1,
+                                                    const void *src2, void *dst,
+                                                    size_t n_vectors,
+                                                    size_t count, size_t stride,
+                                                    ucc_reduction_op_t op)
+{
+    DO_DT_REDUCE_FLOAT_COMPLEX(double complex, op, src1, src2, dst, n_vectors,
+                               count, stride); /* early-returns on bad op */
+    return UCC_OK; /* reached only for SUM, AVG and PROD */
+}
diff --git a/src/components/mc/cpu/reduce/mc_cpu_reduce_float_complex.c b/src/components/mc/cpu/reduce/mc_cpu_reduce_float_complex.c
@@ -0,0 +1,19 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2022.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "mc_cpu.h"
+#include "reduce/mc_cpu_reduce.h"
+/* Reduces src1/src2 into dst, treating every buffer as float complex. */
+ucc_status_t ucc_mc_cpu_reduce_multi_float_complex(const void *src1,
+                                                   const void *src2, void *dst,
+                                                   size_t n_vectors,
+                                                   size_t count, size_t stride,
+                                                   ucc_reduction_op_t op)
+{
+    DO_DT_REDUCE_FLOAT_COMPLEX(float complex, op, src1, src2, dst, n_vectors,
+                               count, stride); /* early-returns on bad op */
+    return UCC_OK; /* reached only for SUM, AVG and PROD */
+}
diff --git a/src/components/mc/cpu/reduce/mc_cpu_reduce_long_double.c b/src/components/mc/cpu/reduce/mc_cpu_reduce_long_double.c
@@ -0,0 +1,19 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2022.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "mc_cpu.h"
+#include "reduce/mc_cpu_reduce.h"
+/* Reduces src1/src2 into dst as long double data via DO_DT_REDUCE_FLOAT. */
+ucc_status_t ucc_mc_cpu_reduce_multi_long_double(const void *src1,
+                                                 const void *src2, void *dst,
+                                                 size_t n_vectors, size_t count,
+                                                 size_t             stride,
+                                                 ucc_reduction_op_t op)
+{
+    DO_DT_REDUCE_FLOAT(long double, op, src1, src2, dst, n_vectors, count,
+                       stride); /* NOTE(review): supported ops set by macro */
+    return UCC_OK; /* presumably skipped if the macro returns - confirm */
+}
diff --git a/src/components/mc/cpu/reduce/mc_cpu_reduce_long_double_complex.c b/src/components/mc/cpu/reduce/mc_cpu_reduce_long_double_complex.c
@@ -0,0 +1,17 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2022.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "mc_cpu.h"
+#include "reduce/mc_cpu_reduce.h"
+/* Reduces src1/src2 into dst, treating every buffer as long double complex. */
+ucc_status_t ucc_mc_cpu_reduce_multi_long_double_complex(
+    const void *src1, const void *src2, void *dst, size_t n_vectors,
+    size_t count, size_t stride, ucc_reduction_op_t op)
+{
+    DO_DT_REDUCE_FLOAT_COMPLEX(long double complex, op, src1, src2, dst,
+                               n_vectors, count, stride); /* may return early */
+    return UCC_OK; /* reached only for SUM, AVG and PROD */
+}
diff --git a/src/components/mc/cpu/reduce_alpha/mc_cpu_reduce_alpha_double_complex.c b/src/components/mc/cpu/reduce_alpha/mc_cpu_reduce_alpha_double_complex.c
@@ -0,0 +1,19 @@
+/**
+ * Copyright (C) Mellanox Technologies Ltd. 2020-2022.  ALL RIGHTS RESERVED.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "mc_cpu.h"
+#include "reduce/mc_cpu_reduce.h"
+/* Complex reduce (double complex) into dst, followed by DO_VEC_OP on dst. */
+ucc_status_t ucc_mc_cpu_reduce_multi_alpha_double_complex(
+    const void *src1, const void *src2, void *dst, size_t n_vectors,
+    size_t count, size_t stride, ucc_reduction_op_t reduce_op,
+    ucc_reduction_op_t vector_op, double alpha)
+{
+    DO_DT_REDUCE_FLOAT_COMPLEX(double complex, reduce_op, src1, src2, dst,
+                               n_vectors, count, stride); /* may return early */
+    DO_VEC_OP(double complex, dst); /* NOTE(review): uses alpha? confirm */
+    return UCC_OK;
+}