Cuda rebase #6

Open. Wants to merge 68 commits into base: main.

Commits (68):
7a03ffb  Add GPU packing and unpacking (rolfv, Nov 7, 2014)
55badab  indexed datatype new, bonus stask support. (eddy16112, Nov 14, 2014)
ef41551  RDMA send is now working. (eddy16112, Apr 9, 2015)
9bbed91  Add support for vector datatype. Add pipeline. (eddy16112, Apr 22, 2015)
fe03183  unrestricted GPU. Instead of forcing everything to go on (bosilca, May 7, 2015)
26f2237  Using globally defined indexes lead to several synchronization (bosilca, Jun 18, 2015)
63e10df  Generate the Makefile. It will now be placed in the bindir (bosilca, Jun 18, 2015)
cf7e185  Add a patch from Rolf fixing 2 issues: (bosilca, Jun 30, 2015)
fd91bb9  big changes, now pack is driven by receiver by active message (eddy16112, Aug 22, 2015)
6d837d1  intel test working (eddy16112, Aug 31, 2015)
10a6932  Upon datatype commit create a list of iovec representing a single (bosilca, Sep 15, 2015)
44a64e1  contiguous vs non-contiguous is working (eddy16112, Sep 17, 2015)
f0e8bff  now we are able to pack directly to remote buffer if receiver is (eddy16112, Sep 18, 2015)
cf95914  add ddt_benchmark (eddy16112, Sep 29, 2015)
7a4a10d  modify for matrix transpose (eddy16112, Oct 2, 2015)
bc80b3e  enable vector (eddy16112, Oct 2, 2015)
4d6ebb3  receiver now will send msg back to sender for buffer reuse (eddy16112, Oct 6, 2015)
3525419  offset instead of actual address, and lots of clean up for unused (eddy16112, Oct 22, 2015)
9b9f783  re-write pipeline (eddy16112, Oct 25, 2015)
e77302e  opal_datatype is changed, so we need more space (Oct 27, 2015)
b57f1e5  remove smcuda btl calls from pml ob1 (eddy16112, Oct 28, 2015)
7cdf09e  cuda ddt support is able to turn itself off. Make it support (eddy16112, Oct 29, 2015)
e3c6d36  move ddt kernel support function pointer into opal_datatype_cuda.c (eddy16112, Nov 4, 2015)
66d30fb  support caching datatype (bosilca, Nov 7, 2015)
4e3c5d6  apply loop unroll for pack and unpack kernels (eddy16112, Feb 5, 2016)
f77c382  fix a cuda event bug. cudaStreamWaitEvent is not a blocking call. (eddy16112, Feb 23, 2016)
5a97315  new vector kernel (eddy16112, Feb 26, 2016)
8868914  fix a if CUDA_41 error (eddy16112, Feb 26, 2016)
99f03b3  use cuda event to track the completion of pack and unpack (eddy16112, Mar 1, 2016)
e214c53  make openib support multi-stream (eddy16112, Mar 11, 2016)
9a22660  create a btl function to register convertor to registration handle, n… (eddy16112, Apr 6, 2016)
9b7d28a  fix a bug: we should also track the completion of unpack operation, b… (eddy16112, Apr 14, 2016)
e56bd4f  use multiple cuda stream for P2P, it allows multiple send/recv workin… (eddy16112, Jul 13, 2016)
acc3647  fix the renaming issue after merge (eddy16112, Aug 8, 2016)
dcbd75e  fix a GPU memory leak issue. (eddy16112, Aug 9, 2016)
612d77e  put ompi_datatype_t back to 512 byte, clean up printf and unused func… (eddy16112, Aug 30, 2016)
489cc8d  convertor should be async (eddy16112, Sep 9, 2016)
0017991  revert the ddt_test, will have a separate cuda test later (eddy16112, Sep 17, 2016)
9b9d5b0  set the default of ddt pipeline size to 4M (eddy16112, Sep 21, 2016)
545091b  bug fix, set gpu buffer to NULL when init (eddy16112, Sep 23, 2016)
052c1ce  fix configuration and silence warnings of datatype padding (eddy16112, Oct 7, 2016)
ac184dc  minor fix in makefile (eddy16112, Oct 8, 2016)
5819a55  more fixes in makefile (eddy16112, Oct 9, 2016)
b0a3000  clean up printf (eddy16112, Oct 10, 2016)
e016bcb  disable ddt cuda test (eddy16112, Oct 12, 2016)
8b85c3d  roll back to not use multiple ipc streams (eddy16112, Oct 13, 2016)
39ec7ae  remove unused functions (eddy16112, Oct 19, 2016)
f7e0c5f  more cleanup (eddy16112, Oct 19, 2016)
b2b69e5  add a printf (eddy16112, Oct 19, 2016)
ce34259  A lot of minor changes. (bosilca, Oct 20, 2016)
ad4198d  minor fix to make it work again (eddy16112, Oct 20, 2016)
42c5a4d  add comment for opal_datatype_cuda.cuh, cached device id, remove comm… (eddy16112, Oct 21, 2016)
5c2ce9f  this function is no longer needed (eddy16112, Oct 21, 2016)
99ffb84  merge pack unpack put sig function into one (eddy16112, Oct 21, 2016)
66ac26a  recheck the opal_datatype_cuda_kernel_support (eddy16112, Oct 21, 2016)
b47a5cd  OPAL_DATATYPE_IOV_UNIFIED_MEM is no longer needed (eddy16112, Oct 21, 2016)
023f489  clean up cu files (eddy16112, Oct 21, 2016)
f00ab75  Remove useless #define. Other minor cleanups. (bosilca, Oct 21, 2016)
d4a48d1  Do not modify the PMIx files (there should be no need). (bosilca, Oct 21, 2016)
2006b86  use OPAL_VERBOSE instead of my own DEBUG print (eddy16112, Oct 24, 2016)
269067c  Small updates. (bosilca, Oct 24, 2016)
12f5f83  clean up comments and remove unused define (eddy16112, Oct 25, 2016)
d30cc73  remove NB_GPUS; use mca to set cuda buffer size; use mca to set cuda … (eddy16112, Oct 26, 2016)
fcc9ccb  add some protection for the case there is no mem for pack/unpack (eddy16112, Oct 28, 2016)
1d91384  clean up testing (eddy16112, Oct 31, 2016)
308da38  use convertor->stream to set the stream pack/unpack works on, now we … (eddy16112, Nov 1, 2016)
36d117b  disable cuda ddt test to pass make check (eddy16112, Nov 1, 2016)
7063d19  mca_cuda_convertor_init is called in MPI_Init if using pre-connect, s… (eddy16112, Nov 15, 2016)
16 changes: 16 additions & 0 deletions config/opal_check_cuda.m4
@@ -55,6 +55,8 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"],
AC_MSG_ERROR([Cannot continue])],
[AC_MSG_RESULT([found])
opal_check_cuda_happy=yes
opal_cuda_prefix=/usr/local/
opal_cuda_libdir=/usr/local/cuda/lib64
opal_cuda_incdir=/usr/local/cuda/include])],
[AS_IF([test ! -d "$with_cuda"],
[AC_MSG_RESULT([not found])
@@ -66,10 +68,14 @@ AS_IF([test "$with_cuda" = "no" || test "x$with_cuda" = "x"],
AC_MSG_WARN([Could not find cuda.h in $with_cuda/include or $with_cuda])
AC_MSG_ERROR([Cannot continue])],
[opal_check_cuda_happy=yes
opal_cuda_prefix=$with_cuda
opal_cuda_incdir=$with_cuda
opal_cuda_libdir="$with_cuda/lib64"
AC_MSG_RESULT([found ($with_cuda/cuda.h)])])],
[opal_check_cuda_happy=yes
opal_cuda_prefix="$with_cuda"
opal_cuda_incdir="$with_cuda/include"
opal_cuda_libdir="$with_cuda/lib64"
AC_MSG_RESULT([found ($opal_cuda_incdir/cuda.h)])])])])])

dnl We cannot have CUDA support without dlopen support. HOWEVER, at
@@ -119,6 +125,8 @@ if test "$opal_check_cuda_happy" = "yes"; then
CUDA_SUPPORT=1
opal_datatype_cuda_CPPFLAGS="-I$opal_cuda_incdir"
AC_SUBST([opal_datatype_cuda_CPPFLAGS])
opal_datatype_cuda_LDFLAGS="-L$opal_cuda_libdir"
AC_SUBST([opal_datatype_cuda_LDFLAGS])
else
AC_MSG_RESULT([no])
CUDA_SUPPORT=0
@@ -144,6 +152,14 @@ AM_CONDITIONAL([OPAL_cuda_gdr_support], [test "x$CUDA_VERSION_60_OR_GREATER" = "
AC_DEFINE_UNQUOTED([OPAL_CUDA_GDR_SUPPORT],$CUDA_VERSION_60_OR_GREATER,
[Whether we have CUDA GDR support available])

# Checking for nvcc
AC_MSG_CHECKING([nvcc in $opal_cuda_prefix/bin])
if test -x "$opal_cuda_prefix/bin/nvcc"; then
AC_MSG_RESULT([found])
AC_DEFINE_UNQUOTED([NVCC], ["$opal_cuda_prefix/bin/nvcc"], [Path to nvcc binary])
fi

AC_SUBST([NVCC],[$opal_cuda_prefix/bin/nvcc])
])

dnl
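
For reference, the net effect of this fragment is three new configure-time outputs: opal_datatype_cuda_LDFLAGS pointing at $opal_cuda_libdir, plus an nvcc path that is both AC_DEFINE'd and AC_SUBST'd (the substitution feeds the generated opal/datatype/cuda Makefile). A sketch of what lands in opal_config.h, assuming --with-cuda=/usr/local/cuda; the actual values follow the configure arguments:

    /* Illustrative opal_config.h output, assuming --with-cuda=/usr/local/cuda
     * (a sketch, not verbatim configure output). */
    #define NVCC "/usr/local/cuda/bin/nvcc"   /* Path to nvcc binary */
    #define OPAL_CUDA_GDR_SUPPORT 1           /* CUDA >= 6.0 detected at configure time */
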
4 changes: 4 additions & 0 deletions configure.ac
@@ -1416,6 +1416,10 @@ m4_ifdef([project_oshmem],

opal_show_subtitle "Final output"

if test "$OPAL_cuda_support" != "0"; then
AC_CONFIG_FILES([opal/datatype/cuda/Makefile])
fi

AC_CONFIG_FILES([
Makefile

9 changes: 9 additions & 0 deletions ompi/mca/bml/bml.h
@@ -361,6 +361,15 @@ static inline void mca_bml_base_deregister_mem (mca_bml_base_btl_t* bml_btl, mca
btl->btl_deregister_mem (btl, handle);
}

static inline void mca_bml_base_register_convertor (mca_bml_base_btl_t* bml_btl, mca_btl_base_registration_handle_t *handle, opal_convertor_t *convertor)
{
mca_btl_base_module_t* btl = bml_btl->btl;

if (btl->btl_register_convertor != NULL) {
btl->btl_register_convertor (btl, handle, convertor);
}
}

/*
* BML component interface functions and datatype.
*/
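
The NULL guard in the wrapper above means BTLs that do not implement the new hook are unaffected. As a minimal sketch of the BTL side, assuming only the (btl, handle, convertor) signature that this PR establishes; the table and function below are hypothetical, not the actual smcuda implementation:

    /* Hypothetical BTL-side hook: remember which convertor packs into the
     * memory behind a given registration handle, so the receiver-driven
     * pack/unpack path can find it when an RDMA GET on that handle completes. */
    #define EXAMPLE_REG_SLOTS 16

    static struct {
        mca_btl_base_registration_handle_t *handle;
        opal_convertor_t *convertor;
    } example_reg_table[EXAMPLE_REG_SLOTS];

    static void example_btl_register_convertor(struct mca_btl_base_module_t *btl,
                                               mca_btl_base_registration_handle_t *handle,
                                               opal_convertor_t *convertor)
    {
        (void) btl;  /* a real BTL would use its module state, not a global table */
        for (int i = 0; i < EXAMPLE_REG_SLOTS; i++) {
            if (NULL == example_reg_table[i].handle ||
                handle == example_reg_table[i].handle) {
                example_reg_table[i].handle    = handle;    /* claim or refresh slot */
                example_reg_table[i].convertor = convertor;
                return;
            }
        }
        /* Out of slots: a real implementation would grow the table or error out. */
    }
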
2 changes: 1 addition & 1 deletion ompi/mca/pml/ob1/pml_ob1_component.c
@@ -184,7 +184,7 @@ static int mca_pml_ob1_component_register(void)
mca_pml_ob1_param_register_int("free_list_max", -1, &mca_pml_ob1.free_list_max);
mca_pml_ob1_param_register_int("free_list_inc", 64, &mca_pml_ob1.free_list_inc);
mca_pml_ob1_param_register_int("priority", 20, &mca_pml_ob1.priority);
mca_pml_ob1_param_register_sizet("send_pipeline_depth", 3, &mca_pml_ob1.send_pipeline_depth);
mca_pml_ob1_param_register_sizet("send_pipeline_depth", 4, &mca_pml_ob1.send_pipeline_depth);
mca_pml_ob1_param_register_sizet("recv_pipeline_depth", 4, &mca_pml_ob1.recv_pipeline_depth);

/* NTH: we can get into a live-lock situation in the RDMA failure path so disable
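
The send_pipeline_depth default moves from 3 to 4, matching recv_pipeline_depth; later in this PR it also becomes convertor->pipeline_depth for the CUDA copy in/out path, where the GPU staging buffer is sized as pipeline_size * pipeline_depth. A standalone restatement of that sizing rule (the helper name is hypothetical; the logic mirrors the pml_ob1_cuda.c hunk below):

    #include <stddef.h>

    /* Hypothetical helper mirroring the staging-buffer sizing in pml_ob1_cuda.c:
     * a message larger than one pipeline chunk gets pipeline_depth chunks of GPU
     * staging space; a smaller message gets exactly its packed size. */
    static size_t cuda_staging_size(size_t local_size, size_t pipeline_size,
                                    size_t pipeline_depth)
    {
        if (local_size > pipeline_size) {
            return pipeline_size * pipeline_depth;  /* e.g. 4 chunks in flight */
        }
        return local_size;
    }
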
167 changes: 157 additions & 10 deletions ompi/mca/pml/ob1/pml_ob1_cuda.c
@@ -37,11 +37,22 @@
#include "ompi/mca/bml/base/base.h"
#include "ompi/memchecker.h"

#include "opal/datatype/opal_datatype_cuda.h"
#include "opal/mca/common/cuda/common_cuda.h"

size_t mca_pml_ob1_rdma_cuda_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
size_t size,
mca_pml_ob1_com_btl_t* rdma_btls);

int mca_pml_ob1_rdma_cuda_btl_register_data(
mca_bml_base_endpoint_t* bml_endpoint,
mca_pml_ob1_com_btl_t* rdma_btls,
uint32_t num_btls_used,
struct opal_convertor_t *pack_convertor);

size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint);

int mca_pml_ob1_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl);
@@ -54,18 +65,21 @@ void mca_pml_ob1_cuda_add_ipc_support(struct mca_btl_base_module_t* btl, int32_t
*/
int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
mca_bml_base_btl_t* bml_btl,
size_t size) {
size_t size)
{
struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor);
int rc;
#if OPAL_CUDA_GDR_SUPPORT
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) &&
(sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */

sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA;

if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) {
#if OPAL_CUDA_GDR_SUPPORT
/* With some BTLs, switch to RNDV from RGET at large messages */
if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) {
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;
return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}
#endif /* OPAL_CUDA_GDR_SUPPORT */
unsigned char *base;
opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base );
/* Set flag back */
@@ -75,6 +89,14 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {

rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint,
sendreq->req_rdma, sendreq->req_rdma_cnt,
convertor);
if (rc != 0) {
OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
return rc;
}
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);
if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
@@ -91,14 +113,90 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq,
} else {
/* Do not send anything with first rendezvous message as copying GPU
* memory into RNDV message is expensive. */
unsigned char *base;
size_t buffer_size = 0;
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA;

/* cuda kernel support is not enabled */
if (opal_datatype_cuda_kernel_support == 0) {
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
return rc;
}
/* cuda kernel support is enabled */
if ((bml_btl->btl->btl_cuda_ddt_allow_rdma == 1) &&
(mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0)) {

if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) {
buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth;
} else {
buffer_size = convertor->local_size;
}
base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
if (NULL == base) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
convertor->gpu_buffer_ptr = base;
convertor->gpu_buffer_size = buffer_size;
sendreq->req_send.req_bytes_packed = convertor->local_size;
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output,
"RDMA malloc GPU BUFFER %p for pack, local size %lu, "
"pipeline size %lu, depth %d\n",
base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size,
bml_btl->btl->btl_cuda_ddt_pipeline_depth));
if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls(
sendreq->req_endpoint,
base,
sendreq->req_send.req_bytes_packed,
sendreq->req_rdma))) {

rc = mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_endpoint,
sendreq->req_rdma, sendreq->req_rdma_cnt,
convertor);
if (rc != 0) {
OPAL_OUTPUT_VERBOSE((0, mca_common_cuda_output, "Failed to register convertor, rc= %d\n", rc));
return rc;
}
convertor->flags |= CONVERTOR_CUDA_ASYNC;
rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl,
sendreq->req_send.req_bytes_packed);

if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) {
mca_pml_ob1_free_rdma_resources(sendreq);
}
return rc; /* ready to return */
} else {
/* We failed to use the last GPU buffer, release it and realloc it with the new size */
opal_cuda_free_gpu_buffer(base, 0);
}
}
/* In all other cases fall back to the copy in/out protocol */
if (bml_btl->btl->btl_cuda_max_send_size != 0) {
convertor->pipeline_size = bml_btl->btl->btl_cuda_max_send_size;
} else {
convertor->pipeline_size = bml_btl->btl->btl_max_send_size;
}
convertor->pipeline_depth = mca_pml_ob1.send_pipeline_depth;
if (convertor->local_size > convertor->pipeline_size) {
buffer_size = convertor->pipeline_size * convertor->pipeline_depth;
} else {
buffer_size = convertor->local_size;
}
base = opal_cuda_malloc_gpu_buffer(buffer_size, 0);
if (NULL == base) {
return OPAL_ERR_OUT_OF_RESOURCE;
}
OPAL_OUTPUT_VERBOSE((OPAL_DATATYPE_CUDA_VERBOSE_LEVEL, mca_common_cuda_output,
"Copy in/out malloc GPU buffer %p, pipeline_size %ld\n",
base, convertor->pipeline_size));
convertor->gpu_buffer_ptr = base;
convertor->gpu_buffer_size = buffer_size;
convertor->pipeline_seq = 0;
rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0);
}

return rc;
}

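/*
 * Aside (not part of the patch): the function above now picks one of four
 * protocols. The enum and helper below are a hypothetical condensation added
 * for illustration; the predicates are the ones used in the code above.
 */
typedef enum {
    CUDA_PROTO_RGET_DIRECT, /* contiguous GPU buffer: GET straight from user memory */
    CUDA_PROTO_RNDV_PLAIN,  /* no CUDA kernel support: ordinary rendezvous */
    CUDA_PROTO_RGET_PACKED, /* pack on the GPU into a staging buffer, then GET */
    CUDA_PROTO_COPY_INOUT   /* pipelined copy in/out through a GPU staging buffer */
} cuda_proto_t;

static inline cuda_proto_t choose_cuda_proto(int need_buffers, int kernel_support,
                                             int ddt_allow_rdma, int rdma_btls_avail)
{
    if (!need_buffers)   return CUDA_PROTO_RGET_DIRECT;
    if (!kernel_support) return CUDA_PROTO_RNDV_PLAIN;
    if (ddt_allow_rdma && rdma_btls_avail > 0) {
        return CUDA_PROTO_RGET_PACKED; /* falls back to COPY_INOUT when no BTL
                                        * accepts the staging buffer */
    }
    return CUDA_PROTO_COPY_INOUT;
}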


size_t mca_pml_ob1_rdma_cuda_btls(
mca_bml_base_endpoint_t* bml_endpoint,
unsigned char* base,
@@ -152,6 +250,55 @@ size_t mca_pml_ob1_rdma_cuda_btls(
return num_btls_used;
}

int mca_pml_ob1_rdma_cuda_btl_register_data(
mca_bml_base_endpoint_t* bml_endpoint,
mca_pml_ob1_com_btl_t* rdma_btls,
uint32_t num_btls_used,
struct opal_convertor_t *pack_convertor)
{
uint32_t i;
for (i = 0; i < num_btls_used; i++) {
mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg;
mca_bml_base_btl_t* bml_btl = mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, i);
mca_bml_base_register_convertor(bml_btl, handle, pack_convertor);
}
return 0;
}

/* return how many BTLs can have RDMA support */
size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint)
{
int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send);
double weight_total = 0;
int num_btls_used = 0, n;

/* shortcut when there are no rdma capable btls */
if(num_btls == 0) {
return 0;
}

/* check if GET is supported by the BTL */
for(n = 0;
(n < num_btls) && (num_btls_used < mca_pml_ob1.max_rdma_per_request);
n++) {
mca_bml_base_btl_t* bml_btl =
mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n);

if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) {
weight_total += bml_btl->btl_weight;
num_btls_used++;
}
}

/* if we don't use leave_pinned and all BTLs that already have this memory
 * registered amount to less than half of available bandwidth - fall back to
 * pipeline protocol */
if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5))
return 0;

return num_btls_used;
}

int mca_pml_ob1_cuda_need_buffers(void * rreq,
mca_btl_base_module_t* btl)
{