Use RTC for elementwise and broadcast ops (#18622)
* Reapplying PR #17767

* Making RTC required

* Move cuda utils to src/common/cuda and refactor RTC part

* Unary ops via RTC

* Support binary_scalar forward

Remove elemwise_scatter_op.*

Fix BinaryScalar usage in NumPy

* Backward of binary scalar

* Binary forward

* Fix for binary_scalar

* Moving all binary forward to RTC

Reorganization

* Backward of binary ops

* Support broadcast

Add RTC to NumPy ops

* RTC for elementwise sum

Fixes

* RTC for backward UseNone of broadcast

* RTC for broadcast bwd UseIn

* Remove non-RTC vectorization support

* Remove template from ReduceWorkspaceSize

* Fixes from rebase

* Guarding RTC usage behind MXNET_USE_CUDA

* More guards

* C++17 for CUDA code

* MixedUnaryBackwardInOut as RTC

* Removing unused variable

* Revert "C++17 for CUDA code"

This reverts commit b09090c.

* Get rid of CI tests without RTC
Get rid of if constexpr as CUDA 10 does not support it

* Fix lint

* Change a few more elemwise functions
Fix for too long value

* Fix large tensor build

* Another try with DBL_MAX

* Fix Windows compilation

* Fix the large int test

* Add the printing of error code value to CUDA_DRIVER_CALL

* Fix

* Fix binary scalar

* Get more information when cuLaunchKernel fails

* Going easy on Windows compiler

* Fix lint

* Reorganization to split strings due to Windows compilation problems

* Fix error with uninitialized value

* Fix handling of different types for backward of binary scalar

* Decreasing RTC overhead

* Fix lint and remove rest of mentions of ENABLE_RTC

* Jetson with RTC

* Fix the aws s3 command

* Debugging Windows failure

* More debugging of Windows failure

* Debug

* Fix the issue on Windows (long -> long long for 8B)

* libcuda.so for Jetson

* Enable debug information for RTC kernels and cleaning debug ptx dump

* Fix lint

* Try without linking the stub of libcuda.so to a different place on Jetson

* Add docstring

* Answering review comments

* Unifying vectorization

* Fix

* Fixes for reduce ops

* Fix M=1 case

* Fixes from rebase
Fixes for mixed type gradient functions
Set the launch bounds on RTC kernels

* Fix

* Fix tests

* Adding tutorial for RTC

* Fixes after merge

* Fixes from review

* Change env var doc and undo the change to toctree
ptrendx committed Aug 20, 2020
1 parent bbc39fa commit 29d6f27
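
For context on the mechanism this whole commit builds on: NVRTC compiles a CUDA C++ source string to PTX at runtime, and the CUDA driver API then loads that PTX and launches the kernel. Below is a minimal, self-contained sketch of that flow. The add_scalar kernel, the compile options, and the DRIVER_CALL macro are illustrative assumptions, not MXNet's generated code; DRIVER_CALL only mirrors the error-code printing added to CUDA_DRIVER_CALL above, and __launch_bounds__ mirrors the "Set the launch bounds on RTC kernels" item.

#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Hypothetical stand-in for MXNet's CUDA_DRIVER_CALL: check a driver-API
// result and print both the error name and the numeric code.
#define DRIVER_CALL(func)                                              \
  do {                                                                 \
    CUresult e = (func);                                               \
    if (e != CUDA_SUCCESS) {                                           \
      const char* name = nullptr;                                      \
      cuGetErrorName(e, &name);                                        \
      std::fprintf(stderr, "%s failed: %s (code %d)\n", #func,         \
                   name ? name : "unknown", static_cast<int>(e));      \
      std::exit(1);                                                    \
    }                                                                  \
  } while (0)

// Kernel source handed to NVRTC as a plain string. extern "C" avoids C++
// name mangling; __launch_bounds__ caps threads-per-block for the compiler.
const char* kSource = R"(
extern "C" __global__ void __launch_bounds__(256)
add_scalar(float* out, const float* in, float alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] + alpha;
})";

int main() {
  // 1. Runtime-compile the source string to PTX.
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kSource, "add_scalar.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--gpu-architecture=compute_52"};  // arch is illustrative
  if (nvrtcCompileProgram(prog, 1, opts) != NVRTC_SUCCESS) {
    size_t log_size = 0;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::vector<char> log(log_size);
    nvrtcGetProgramLog(prog, log.data());
    std::fprintf(stderr, "NVRTC log:\n%s\n", log.data());
    return 1;
  }
  size_t ptx_size = 0;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::vector<char> ptx(ptx_size);
  nvrtcGetPTX(prog, ptx.data());
  nvrtcDestroyProgram(&prog);

  // 2. Load the PTX with the driver API and look up the kernel by name.
  DRIVER_CALL(cuInit(0));
  CUdevice dev;
  CUcontext ctx;
  DRIVER_CALL(cuDeviceGet(&dev, 0));
  DRIVER_CALL(cuCtxCreate(&ctx, 0, dev));
  CUmodule mod;
  CUfunction fn;
  DRIVER_CALL(cuModuleLoadData(&mod, ptx.data()));
  DRIVER_CALL(cuModuleGetFunction(&fn, mod, "add_scalar"));

  // 3. Launch: cuLaunchKernel takes kernel parameters as an array of pointers.
  int n = 1024;
  CUdeviceptr in, out;
  DRIVER_CALL(cuMemAlloc(&in, n * sizeof(float)));
  DRIVER_CALL(cuMemAlloc(&out, n * sizeof(float)));
  float alpha = 1.5f;
  void* args[] = {&out, &in, &alpha, &n};
  DRIVER_CALL(cuLaunchKernel(fn, (n + 255) / 256, 1, 1,  // grid
                             256, 1, 1,                  // block
                             0, nullptr, args, nullptr));
  DRIVER_CALL(cuCtxSynchronize());

  cuMemFree(in); cuMemFree(out);
  cuModuleUnload(mod);
  cuCtxDestroy(ctx);
  return 0;
}

Compiling each op once and caching the resulting CUfunction keeps the per-call cost at a lookup plus cuLaunchKernel, which is presumably what the "Decreasing RTC overhead" item above targets.
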
Showing 141 changed files with 7,274 additions and 3,548 deletions.
48 changes: 0 additions & 48 deletions 3rdparty/mshadow/mshadow/base.h
@@ -272,7 +272,6 @@ extern "C" {
}

#include "./half.h"
#include "./half2.h"
#include "./bfloat.h"
#define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \
MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \
@@ -387,11 +386,6 @@ struct DataType<half::half_t> {
#endif
};
-template<>
-struct DataType<half::half2_t> {
-static const int kFlag = kFloat16;
-static const int kLanes = 2;
-};
template<>
struct DataType<bfloat::bf16_t> {
static const int kFlag = kBfloat16;
static const int kLanes = 1;
@@ -1144,48 +1138,6 @@ struct minimum {
}
#endif

-#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \
-switch (type) { \
-case mshadow::kFloat32: \
-{ \
-typedef float DType; \
-{__VA_ARGS__} \
-} \
-break; \
-case mshadow::kFloat64: \
-{ \
-typedef double DType; \
-{__VA_ARGS__} \
-} \
-break; \
-case mshadow::kFloat16: \
-{ \
-typedef mshadow::half::half2_t DType; \
-{__VA_ARGS__} \
-} \
-break; \
-case mshadow::kUint8: \
-{ \
-typedef uint8_t DType; \
-{__VA_ARGS__} \
-} \
-break; \
-case mshadow::kInt32: \
-{ \
-typedef int32_t DType; \
-{__VA_ARGS__} \
-} \
-break; \
-case mshadow::kInt64: \
-{ \
-typedef int64_t DType; \
-{__VA_ARGS__} \
-} \
-break; \
-default: \
-LOG(FATAL) << "Unknown type enum " << type; \
-}

#define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) \
switch (type) { \
case mshadow::kFloat32: \
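
The MSHADOW_TYPE_SWITCH_WITH_HALF2 macro deleted above follows mshadow's usual dispatch pattern: switch on the runtime dtype flag and expand the user-supplied body once per case, each time with DType bound to the matching C++ type (float16 mapping to the two-lane half2_t). Here is a self-contained miniature of that pattern, with hypothetical names; it is a sketch of the idiom, not mshadow itself:

#include <cstdint>
#include <cstdio>

// Toy dtype flags standing in for mshadow::kFloat32 etc.
enum TypeFlag { kFloat32 = 0, kFloat64 = 1, kInt32 = 4 };

// The body in __VA_ARGS__ is compiled once per case, each time with DType
// typedef'd to the matching C++ type -- the same trick the deleted macro used.
#define TYPE_SWITCH(type, DType, ...)                                  \
  switch (type) {                                                      \
    case kFloat32: { typedef float   DType; {__VA_ARGS__} } break;     \
    case kFloat64: { typedef double  DType; {__VA_ARGS__} } break;     \
    case kInt32:   { typedef int32_t DType; {__VA_ARGS__} } break;     \
    default: std::fprintf(stderr, "Unknown type enum %d\n", type);     \
  }

int main() {
  int dtype = kFloat64;  // known only at runtime
  TYPE_SWITCH(dtype, DType, {
    DType x = static_cast<DType>(2) / 3;
    std::printf("sizeof(DType)=%zu x=%f\n", sizeof(DType), static_cast<double>(x));
  });
  return 0;
}

With every fp16-capable GPU kernel now generated through RTC, the half2 lane type and this extra switch had no remaining users, hence the deletion.
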
162 changes: 0 additions & 162 deletions 3rdparty/mshadow/mshadow/half2.h

This file was deleted.
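
half2.h provided a two-lane fp16 type so a kernel could process a pair of half values per thread; runtime-compiled kernels now emit their own vectorized accesses, leaving the header dead code. For illustration only, the same two-per-thread idea written against CUDA's built-in __half2 (the kernel and sizes are assumptions; __hmul2 needs sm_53 or newer):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

__global__ void scale_half2(__half2* data, float alpha, int n2) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    __half2 a2 = __float2half2_rn(alpha);  // broadcast scalar to both lanes
    data[i] = __hmul2(data[i], a2);        // two fp16 multiplies per thread
  }
}

int main() {
  const int n2 = 512;  // 1024 fp16 values, processed two per thread
  __half2* d = nullptr;
  cudaMalloc(&d, n2 * sizeof(__half2));
  cudaMemset(d, 0, n2 * sizeof(__half2));
  scale_half2<<<(n2 + 255) / 256, 256>>>(d, 2.0f, n2);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}
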

16 changes: 4 additions & 12 deletions CMakeLists.txt
@@ -79,7 +79,6 @@ option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
option(USE_GPROF "Compile with gprof (profiling) flag" OFF)
option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
option(USE_TVM_OP "Enable use of TVM operator build system." OFF)
-option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON)
option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
option(INSTALL_EXAMPLES "Install the example source files." OFF)
option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
@@ -547,18 +546,11 @@ if(USE_CUDA)

string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}")

-find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand
-OPTIONAL_COMPONENTS nvToolsExt nvrtc)
+find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver
+OPTIONAL_COMPONENTS nvToolsExt)

-list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand)
-if(ENABLE_CUDA_RTC)
-if(CUDA_nvrtc_LIBRARY)
-list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda)
-add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
-else()
-message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." )
-endif()
-endif()
+list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand
+CUDA::nvrtc CUDA::cuda_driver)
list(APPEND SOURCE ${CUDA})
add_definitions(-DMXNET_USE_CUDA=1)

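
Note the build-system consequence visible above: NVRTC and the CUDA driver API go from optional (behind ENABLE_CUDA_RTC) to required link dependencies, so a CUDA build now always needs libnvrtc and at least the libcuda stub. A minimal sketch, assuming nothing about MXNet's own checks, of probing the driver at runtime — the kind of failure that surfaces on machines that have only the stub and no real driver:

#include <cuda.h>
#include <cstdio>

int main() {
  int version = 0;
  // cuDriverGetVersion succeeds even without a context; cuInit(0) is the
  // first call that actually requires a usable driver installation.
  if (cuDriverGetVersion(&version) != CUDA_SUCCESS || cuInit(0) != CUDA_SUCCESS) {
    std::printf("no usable CUDA driver\n");
    return 1;
  }
  std::printf("CUDA driver %d.%d\n", version / 1000, (version % 1000) / 10);
  return 0;
}
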
6 changes: 0 additions & 6 deletions ci/build_windows.py
@@ -61,7 +61,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -76,7 +75,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -91,7 +89,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -106,7 +103,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -121,7 +117,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
-'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -136,7 +131,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
-'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
22 changes: 0 additions & 22 deletions ci/docker/runtime_functions.sh
@@ -142,7 +142,6 @@ build_jetson() {
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="5.2" \
--DENABLE_CUDA_RTC=OFF \
-DUSE_OPENCV=OFF \
-DUSE_OPENMP=ON \
-DUSE_LAPACK=OFF \
@@ -670,27 +669,6 @@ build_ubuntu_gpu_cmake() {
ninja
}

-build_ubuntu_gpu_cmake_no_rtc() {
-set -ex
-cd /work/build
-CC=gcc-7 CXX=g++-7 cmake \
--DUSE_SIGNAL_HANDLER=ON \
--DUSE_CUDA=ON \
--DUSE_CUDNN=ON \
--DUSE_MKL_IF_AVAILABLE=OFF \
--DUSE_MKLML_MKL=OFF \
--DUSE_MKLDNN=ON \
--DUSE_DIST_KVSTORE=ON \
--DCMAKE_BUILD_TYPE=Release \
--DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
--DBUILD_CYTHON_MODULES=1 \
--DENABLE_CUDA_RTC=OFF \
--G Ninja \
-/work/mxnet
-
-ninja
-}

build_ubuntu_cpu_large_tensor() {
set -ex
cd /work/build
14 changes: 0 additions & 14 deletions ci/jenkins/Jenkins_steps.groovy
@@ -258,20 +258,6 @@ def compile_unix_cmake_gpu(lib_name) {
}]
}

-def compile_unix_cmake_gpu_no_rtc(lib_name) {
-return ['GPU: CMake CUDA RTC OFF': {
-node(NODE_LINUX_CPU) {
-ws('workspace/build-cmake-gpu-no-rtc') {
-timeout(time: max_time, unit: 'MINUTES') {
-utils.init_git()
-utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false)
-utils.pack_lib(lib_name, mx_cmake_lib)
-}
-}
-}
-}]
-}

def compile_unix_tensorrt_gpu(lib_name) {
return ['TensorRT': {
node(NODE_LINUX_CPU) {
1 change: 0 additions & 1 deletion ci/jenkins/Jenkinsfile_unix_gpu
@@ -41,7 +41,6 @@ core_logic: {
custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
custom_steps.compile_unix_int64_gpu('gpu_int64'),
-custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
])

utils.parallel_stage('Tests', [
1 change: 0 additions & 1 deletion config/darwin.cmake
@@ -126,5 +126,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
1 change: 0 additions & 1 deletion config/linux.cmake
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
1 change: 0 additions & 1 deletion config/linux_gpu.cmake
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
