This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Use RTC for elementwise and broadcast ops #18622

Merged 76 commits on Aug 20, 2020
Commits
98aceba
Reapplying PR #17767
ptrendx Feb 14, 2020
6b235fd
Making RTC required
ptrendx May 18, 2020
38f30f0
Move cuda utils to src/common/cuda and refactor RTC part
ptrendx May 19, 2020
5d01ad2
Unary ops via RTC
ptrendx May 22, 2020
c5aeeef
Support binary_scalar forward
ptrendx May 29, 2020
39f6fee
Backward of binary scalar
ptrendx Jun 1, 2020
fa732a5
Binary forward
ptrendx Jun 1, 2020
a2933f8
Fix for binary_scalar
ptrendx Jun 1, 2020
479c0f8
Moving all binary forward to RTC
ptrendx Jun 2, 2020
18f1c91
Backward of binary ops
ptrendx Jun 8, 2020
2e27675
Support broadcast
ptrendx Jun 9, 2020
3ea932f
RTC for elementwise sum
ptrendx Jun 11, 2020
9da292c
RTC for backward (UseNone) of broadcast
ptrendx Jun 15, 2020
d6074e9
RTC for broadcast backward (UseIn)
ptrendx Jun 18, 2020
8174ca2
Remove non-RTC vectorization support
ptrendx Jun 22, 2020
3a75411
Remove template from ReduceWorkspaceSize
ptrendx Jun 23, 2020
cd87eb2
Fixes from rebase
ptrendx Jun 23, 2020
e7ad72b
Guarding RTC usage behind MXNET_USE_CUDA
ptrendx Jun 26, 2020
d889bdc
More guards
ptrendx Jun 26, 2020
b09090c
C++17 for CUDA code
ptrendx Jun 26, 2020
7415264
MixedUnaryBackwardInOut as RTC
ptrendx Jun 26, 2020
49ee749
Removing unused variable
ptrendx Jun 26, 2020
c23fa6a
Revert "C++17 for CUDA code"
ptrendx Jun 26, 2020
e2065f2
Get rid of CI tests without RTC
ptrendx Jun 26, 2020
784c082
Fix lint
ptrendx Jun 26, 2020
69f0232
Change a few more elemwise functions
ptrendx Jun 26, 2020
762f56c
Fix large tensor build
ptrendx Jun 26, 2020
2ad556e
Another try with DBL_MAX
ptrendx Jun 26, 2020
21d468e
Fix Windows compilation
ptrendx Jun 29, 2020
eb31935
Fix the large int test
ptrendx Jun 29, 2020
fe5cbfb
Add printing of the error code value to CUDA_DRIVER_CALL
ptrendx Jun 29, 2020
157751d
Fix
ptrendx Jun 29, 2020
c1c8071
Fix binary scalar
ptrendx Jun 29, 2020
6d1a46d
Get more information when cuLaunchKernel fails
ptrendx Jun 30, 2020
98e542b
Going easy on Windows compiler
ptrendx Jun 30, 2020
1195a02
Fix lint
ptrendx Jun 30, 2020
d597d83
Reorganization to split strings due to Windows compilation problems
ptrendx Jun 30, 2020
67d115c
Fix error with uninitialized value
ptrendx Jul 2, 2020
edf3a8a
Fix handling of different types for backward of binary scalar
ptrendx Jul 2, 2020
71deaff
Decreasing RTC overhead
ptrendx Jul 2, 2020
ff545f3
Fix lint and remove rest of mentions of ENABLE_RTC
ptrendx Jul 6, 2020
503046b
Jetson with RTC
ptrendx Jul 6, 2020
611276c
Fix the aws s3 command
ptrendx Jul 7, 2020
724571b
Debugging Windows failure
ptrendx Jul 7, 2020
c402a22
More debugging of Windows failure
ptrendx Jul 9, 2020
98acaef
Debug
ptrendx Jul 10, 2020
b4a6794
Fix the issue on Windows (long -> long long for 8B)
ptrendx Jul 16, 2020
7223408
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 16, 2020
59dcbe0
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 17, 2020
5eef300
libcuda.so for Jetson
ptrendx Jul 17, 2020
b05075b
Enable debug information for RTC kernels and clean up the debug PTX dump
ptrendx Jul 17, 2020
3e43e07
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 20, 2020
55337df
Fix lint
ptrendx Jul 20, 2020
50388d8
Try without linking the stub of libcuda.so to a different place on Jetson
ptrendx Jul 20, 2020
3ce8984
Add docstring
ptrendx Jul 20, 2020
0bd007a
Answering review comments
ptrendx Jul 21, 2020
ee82cd6
Unifying vectorization
ptrendx Jul 21, 2020
e1f3d82
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 21, 2020
b13ae5a
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 21, 2020
242aba8
Fix
ptrendx Jul 23, 2020
9fac801
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 28, 2020
57d72a7
Fixes for reduce ops
ptrendx Jul 28, 2020
5e9d582
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Jul 31, 2020
e547594
Fix M=1 case
ptrendx Aug 1, 2020
2758622
Fixes from rebase
ptrendx Aug 3, 2020
2b29a4f
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Aug 3, 2020
a60c483
Fix
ptrendx Aug 3, 2020
f1a2f54
Fix tests
ptrendx Aug 4, 2020
34c9d00
Adding tutorial for RTC
ptrendx Aug 7, 2020
3888cc2
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Aug 13, 2020
f4e039c
Fixes after merge
ptrendx Aug 13, 2020
9acab01
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Aug 14, 2020
12c8356
Fixes from review
ptrendx Aug 14, 2020
ece1943
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Aug 16, 2020
d6b2083
Change env var doc and undo the change to toctree
ptrendx Aug 18, 2020
f0ceed1
Merge branch 'upstream' into pr_rtc_elementwise_ops
ptrendx Aug 18, 2020
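
Taken together, the commits above move MXNet's GPU elementwise and broadcast kernels from ahead-of-time template instantiation to NVRTC runtime compilation: kernel source is assembled as a string, compiled to PTX on first use, loaded through the CUDA driver API, and launched with cuLaunchKernel. The sketch below shows that general NVRTC round trip. It is a minimal illustration of the technique, not MXNet's implementation; the kernel body, names, and architecture flag are placeholders.

```cpp
// Minimal NVRTC round trip (sketch): compile a kernel from a source string,
// load the resulting PTX through the CUDA driver API, and launch it.
// Error checks are elided for brevity except for the compile step.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <string>

const char* kSource = R"(
extern "C" __global__ void add_scalar(float* out, const float* in,
                                      float alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = in[i] + alpha;
})";

int main() {
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  // 1. Compile the source string to PTX.
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kSource, "add_scalar.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--gpu-architecture=compute_70"};  // placeholder arch
  if (nvrtcCompileProgram(prog, 1, opts) != NVRTC_SUCCESS) {
    size_t log_size;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::string log(log_size, '\0');
    nvrtcGetProgramLog(prog, &log[0]);
    std::fprintf(stderr, "NVRTC error:\n%s\n", log.c_str());
    return 1;
  }
  size_t ptx_size;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::string ptx(ptx_size, '\0');
  nvrtcGetPTX(prog, &ptx[0]);
  nvrtcDestroyProgram(&prog);

  // 2. Load the PTX and look up the kernel (real code caches the module).
  CUmodule mod;
  cuModuleLoadData(&mod, ptx.c_str());
  CUfunction fn;
  cuModuleGetFunction(&fn, mod, "add_scalar");

  // 3. Launch through the driver API.
  int n = 1024;
  float alpha = 1.0f;
  CUdeviceptr in, out;
  cuMemAlloc(&in, n * sizeof(float));
  cuMemAlloc(&out, n * sizeof(float));
  void* args[] = {&out, &in, &alpha, &n};
  cuLaunchKernel(fn, (n + 127) / 128, 1, 1, 128, 1, 1, 0, nullptr, args, nullptr);
  cuCtxSynchronize();
  return 0;
}
```

Commit 71deaff ("Decreasing RTC overhead") points at the main cost of this approach: compilation happens at first use, so production code caches the compiled module, typically keyed by the kernel source and target architecture.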
48 changes: 0 additions & 48 deletions 3rdparty/mshadow/mshadow/base.h
@@ -272,7 +272,6 @@ extern "C" {
}

#include "./half.h"
#include "./half2.h"
#include "./bfloat.h"
#define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \
MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \
@@ -387,11 +386,6 @@ struct DataType<half::half_t> {
#endif
};
template<>
struct DataType<half::half2_t> {
static const int kFlag = kFloat16;
static const int kLanes = 2;
};
template<>
struct DataType<bfloat::bf16_t> {
static const int kFlag = kBfloat16;
static const int kLanes = 1;
@@ -1144,48 +1138,6 @@ struct minimum {
}
#endif

#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \
switch (type) { \
case mshadow::kFloat32: \
{ \
typedef float DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat64: \
{ \
typedef double DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kFloat16: \
{ \
typedef mshadow::half::half2_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kUint8: \
{ \
typedef uint8_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt32: \
{ \
typedef int32_t DType; \
{__VA_ARGS__} \
} \
break; \
case mshadow::kInt64: \
{ \
typedef int64_t DType; \
{__VA_ARGS__} \
} \
break; \
default: \
LOG(FATAL) << "Unknown type enum " << type; \
}

#define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) \
switch (type) { \
case mshadow::kFloat32: \
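
For context on the hunk above: these MSHADOW_TYPE_SWITCH-style macros expand a code block once per runtime dtype, binding DType to the concrete element type, and the deleted WITH_HALF2 variant bound float16 tensors to the two-lane half2_t. A hypothetical call site follows; the operator and tensor names are illustrative, not taken from this diff.

```cpp
// Hypothetical dispatch through the removed macro: the block below is
// instantiated once for each supported dtype, with float16 data processed
// two lanes at a time as half2_t.
MSHADOW_TYPE_SWITCH_WITH_HALF2(inputs[0].type_flag_, DType, {
  mxnet_op::Kernel<some_elementwise_op, gpu>::Launch(
      s, outputs[0].Size(),
      outputs[0].dptr<DType>(), inputs[0].dptr<DType>());
});
```

With kernels generated at runtime, float16 vectorization no longer needs a dedicated two-lane element type, which is why the macro, the DataType<half2_t> specialization, and half2.h itself can all be removed.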
162 changes: 0 additions & 162 deletions 3rdparty/mshadow/mshadow/half2.h

This file was deleted.
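
What replaces half2_t is generic vectorized memory access inside the runtime-compiled kernels (see commits 8174ca2, "Remove non-RTC vectorization support", and ee82cd6, "Unifying vectorization"). Below is a sketch of the underlying technique, independent of MXNet's actual helper types; the Vec wrapper and the alignment/tail assumptions are simplifications.

```cpp
// Generic vectorized elementwise access: reinterpret the data through an
// aligned wrapper and process nvec elements per thread. With DType = __half
// and nvec = 2 this subsumes what half2_t provided. Assumes n % nvec == 0
// and vector-width-aligned pointers; real code also handles the tail.
template <typename DType, int nvec>
struct alignas(nvec * sizeof(DType)) Vec {
  DType v[nvec];
};

template <typename OP, typename DType, int nvec>
__global__ void vectorized_elementwise(DType* out, const DType* in, size_t n) {
  const size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < n / nvec) {
    auto x = reinterpret_cast<const Vec<DType, nvec>*>(in)[i];  // one wide load
#pragma unroll
    for (int j = 0; j < nvec; ++j) {
      x.v[j] = OP::apply(x.v[j]);
    }
    reinterpret_cast<Vec<DType, nvec>*>(out)[i] = x;  // one wide store
  }
}
```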

16 changes: 4 additions & 12 deletions CMakeLists.txt
@@ -79,7 +79,6 @@ option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
option(USE_GPROF "Compile with gprof (profiling) flag" OFF)
option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
option(USE_TVM_OP "Enable use of TVM operator build system." OFF)
option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON)
option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
option(INSTALL_EXAMPLES "Install the example source files." OFF)
option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
@@ -570,18 +569,11 @@ if(USE_CUDA)

string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}")

find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand
OPTIONAL_COMPONENTS nvToolsExt nvrtc)
find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver
OPTIONAL_COMPONENTS nvToolsExt)

list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand)
if(ENABLE_CUDA_RTC)
if(CUDA_nvrtc_LIBRARY)
list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda)
add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
else()
message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." )
endif()
endif()
list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand
CUDA::nvrtc CUDA::cuda_driver)
list(APPEND SOURCE ${CUDA})
add_definitions(-DMXNET_USE_CUDA=1)

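
The new hard dependencies are the point of this hunk: NVRTC (CUDA::nvrtc) performs the compilation and the driver library (CUDA::cuda_driver) performs module loading and kernel launches, so neither can remain optional once RTC is required. Relatedly, commit fe5cbfb extends CUDA_DRIVER_CALL to print the numeric error code. Below is a sketch of what such a checking macro typically looks like, assuming dmlc-style LOG(FATAL); the exact MXNet definition may differ.

```cpp
// Sketch of a driver-API checking macro that reports the error name,
// its description, and the raw enum value (useful when cuLaunchKernel
// fails with a code whose name alone is uninformative).
#define CUDA_DRIVER_CALL(func)                                           \
  {                                                                      \
    CUresult e = (func);                                                 \
    if (e != CUDA_SUCCESS) {                                             \
      const char* name = nullptr;                                        \
      const char* msg = nullptr;                                         \
      cuGetErrorName(e, &name);                                          \
      cuGetErrorString(e, &msg);                                         \
      LOG(FATAL) << "CUDA Driver error " << static_cast<int>(e) << ": "  \
                 << (name ? name : "unknown") << " - "                   \
                 << (msg ? msg : "no description");                      \
    }                                                                    \
  }
```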
6 changes: 0 additions & 6 deletions ci/build_windows.py
@@ -61,7 +61,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -76,7 +75,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -91,7 +89,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -106,7 +103,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=OFF '
'-DUSE_CUDNN=OFF '
'-DENABLE_CUDA_RTC=OFF '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=mkl '
@@ -121,7 +117,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
@@ -136,7 +131,6 @@ class BuildFlavour(Enum):
'-DCMAKE_CXX_COMPILER=cl '
'-DUSE_CUDA=ON '
'-DUSE_CUDNN=ON '
'-DENABLE_CUDA_RTC=ON '
'-DUSE_OPENCV=ON '
'-DUSE_OPENMP=ON '
'-DUSE_BLAS=open '
22 changes: 0 additions & 22 deletions ci/docker/runtime_functions.sh
@@ -142,7 +142,6 @@ build_jetson() {
-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="5.2" \
-DENABLE_CUDA_RTC=OFF \
-DUSE_OPENCV=OFF \
-DUSE_OPENMP=ON \
-DUSE_LAPACK=OFF \
@@ -669,27 +668,6 @@ build_ubuntu_gpu_cmake() {
ninja
}

build_ubuntu_gpu_cmake_no_rtc() {
set -ex
cd /work/build
CC=gcc-7 CXX=g++-7 cmake \
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=ON \
-DUSE_DIST_KVSTORE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DBUILD_CYTHON_MODULES=1 \
-DENABLE_CUDA_RTC=OFF \
-G Ninja \
/work/mxnet

ninja
}

build_ubuntu_cpu_large_tensor() {
set -ex
cd /work/build
14 changes: 0 additions & 14 deletions ci/jenkins/Jenkins_steps.groovy
@@ -258,20 +258,6 @@ def compile_unix_cmake_gpu(lib_name) {
}]
}

def compile_unix_cmake_gpu_no_rtc(lib_name) {
return ['GPU: CMake CUDA RTC OFF': {
node(NODE_LINUX_CPU) {
ws('workspace/build-cmake-gpu-no-rtc') {
timeout(time: max_time, unit: 'MINUTES') {
utils.init_git()
utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false)
utils.pack_lib(lib_name, mx_cmake_lib)
}
}
}
}]
}

def compile_unix_tensorrt_gpu(lib_name) {
return ['TensorRT': {
node(NODE_LINUX_CPU) {
1 change: 0 additions & 1 deletion ci/jenkins/Jenkinsfile_unix_gpu
@@ -41,7 +41,6 @@ core_logic: {
custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
custom_steps.compile_unix_int64_gpu('gpu_int64'),
custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
])

utils.parallel_stage('Tests', [
1 change: 0 additions & 1 deletion config/darwin.cmake
@@ -126,6 +126,5 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
set(USE_CXX11_ABI ON CACHE BOOL "Build with GLIBCXX_USE_CXX11_ABI")
1 change: 0 additions & 1 deletion config/linux.cmake
@@ -125,6 +125,5 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
set(USE_CXX11_ABI ON CACHE BOOL "Build with GLIBCXX_USE_CXX11_ABI")
1 change: 0 additions & 1 deletion config/linux_gpu.cmake
@@ -125,6 +125,5 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
set(USE_CXX11_ABI ON CACHE BOOL "Build with GLIBCXX_USE_CXX11_ABI")