diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d022a529b81..1e3bbceb362f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,6 @@ mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) mxnet_option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path mxnet_option(USE_TVM_OP "Enable use of TVM operator build system." OFF) -mxnet_option(USE_TVM_OP_CUDA_ARCH "Specify the CUDA ARCH for which TVM-generated kernels are compiled." none) mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) @@ -758,8 +757,8 @@ if(USE_TVM_OP) endif() set(TVM_OP_COMPILE_OPTIONS "-o${CMAKE_CURRENT_BINARY_DIR}/libtvmop.so") - if(NOT USE_TVM_OP_CUDA_ARCH STREQUAL "none") - set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS} --cuda-arch ${USE_TVM_OP_CUDA_ARCH}") + if(CUDA_ARCH_BIN) + set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS} --cuda-arch ${CUDA_ARCH_BIN}") endif() add_custom_command(TARGET mxnet POST_BUILD COMMAND ${CMAKE_COMMAND} -E env diff --git a/Makefile b/Makefile index 4b95eaf8e848..0a1e355ee5e8 100644 --- a/Makefile +++ b/Makefile @@ -473,13 +473,11 @@ CFLAGS += -I$(TVM_PATH)/include -DMXNET_USE_TVM_OP=1 LDFLAGS += -L$(ROOTDIR)/lib -ltvm_runtime -Wl,-rpath,'$${ORIGIN}' TVM_USE_CUDA := OFF -TVM_OP_CUDA_ARCH := NONE ifeq ($(USE_CUDA), 1) TVM_USE_CUDA := ON ifneq ($(USE_CUDA_PATH), NONE) TVM_USE_CUDA := $(USE_CUDA_PATH) endif - TVM_OP_CUDA_ARCH = $(USE_TVM_OP_CUDA_ARCH) endif endif @@ -633,8 +631,8 @@ lib/libtvm_runtime.so: cd $(ROOTDIR) TVM_OP_COMPILE_OPTIONS = -o $(ROOTDIR)/lib/libtvmop.so -ifneq ($(TVM_OP_CUDA_ARCH), NONE) - TVM_OP_COMPILE_OPTIONS += --cuda-arch $(TVM_OP_CUDA_ARCH) +ifneq ($(CUDA_ARCH),) + TVM_OP_COMPILE_OPTIONS += --cuda-arch "$(CUDA_ARCH)" endif lib/libtvmop.so: lib/libtvm_runtime.so $(wildcard contrib/tvmop/*/*.py contrib/tvmop/*.py) echo "Compile TVM operators" diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 7ba5534eac16..7d91d4e5d121 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -26,7 +26,6 @@ NOSE_COVERAGE_ARGUMENTS="--with-coverage --cover-inclusive --cover-xml --cover-b NOSE_TIMER_ARGUMENTS="--with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error" CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70" CI_CMAKE_CUDA_ARCH_BIN="52,70" -CI_CUDA_ARCH="sm_70" clean_repo() { set -ex @@ -753,7 +752,6 @@ build_ubuntu_gpu_mkldnn() { USE_TVM_OP=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ USE_SIGNAL_HANDLER=1 \ - USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH" \ -j$(nproc) } @@ -771,7 +769,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() { USE_TVM_OP=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ USE_SIGNAL_HANDLER=1 \ - USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH" \ -j$(nproc) } @@ -790,7 +787,6 @@ build_ubuntu_gpu_cuda101_cudnn7() { USE_DIST_KVSTORE=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ USE_SIGNAL_HANDLER=1 \ - USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH" \ -j$(nproc) make cython PYTHON=python2 @@ -828,7 +824,6 @@ build_ubuntu_gpu_cmake_mkldnn() { -DUSE_CUDA=1 \ -DUSE_CUDNN=1 \ -DUSE_TVM_OP=1 \ - -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH \ -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKLML_MKL=1 \ -DCMAKE_BUILD_TYPE=Release \ @@ -855,7 +850,6 @@ build_ubuntu_gpu_cmake() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=ON \ - -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH \ -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ @@ -903,7 +897,6 @@ build_ubuntu_gpu_large_tensor() { -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ -DUSE_TVM_OP=ON \ - -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH \ -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py index 0f041c666379..3c0efdd6b806 100644 --- a/contrib/tvmop/compile.py +++ b/contrib/tvmop/compile.py @@ -21,9 +21,13 @@ import os import argparse +import re +import logging from tvmop.opdef import __OP_DEF__ from tvm.autotvm.measure.measure_methods import set_cuda_target_arch +logging.basicConfig(level=logging.INFO) + def get_target(device): if device == "cpu": @@ -33,6 +37,31 @@ def get_target(device): assert False, "Unknown device " + device +def get_cuda_arch(arch): + if arch is None: + return None + + if not isinstance(arch, str): + raise TypeError('Expecting parameter arch as a str, while got a {}'.format(str(type(arch)))) + + if len(arch) == 0: + return None + + # the arch string contains '-arch=sm_xx' + flags = arch.split() + for flag in flags: + if flag.startswith('-arch='): + return flag[len('-arch='):] + + # find the highest compute capability + comp_caps = re.findall(r'\d+', arch) + if len(comp_caps) == 0: + return None + + comp_caps = [int(c) for c in comp_caps] + return 'sm_' + str(max(comp_caps)) + + if __name__ == "__main__": import sys sys.path.append(os.path.dirname(sys.path[0])) @@ -59,6 +88,11 @@ def get_target(device): lowered_funcs = {get_target("cpu"): func_list_llvm} if len(func_list_cuda) > 0: lowered_funcs[get_target("cuda")] = func_list_cuda - set_cuda_target_arch(arguments.cuda_arch) + cuda_arch = get_cuda_arch(arguments.cuda_arch) + if cuda_arch is None: + logging.info('No cuda arch specified. TVM will try to detect it from the build platform.') + else: + logging.info('Cuda arch {} set for compiling TVM operator kernels.'.format(cuda_arch)) + set_cuda_target_arch(cuda_arch) func_binary = tvm.build(lowered_funcs, name="tvmop") func_binary.export_library(arguments.target_path)