diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d022a529b81..1e3bbceb362f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,6 @@ mxnet_option(USE_GPROF            "Compile with gprof (profiling) flag" OFF)
 mxnet_option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF)
 mxnet_option(USE_VTUNE            "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
 mxnet_option(USE_TVM_OP           "Enable use of TVM operator build system." OFF)
-mxnet_option(USE_TVM_OP_CUDA_ARCH "Specify the CUDA ARCH for which TVM-generated kernels are compiled." none)
 mxnet_option(ENABLE_CUDA_RTC      "Build with CUDA runtime compilation support" ON)
 mxnet_option(BUILD_CPP_EXAMPLES   "Build cpp examples" ON)
 mxnet_option(INSTALL_EXAMPLES     "Install the example source files." OFF)
@@ -758,8 +757,8 @@ if(USE_TVM_OP)
   endif()
 
   set(TVM_OP_COMPILE_OPTIONS "-o${CMAKE_CURRENT_BINARY_DIR}/libtvmop.so")
-  if(NOT USE_TVM_OP_CUDA_ARCH STREQUAL "none")
-    set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS} --cuda-arch ${USE_TVM_OP_CUDA_ARCH}")
+  if(CUDA_ARCH_BIN)
+    set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS} --cuda-arch ${CUDA_ARCH_BIN}")
   endif()
   add_custom_command(TARGET mxnet POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E env
diff --git a/Makefile b/Makefile
index 4b95eaf8e848..0a1e355ee5e8 100644
--- a/Makefile
+++ b/Makefile
@@ -473,13 +473,11 @@ CFLAGS += -I$(TVM_PATH)/include -DMXNET_USE_TVM_OP=1
 LDFLAGS += -L$(ROOTDIR)/lib -ltvm_runtime -Wl,-rpath,'$${ORIGIN}'
 
 TVM_USE_CUDA := OFF
-TVM_OP_CUDA_ARCH := NONE
 ifeq ($(USE_CUDA), 1)
 	TVM_USE_CUDA := ON
 	ifneq ($(USE_CUDA_PATH), NONE)
 		TVM_USE_CUDA := $(USE_CUDA_PATH)
 	endif
-	TVM_OP_CUDA_ARCH = $(USE_TVM_OP_CUDA_ARCH)
 endif
 endif
 
@@ -633,8 +631,8 @@ lib/libtvm_runtime.so:
 	cd $(ROOTDIR)
 
 TVM_OP_COMPILE_OPTIONS = -o $(ROOTDIR)/lib/libtvmop.so
-ifneq ($(TVM_OP_CUDA_ARCH), NONE)
-	TVM_OP_COMPILE_OPTIONS += --cuda-arch $(TVM_OP_CUDA_ARCH)
+ifneq ($(CUDA_ARCH),)
+	TVM_OP_COMPILE_OPTIONS += --cuda-arch "$(CUDA_ARCH)"
 endif
 lib/libtvmop.so: lib/libtvm_runtime.so $(wildcard contrib/tvmop/*/*.py contrib/tvmop/*.py)
 	echo "Compile TVM operators"
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 7ba5534eac16..7d91d4e5d121 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -26,7 +26,6 @@ NOSE_COVERAGE_ARGUMENTS="--with-coverage --cover-inclusive --cover-xml --cover-b
 NOSE_TIMER_ARGUMENTS="--with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error"
 CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70"
 CI_CMAKE_CUDA_ARCH_BIN="52,70"
-CI_CUDA_ARCH="sm_70"
 
 clean_repo() {
     set -ex
@@ -753,7 +752,6 @@ build_ubuntu_gpu_mkldnn() {
         USE_TVM_OP=1                              \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
-        USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH"      \
         -j$(nproc)
 }
 
@@ -771,7 +769,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         USE_TVM_OP=1                              \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
-        USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH"      \
         -j$(nproc)
 }
 
@@ -790,7 +787,6 @@ build_ubuntu_gpu_cuda101_cudnn7() {
         USE_DIST_KVSTORE=1                        \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
-        USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH"      \
         -j$(nproc)
 
     make cython PYTHON=python2
@@ -828,7 +824,6 @@ build_ubuntu_gpu_cmake_mkldnn() {
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
         -DUSE_TVM_OP=1                          \
-        -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH    \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKLML_MKL=1                       \
         -DCMAKE_BUILD_TYPE=Release              \
@@ -855,7 +850,6 @@ build_ubuntu_gpu_cmake() {
         -DUSE_CUDA=ON                           \
         -DUSE_CUDNN=ON                          \
         -DUSE_TVM_OP=ON                         \
-        -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH    \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKL_IF_AVAILABLE=OFF              \
         -DUSE_MKLML_MKL=OFF                     \
@@ -903,7 +897,6 @@ build_ubuntu_gpu_large_tensor() {
         -DUSE_CUDA=ON                           \
         -DUSE_CUDNN=ON                          \
         -DUSE_TVM_OP=ON                         \
-        -DUSE_TVM_OP_CUDA_ARCH=$CI_CUDA_ARCH    \
         -DPython3_EXECUTABLE=/usr/bin/python3   \
         -DUSE_MKL_IF_AVAILABLE=OFF              \
         -DUSE_MKLML_MKL=OFF                     \
diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py
index 0f041c666379..3c0efdd6b806 100644
--- a/contrib/tvmop/compile.py
+++ b/contrib/tvmop/compile.py
@@ -21,9 +21,13 @@
 
 import os
 import argparse
+import re
+import logging
 from tvmop.opdef import __OP_DEF__
 from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
 
+logging.basicConfig(level=logging.INFO)
+
 
 def get_target(device):
     if device == "cpu":
@@ -33,6 +37,31 @@ def get_target(device):
     assert False, "Unknown device " + device
 
 
+def get_cuda_arch(arch):
+    if arch is None:
+        return None
+
+    if not isinstance(arch, str):
+        raise TypeError('Expecting parameter arch as a str, while got a {}'.format(str(type(arch))))
+
+    if len(arch) == 0:
+        return None
+
+    # the arch string contains '-arch=sm_xx'
+    flags = arch.split()
+    for flag in flags:
+        if flag.startswith('-arch='):
+            return flag[len('-arch='):]
+
+    # find the highest compute capability
+    comp_caps = re.findall(r'\d+', arch)
+    if len(comp_caps) == 0:
+        return None
+
+    comp_caps = [int(c) for c in comp_caps]
+    return 'sm_' + str(max(comp_caps))
+
+
 if __name__ == "__main__":
     import sys
     sys.path.append(os.path.dirname(sys.path[0]))
@@ -59,6 +88,11 @@ def get_target(device):
     lowered_funcs = {get_target("cpu"): func_list_llvm}
     if len(func_list_cuda) > 0:
         lowered_funcs[get_target("cuda")] = func_list_cuda
-        set_cuda_target_arch(arguments.cuda_arch)
+        cuda_arch = get_cuda_arch(arguments.cuda_arch)
+        if cuda_arch is None:
+            logging.info('No cuda arch specified. TVM will try to detect it from the build platform.')
+        else:
+            logging.info('Cuda arch {} set for compiling TVM operator kernels.'.format(cuda_arch))
+            set_cuda_target_arch(cuda_arch)
     func_binary = tvm.build(lowered_funcs, name="tvmop")
     func_binary.export_library(arguments.target_path)