diff --git a/Makefile b/Makefile
index b3b188a2cf3b..4b95eaf8e848 100644
--- a/Makefile
+++ b/Makefile
@@ -473,11 +473,13 @@ CFLAGS += -I$(TVM_PATH)/include -DMXNET_USE_TVM_OP=1
 LDFLAGS += -L$(ROOTDIR)/lib -ltvm_runtime -Wl,-rpath,'$${ORIGIN}'
 
 TVM_USE_CUDA := OFF
+TVM_OP_CUDA_ARCH := NONE
 ifeq ($(USE_CUDA), 1)
 	TVM_USE_CUDA := ON
 	ifneq ($(USE_CUDA_PATH), NONE)
 		TVM_USE_CUDA := $(USE_CUDA_PATH)
 	endif
+	TVM_OP_CUDA_ARCH := $(or $(strip $(USE_TVM_OP_CUDA_ARCH)),NONE)
 endif
 endif
 
@@ -630,11 +632,17 @@ lib/libtvm_runtime.so:
 	ls $(ROOTDIR)/lib; \
 	cd $(ROOTDIR)
 
+# Only pass --cuda-arch when a real arch was requested: an empty/undefined
+# value or the NONE placeholder would otherwise hand compile.py a bare flag.
+TVM_OP_COMPILE_OPTIONS = -o $(ROOTDIR)/lib/libtvmop.so
+ifneq ($(filter-out NONE,$(TVM_OP_CUDA_ARCH)),)
+	TVM_OP_COMPILE_OPTIONS += --cuda-arch $(TVM_OP_CUDA_ARCH)
+endif
 lib/libtvmop.so: lib/libtvm_runtime.so $(wildcard contrib/tvmop/*/*.py contrib/tvmop/*.py)
 	echo "Compile TVM operators"
 	PYTHONPATH=$(TVM_PATH)/python:$(TVM_PATH)/topi/python:$(ROOTDIR)/contrib \
 		LD_LIBRARY_PATH=$(ROOTDIR)/lib \
-	    python3 $(ROOTDIR)/contrib/tvmop/compile.py -o $(ROOTDIR)/lib/libtvmop.so
+	    python3 $(ROOTDIR)/contrib/tvmop/compile.py $(TVM_OP_COMPILE_OPTIONS)
 
 NNVM_INC = $(wildcard $(NNVM_PATH)/include/*/*.h)
 NNVM_SRC = $(wildcard $(NNVM_PATH)/src/*/*/*.cc $(NNVM_PATH)/src/*/*.cc $(NNVM_PATH)/src/*.cc)
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index b2a50f30af4e..1080dc57eaec 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -26,6 +26,7 @@ NOSE_COVERAGE_ARGUMENTS="--with-coverage --cover-inclusive --cover-xml --cover-b
 NOSE_TIMER_ARGUMENTS="--with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error"
 CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70"
 CI_CMAKE_CUDA_ARCH_BIN="52,70"
+CI_CUDA_ARCH="sm_70"
 
 clean_repo() {
     set -ex
@@ -228,7 +229,7 @@ build_ubuntu_gpu_mkldnn_release() {
 # $1 -> mxnet_variant: the mxnet variant to build, e.g. cpu, cu100, cu92mkl, etc.
 build_dynamic_libmxnet() {
     set -ex
-    
+
     local mxnet_variant=${1:?"This function requires a mxnet variant as the first argument"}
 
     # relevant licenses will be placed in the licenses directory
@@ -769,6 +770,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         USE_TVM_OP=1                              \
         CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
         USE_SIGNAL_HANDLER=1                      \
+        USE_TVM_OP_CUDA_ARCH="$CI_CUDA_ARCH"      \
         -j$(nproc)
 }
 
@@ -948,7 +950,7 @@ cd_unittest_ubuntu() {
     fi
 
     $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/unittest
-    $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/quantization 
+    $nose_cmd $NOSE_TIMER_ARGUMENTS --verbose tests/python/quantization
 
     # https://github.com/apache/incubator-mxnet/issues/11801
     # if [[ ${mxnet_variant} = "cpu" ]] || [[ ${mxnet_variant} = "mkl" ]]; then
diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py
index e6af0a276560..0f041c666379 100644
--- a/contrib/tvmop/compile.py
+++ b/contrib/tvmop/compile.py
@@ -22,6 +22,8 @@
 import os
 import argparse
 from tvmop.opdef import __OP_DEF__
+from tvm.autotvm.measure.measure_methods import set_cuda_target_arch
+
 
 def get_target(device):
     if device == "cpu":
@@ -37,6 +39,8 @@ def get_target(device):
     parser = argparse.ArgumentParser(description="Generate tvm operators")
     parser.add_argument("-o", action="store", required=True, dest="target_path",
                         help="Target path which stores compiled library")
+    parser.add_argument('--cuda-arch', type=str, default=None, dest='cuda_arch',
+                        help='The cuda arch for compiling kernels for')
     arguments = parser.parse_args()
 
     func_list_llvm = []
@@ -52,8 +56,9 @@ def get_target(device):
                                        binds=operator_def.get_binds(args))
                 func_list.append(func_lower)
 
-    lowered_funcs = {get_target("cpu") : func_list_llvm}
+    lowered_funcs = {get_target("cpu"): func_list_llvm}
     if len(func_list_cuda) > 0:
         lowered_funcs[get_target("cuda")] = func_list_cuda
+        set_cuda_target_arch(arguments.cuda_arch)
     func_binary = tvm.build(lowered_funcs, name="tvmop")
     func_binary.export_library(arguments.target_path)
diff --git a/make/config.mk b/make/config.mk
index 982d15b19656..7af34bff0a75 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -65,6 +65,10 @@ ADD_CFLAGS =
 # whether to build operators written in TVM
 USE_TVM_OP = 0
 
+# CUDA arch (e.g. sm_70) passed via --cuda-arch when compiling TVM operator
+# kernels; only used when USE_CUDA = 1. NONE omits the --cuda-arch option.
+USE_TVM_OP_CUDA_ARCH = NONE
+
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU
 #---------------------------------------------