Refactor TRT #6

Closed · wants to merge 1 commit
3 changes: 3 additions & 0 deletions .gitmodules
@@ -26,3 +26,6 @@
[submodule "3rdparty/tvm"]
path = 3rdparty/tvm
url = https://github.com/dmlc/tvm
[submodule "3rdparty/onnx-tensorrt"]
path = 3rdparty/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt.git
1 change: 1 addition & 0 deletions 3rdparty/onnx-tensorrt
Submodule onnx-tensorrt added at e7be19
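
Because this change pulls in onnx-tensorrt as a new submodule, existing checkouts need to fetch it (and its nested onnx submodule) before building. A minimal sketch:

    # Fetch the newly added submodule and its own nested submodules (e.g. onnx)
    git submodule update --init --recursive 3rdparty/onnx-tensorrt
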
46 changes: 46 additions & 0 deletions CMakeLists.txt
@@ -37,6 +37,7 @@ mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support"
mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF)
mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF)
mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)

message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}")
if(USE_CUDA AND NOT USE_OLDCMAKECUDA)
@@ -185,6 +186,51 @@ if(USE_VTUNE)
list(APPEND mxnet_LINKER_LIBS dl)
endif()

if(USE_TENSORRT)
message(STATUS "Using TensorRT")
set(ONNX_PATH 3rdparty/onnx-tensorrt/third_party/onnx/build/)
set(ONNX_TRT_PATH 3rdparty/onnx-tensorrt/build/)

include_directories(${ONNX_PATH})
include_directories(3rdparty/onnx-tensorrt/)
include_directories(3rdparty/)
add_definitions(-DMXNET_USE_TENSORRT=1)
add_definitions(-DONNX_NAMESPACE=onnx)
list(APPEND mxnet_LINKER_LIBS libnvinfer.so)

find_package(Protobuf REQUIRED)
list(APPEND mxnet_LINKER_LIBS ${PROTOBUF_LIBRARY})

message(STATUS "target path ${ONNX_PATH}")
find_library(ONNX_LIBRARY NAMES libonnx.so REQUIRED
PATHS ${ONNX_PATH}
DOC "Path to onnx library.")
message(STATUS "linking onnx ${ONNX_LIBRARY}")
list(APPEND mxnet_LINKER_LIBS ${ONNX_LIBRARY})

message(STATUS "target path ${ONNX_PATH}")
find_library(ONNX_PROTO_LIBRARY NAMES libonnx_proto.so REQUIRED
PATHS ${ONNX_PATH}
DOC "Path to onnx_proto library.")
message(STATUS "linking proto onnx ${ONNX_PROTO_LIBRARY}")
list(APPEND mxnet_LINKER_LIBS ${ONNX_PROTO_LIBRARY})

message(STATUS "target path ${ONNX_TRT_PATH}")
find_library(ONNX_TRT_RUNTIME_LIBRARY NAMES libnvonnxparser_runtime.so REQUIRED
PATHS ${ONNX_TRT_PATH}
DOC "Path to onnx_proto library.")
message(STATUS "linking proto onnx ${ONNX_TRT_RUNTIME_LIBRARY}")
list(APPEND mxnet_LINKER_LIBS ${ONNX_TRT_RUNTIME_LIBRARY})

message(STATUS "target path ${ONNX_TRT_PATH}")
find_library(ONNX_TRT_PARSER_LIBRARY NAMES libnvonnxparser.so REQUIRED
PATHS ${ONNX_TRT_PATH}
DOC "Path to onnx_proto library.")
message(STATUS "linking proto onnx ${ONNX_TRT_PARSER_LIBRARY}")
list(APPEND mxnet_LINKER_LIBS ${ONNX_TRT_PARSER_LIBRARY})

endif()

if(USE_MKLDNN)
include(cmake/MklDnn.cmake)
# CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
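
For reference, a configure invocation that exercises the new USE_TENSORRT option might look like the sketch below. It assumes CUDA/cuDNN are available and that the onnx and onnx-tensorrt libraries have already been built under 3rdparty/ (see the CI script later in this diff); it is illustrative, not the exact CI command.

    # Sketch: CMake build with the TensorRT integration enabled
    mkdir -p build && cd build
    cmake -DUSE_CUDA=ON -DUSE_CUDNN=ON -DUSE_TENSORRT=ON ..
    make -j$(nproc)
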
28 changes: 28 additions & 0 deletions Jenkinsfile
@@ -28,6 +28,7 @@ mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3r
mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
// timeout in minutes
max_time = 120
// assign any caught errors here
@@ -372,6 +373,17 @@ try {
}
}
},
'TensorRT': {
node('mxnetlinux-cpu') {
ws('workspace/build-tensorrt') {
timeout(time: max_time, unit: 'MINUTES') {
init_git()
docker_run('ubuntu_gpu_tensorrt', 'build_ubuntu_gpu_tensorrt', false)
pack_lib('tensorrt', mx_tensorrt_lib)
}
}
}
},
'Build CPU windows':{
node('mxnetwindows-cpu') {
timeout(time: max_time, unit: 'MINUTES') {
@@ -740,6 +752,22 @@ try {
}
}
},
'Python3: TensorRT GPU': {
node('mxnetlinux-gpu-p3') {
ws('workspace/build-tensorrt') {
timeout(time: max_time, unit: 'MINUTES') {
try {
init_git()
unpack_lib('tensorrt', mx_tensorrt_lib)
docker_run('ubuntu_gpu_tensorrt', 'unittest_ubuntu_tensorrt_gpu', true)
publish_test_coverage()
} finally {
collect_test_results_unix('nosetests_tensorrt.xml', 'nosetests_python3_tensorrt_gpu.xml')
}
}
}
}
},
'Scala: CPU': {
node('mxnetlinux-cpu') {
ws('workspace/ut-scala-cpu') {
8 changes: 8 additions & 0 deletions Makefile
@@ -91,6 +91,14 @@ else
endif
CFLAGS += -I$(TPARTYDIR)/mshadow/ -I$(TPARTYDIR)/dmlc-core/include -fPIC -I$(NNVM_PATH)/include -I$(DLPACK_PATH)/include -I$(TPARTYDIR)/tvm/include -Iinclude $(MSHADOW_CFLAGS)
LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)


ifeq ($(USE_TENSORRT), 1)
CFLAGS += -I$(ROOTDIR) -I$(TPARTYDIR) -DONNX_NAMESPACE=$(ONNX_NAMESPACE) -DMXNET_USE_TENSORRT=1
LDFLAGS += -lprotobuf -pthread -lonnx -lonnx_proto -lnvonnxparser -lnvonnxparser_runtime -lnvinfer -lnvinfer_plugin
endif
# -L/usr/local/lib

ifeq ($(DEBUG), 1)
NVCCFLAGS += -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
else
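
A minimal make-based equivalent is sketched below; the complete invocation used by CI (including the ONNX and onnx-tensorrt prerequisites) appears in build_ubuntu_gpu_tensorrt further down, so the flags here are only illustrative.

    # Sketch: make-based build with TensorRT enabled
    # (assumes the onnx / onnx-tensorrt libraries are already on LIBRARY_PATH)
    make USE_CUDA=1 USE_CUDNN=1 USE_TENSORRT=1 ONNX_NAMESPACE=onnx -j$(nproc)
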
14 changes: 7 additions & 7 deletions amalgamation/amalgamation.py
@@ -23,13 +23,12 @@
import platform

blacklist = [
'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh',
'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h',
'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h',
'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h',
'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h',
'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h',
'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh', 'cuda_runtime.h', 'cudnn.h',
'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h', 'glog/logging.h', 'io/azure_filesys.h',
'io/hdfs_filesys.h', 'io/s3_filesys.h', 'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h',
'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', 'NvInfer.h', 'nvml.h',
'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h', 'omp.h',
'onnx/onnx.pb.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp',
'relacy_shims.h', 'ittnotify.h', 'shared_mutex'
]
@@ -150,6 +149,7 @@ def expand(x, pending, stage):
h not in sysheaders and
'mkl' not in h and
'nnpack' not in h and
'tensorrt' not in h and
not h.endswith('.cuh')): sysheaders.append(h)
else:
expand.treeDepth += 1
41 changes: 41 additions & 0 deletions ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt
@@ -0,0 +1,41 @@
# -*- mode: dockerfile -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Dockerfile to build and run MXNet with TensorRT on Ubuntu 16.04 (CUDA 9.0, cuDNN 7)

FROM nvidia/cuda:9.0-cudnn7-devel

WORKDIR /work/deps

COPY install/ubuntu_core.sh /work/
RUN /work/ubuntu_core.sh
COPY install/deb_ubuntu_ccache.sh /work/
RUN /work/deb_ubuntu_ccache.sh
COPY install/ubuntu_python.sh /work/
RUN /work/ubuntu_python.sh
COPY install/tensorrt.sh /work
RUN /work/tensorrt.sh

ARG USER_ID=0
COPY install/ubuntu_adduser.sh /work/
RUN /work/ubuntu_adduser.sh

COPY runtime_functions.sh /work/

WORKDIR /work/mxnet
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
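
For local experimentation the image can be built directly with docker (CI drives this through its own tooling); a hypothetical invocation from the repository root, with an illustrative tag name:

    # Build the TensorRT CI image locally; ci/docker is the build context so the COPY install/... steps resolve
    docker build -f ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt -t mxnet-build.ubuntu_gpu_tensorrt ci/docker
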
45 changes: 45 additions & 0 deletions ci/docker/install/tensorrt.sh
@@ -0,0 +1,45 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Install gluoncv since we're testing Gluon models as well
pip2 install gluoncv==0.2.0
pip3 install gluoncv==0.2.0

# Install Protobuf
# Install protoc 3.5 and build protobuf here (for onnx and onnx-tensorrt)
pushd .
cd ..
apt-get update
apt-get install -y automake libtool
git clone --recursive -b 3.5.1.1 https://github.com/google/protobuf.git
cd protobuf
./autogen.sh
./configure
make -j$(nproc)
make install
ldconfig
popd

# Install TensorRT
echo "TensorRT build enabled. Installing TensorRT."
wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0_1-1_amd64.deb
dpkg -i tensorrt.deb
apt-get update
apt-get install -y --allow-downgrades libnvinfer-dev
rm tensorrt.deb
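
A quick, purely illustrative sanity check after this script runs is to confirm that protobuf and the TensorRT runtime are actually visible to the toolchain:

    # Sketch: verify the installs performed above
    protoc --version              # expect libprotoc 3.5.x
    ldconfig -p | grep nvinfer    # expect libnvinfer entries
    dpkg -l | grep libnvinfer     # confirm the dev package is installed
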
67 changes: 67 additions & 0 deletions ci/docker/runtime_functions.sh
@@ -436,6 +436,62 @@ build_ubuntu_gpu() {
build_ubuntu_gpu_cuda91_cudnn7
}

build_ubuntu_gpu_tensorrt() {

set -ex

build_ccache_wrappers

# Build ONNX
pushd .
echo "Installing ONNX."
cd 3rdparty/onnx-tensorrt/third_party/onnx
rm -rf build
mkdir -p build
cd build
cmake \
-DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER}\
-DBUILD_SHARED_LIBS=ON ..\
-G Ninja
ninja -v
export LIBRARY_PATH=`pwd`:`pwd`/onnx/:$LIBRARY_PATH
export CPLUS_INCLUDE_PATH=`pwd`:$CPLUS_INCLUDE_PATH
popd

# Build ONNX-TensorRT
pushd .
cd 3rdparty/onnx-tensorrt/
mkdir -p build
cd build
cmake ..
make -j$(nproc)
export LIBRARY_PATH=`pwd`:$LIBRARY_PATH
popd

mkdir -p /work/mxnet/lib/
cp 3rdparty/onnx-tensorrt/third_party/onnx/build/*.so /work/mxnet/lib/
cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser_runtime.so.0 /work/mxnet/lib/
cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser.so.0 /work/mxnet/lib/

rm -rf build
make \
DEV=1 \
USE_BLAS=openblas \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_OPENCV=0 \
USE_DIST_KVSTORE=0 \
USE_TENSORRT=1 \
USE_JEMALLOC=0 \
USE_GPERFTOOLS=0 \
ONNX_NAMESPACE=onnx \
CUDA_ARCH="-gencode arch=compute_70,code=compute_70"\
-j$(nproc)

report_ccache_usage
}

build_ubuntu_gpu_mkldnn() {
set -ex

@@ -638,6 +694,15 @@ unittest_ubuntu_python3_gpu_nocudnn() {
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_gpu.xml --verbose tests/python/gpu
}

unittest_ubuntu_tensorrt_gpu() {
set -ex
export PYTHONPATH=./python/
export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
python tests/python/tensorrt/lenet5_train.py
nosetests-3.4 $NOSE_COVERAGE_ARGUMENTS --with-xunit --xunit-file nosetests_trt_gpu.xml --verbose tests/python/tensorrt/
}

# quantization gpu currently only runs on P3 instances
# need to separate it from unittest_ubuntu_python2_gpu()
unittest_ubuntu_python2_quantization_gpu() {
@@ -970,3 +1035,5 @@ EOF
declare -F | cut -d' ' -f3
echo
fi


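
To reproduce the CI TensorRT build and test stages locally, something along the following lines should work; the ci/build.py entry point and its --nvidiadocker flag are assumptions about the CI tooling at this revision, not verified here:

    # Sketch: run the TensorRT build and unit tests inside the CI container
    ci/build.py --platform ubuntu_gpu_tensorrt /work/runtime_functions.sh build_ubuntu_gpu_tensorrt
    ci/build.py --nvidiadocker --platform ubuntu_gpu_tensorrt /work/runtime_functions.sh unittest_ubuntu_tensorrt_gpu
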
45 changes: 45 additions & 0 deletions include/mxnet/c_api.h
@@ -1445,6 +1445,9 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
const int **aux_type_data,
int *complete);

/*!
 * \brief Optimize a symbol for execution with TensorRT.
 * \param sym_handle symbol to be optimized
 * \param ret_sym_handle returned optimized symbol
 */
MXNET_DLL int MXTRTSymbolOptimize(SymbolHandle sym_handle,
SymbolHandle *ret_sym_handle);

/*!
* \brief Convert a symbol into a quantized symbol where FP32 operators are replaced with INT8
* \param sym_handle symbol to be converted
@@ -1674,6 +1677,41 @@ MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle,
ExecutorHandle shared_exec_handle,
ExecutorHandle* out);

/*!
 * \brief Bind a symbol to an executor with TensorRT integration enabled;
 *        the parameters mirror those of MXExecutorSimpleBind.
 */
MXNET_DLL int MXExecutorTensorRTBind(SymbolHandle symbol_handle,
int dev_type,
int dev_id,
const mx_uint num_g2c_keys,
const char** g2c_keys,
const int* g2c_dev_types,
const int* g2c_dev_ids,
const mx_uint provided_grad_req_list_len,
const char** provided_grad_req_names,
const char** provided_grad_req_types,
const mx_uint num_provided_arg_shapes,
const char** provided_arg_shape_names,
const mx_uint* provided_arg_shape_data,
const mx_uint* provided_arg_shape_idx,
const mx_uint num_provided_arg_dtypes,
const char** provided_arg_dtype_names,
const int* provided_arg_dtypes,
const mx_uint num_provided_arg_stypes,
const char** provided_arg_stype_names,
const int* provided_arg_stypes,
const mx_uint num_shared_arg_names,
const char** shared_arg_name_list,
int* shared_buffer_len,
const char** shared_buffer_name_list,
NDArrayHandle* shared_buffer_handle_list,
const char*** updated_shared_buffer_name_list,
NDArrayHandle** updated_shared_buffer_handle_list,
mx_uint* num_in_args,
NDArrayHandle** in_args,
NDArrayHandle** arg_grads,
mx_uint* num_aux_states,
NDArrayHandle** aux_states,
ExecutorHandle shared_exec_handle,
ExecutorHandle* out);

/*!
* \brief Return a new executor with the same symbol and shared memory,
* but different input/output shapes.
@@ -1714,6 +1752,13 @@ MXNET_DLL int MXExecutorReshape(int partial_shaping,
NDArrayHandle** aux_states,
ExecutorHandle shared_exec,
ExecutorHandle *out);

/*!
* \brief Get the optimized graph from a graph executor.
*/
MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
SymbolHandle *out);

/*!
* \brief set a call back to notify the completion of operation
*/