From 5402e42dbf0b8a6c3219d319b1d8c415dc03914d Mon Sep 17 00:00:00 2001
From: Marek Kolodziej
Date: Mon, 16 Jul 2018 21:38:56 -0700
Subject: [PATCH] User Dockerfile update and Markdown documentation fix

---
 docker/Dockerfiles/Dockerfile.tensorrt | 142 +++++++++++++++++++++++++
 docs/api/python/contrib/tensorrt.md    |  34 +++++-
 2 files changed, 174 insertions(+), 2 deletions(-)
 create mode 100644 docker/Dockerfiles/Dockerfile.tensorrt

diff --git a/docker/Dockerfiles/Dockerfile.tensorrt b/docker/Dockerfiles/Dockerfile.tensorrt
new file mode 100644
index 000000000000..fc922a2e4b46
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.tensorrt
@@ -0,0 +1,142 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet with TensorRT on Ubuntu 16.04 for GPU
+
+FROM nvidia/cuda:9.0-cudnn7-devel
+
+WORKDIR /work/deps
+
+# Ubuntu-core
+RUN apt-get update && \
+    apt-get install -y \
+    apt-transport-https \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    git \
+    libatlas-base-dev \
+    libcurl4-openssl-dev \
+    libjemalloc-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libopencv-dev \
+    libzmq3-dev \
+    ninja-build \
+    software-properties-common \
+    sudo \
+    unzip \
+    wget \
+    vim
+
+# Ubuntu-Python
+
+RUN apt-get install -y python3-dev python3-pip
+
+# Make Python 3 the default python/pip. ln -sf replaces any existing links
+# and, unlike unlink, does not fail if /usr/bin/python is absent in the base image.
+RUN ln -sf $(which python3) /usr/bin/python && \
+    ln -sf $(which pip3) /usr/bin/pip
+
+RUN pip install nose cpplint==1.3.0 pylint==1.8.3 'numpy<1.15.0,>=1.8.2' \
+    nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 gluoncv
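+
+# Everything below builds the TensorRT-enabled stack in four stages:
+# protobuf (an ONNX dependency), ONNX itself, the TensorRT packages,
+# and finally onnx-tensorrt plus an MXNet build with USE_TENSORRT=1.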
+
+# TensorRT
+
+WORKDIR /work/incubator-mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
+
+# Copy the MXNet source tree (including submodules) into the image.
+COPY . .
+
+# Protobuf
+
+RUN cd /work && \
+    apt-get update && \
+    apt-get install -y automake libtool && \
+    git clone --recursive -b 3.5.1.1 https://github.com/google/protobuf.git && \
+    cd protobuf && \
+    ./autogen.sh && \
+    ./configure && \
+    make -j$(nproc) && \
+    make install && \
+    ldconfig
+
+# Build ONNX
+
+ENV PYVER 3.5
+
+RUN echo "Installing ONNX." && \
+    cd 3rdparty/onnx-tensorrt/third_party/onnx && \
+    rm -rf build && \
+    mkdir -p build && \
+    cd build && \
+    cmake \
+      -DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER} \
+      -DBUILD_SHARED_LIBS=ON .. && \
+    make -j$(nproc) && \
+    make install
+
+# Install the TensorRT repository package, then the TensorRT libraries.
+RUN echo "Installing TensorRT." && \
+    wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0_1-1_amd64.deb && \
+    dpkg -i tensorrt.deb && \
+    apt-get update && \
+    apt-get install -y --allow-downgrades libnvinfer-dev && \
+    rm tensorrt.deb
+
+# Build ONNX-TensorRT
+RUN cd 3rdparty/onnx-tensorrt/ && \
+    mkdir -p build && \
+    cd build && \
+    cmake .. && \
+    make -j$(nproc) && \
+    make install
+
+# Build MXNet with TensorRT support
+RUN make \
+    DEV=1 \
+    USE_BLAS=openblas \
+    USE_CUDA=1 \
+    USE_CUDA_PATH=/usr/local/cuda \
+    USE_CUDNN=1 \
+    USE_DIST_KVSTORE=0 \
+    USE_TENSORRT=1 \
+    ONNX_NAMESPACE=onnx \
+    CUDA_ARCH="-gencode arch=compute_60,code=sm_60 \
+               -gencode arch=compute_61,code=sm_61 \
+               -gencode arch=compute_70,code=sm_70 \
+               -gencode arch=compute_70,code=compute_70" \
+    -j$(nproc)
+
+# Install the freshly built libmxnet.so and the MXNet Python package.
+RUN mv lib/libmxnet.so /usr/local/lib && \
+    ldconfig && \
+    make clean && \
+    cd python && \
+    pip install -e .
diff --git a/docs/api/python/contrib/tensorrt.md b/docs/api/python/contrib/tensorrt.md
index 06c847221153..349b7a2b84fc 100644
--- a/docs/api/python/contrib/tensorrt.md
+++ b/docs/api/python/contrib/tensorrt.md
@@ -27,7 +27,7 @@ The above points ensure that we find a compromise between the flexibility of MXN
-Building MXNet together with TensorRT is somewhat complex. The recipe will hopefully be simplified in the near future, but for now, it's easiest to build a Docker container with a Ubuntu 16.04 base. This Dockerfile can be found under the ci subdirectory of the MXNet repository. You can build the container as follows:
+Building MXNet together with TensorRT is somewhat complex. The recipe will hopefully be simplified in the near future, but for now, it's easiest to build a Docker container with an Ubuntu 16.04 base. This Dockerfile can be found under the docker/Dockerfiles subdirectory of the MXNet repository. You can build the container as follows:
 
 ```
-docker build -t ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt mxnet_with_tensorrt
+docker build -f docker/Dockerfiles/Dockerfile.tensorrt -t mxnet_with_tensorrt .
 ```
 
 Next, we can run this container as follows (don't forget to install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker)):
 
@@ -160,7 +160,7 @@ trt_pct = run_inference(sym, arg_params, aux_params, mnist,
 ```
 Simply switching the flag allows us to go back and forth between MXNet and MXNet-TensorRT inference. See the details in the unit test at `${MXNET_HOME}/tests/python/tensorrt/test_tensorrt_lenet5.py`.
 
-## Running TensorRT with your own models with the Gluon API
+## Running TensorRT with your own image classification models with the Gluon API
 
-**Note:** Please first read the previous section titled "Running TensorRT with your own models with the symbolic API" - it contains information that will also be useful for Gluonusers.
+**Note:** Please first read the previous section titled "Running TensorRT with your own models with the symbolic API" - it contains information that will also be useful for Gluon users.
@@ -199,6 +199,36 @@ gluon.data.DataLoader(
 ```
 
 For more details, see the unit test examples at `${MXNET_HOME}/tests/python/tensorrt/test_tensorrt_resnet_resnext.py`.
 
+## Running TensorRT with your own object detection models with the Gluon API
+
+The process for object detection is almost exactly the same as for image classification. Note, however, that an object detection network such as [SSD](https://arxiv.org/pdf/1512.02325.pdf) has three outputs:
+
+* anchors
+* predicted classes
+* bounding box coordinates
+
+The symbols that are bound for execution are the model outputs. If we didn't group them, we'd have three bound symbols, and most of the TensorRT graph segmentation, memory allocation, etc. would be repeated for each. To work around that, we can group the output symbols using the concatenation symbol. This concatenates the tensors into a single output, which we can later split when manipulating the model output in NumPy.
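+
+To see why the single concatenated tensor is easy to take apart again, here is a minimal NumPy sketch of the concat/split round trip. The widths used below (4 anchor coordinates, 81 class scores for SSD's 80 COCO classes plus background, and 4 box offsets per anchor) are illustrative assumptions rather than values read off the network:
+
+```python
+import numpy as np
+
+# Three dummy outputs sharing the (batch, num_anchors) leading dimensions.
+anchors = np.zeros((1, 100, 4))
+class_preds = np.zeros((1, 100, 81))
+box_preds = np.zeros((1, 100, 4))
+
+merged = np.concatenate([anchors, class_preds, box_preds], axis=2)  # shape (1, 100, 89)
+# Splitting at the cumulative widths [4, 4 + 81] recovers the original arrays.
+a, c, b = np.split(merged, indices_or_sections=[4, 85], axis=2)
+assert a.shape == anchors.shape and c.shape == class_preds.shape and b.shape == box_preds.shape
+```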
+
+Let's start with the concatenation:
+
+```python
+import mxnet as mx
+import gluoncv
+
+# Assumed for this sketch: a GPU context (required for TensorRT) and a batch size of 1.
+ctx = mx.gpu(0)
+batch_size = 1
+
+net = gluoncv.model_zoo.get_model(model_name='ssd_512_mobilenet1_0_coco', pretrained=True)
+data = mx.sym.var('data')
+anchors, class_preds, box_preds = net(data)
+all_preds = mx.sym.concat(anchors, class_preds, box_preds, dim=2)
+all_params = {k: v.data() for k, v in net.collect_params().items()}
+executor = all_preds.simple_bind(ctx=ctx, data=(batch_size, 3, 224, 224), grad_req='null',
+                                 shared_buffer=all_params, force_rebind=True)
+```
+
+Now that the symbol to be bound concatenates the anchors, class predictions and box predictions, we can run inference and split the output at the end:
+
+```python
+import numpy as np
+# `batch` is assumed to be an input NDArray of shape (batch_size, 3, 224, 224).
+executor.arg_dict["data"][:] = batch
+executor.forward(is_train=False)
+results = executor.outputs[0].asnumpy()
+# Split at the cumulative widths of the three outputs along axis 2:
+# 4 anchor coordinates, num_classes + 1 class scores (81 for COCO), 4 box offsets.
+anchors, class_preds, box_preds = np.split(results, indices_or_sections=[4, 85], axis=2)
+```
+
+As you can see, splitting the three outputs after prediction is as easy as their initial concatenation.
+
 ## Examples
 
 The sections above describe how to launch unit tests on pre-trained models as examples. For cross-reference, the launch shell scripts have also been added [here](../../../../example/image-classification/tensorrt/README.md).