diff --git a/3rdparty/onnx-tensorrt b/3rdparty/onnx-tensorrt
index f1c7aa63d88d..1e209e546061 160000
--- a/3rdparty/onnx-tensorrt
+++ b/3rdparty/onnx-tensorrt
@@ -1 +1 @@
-Subproject commit f1c7aa63d88d8d8ef70490f2ebb6b33f7450218b
+Subproject commit 1e209e546061173ccc37b25bbca69a795c6c86e4
diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt
index f4844115c0fd..8ad90aedeb6f 100644
--- a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt
+++ b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt
@@ -39,3 +39,4 @@ COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
 
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
+ENV CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:/usr/local/cuda-10.0/targets/x86_64-linux/include/
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 9a24b7516128..333e32746567 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1877,7 +1877,6 @@ MXNET_DLL int MXExecutorReshape(int partial_shaping,
  */
 MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
                                            SymbolHandle *out);
-
 /*!
  * \brief set a call back to notify the completion of operation
  */
diff --git a/python/mxnet/contrib/tensorrt.py b/python/mxnet/contrib/tensorrt.py
index 4ff39c4b4829..d94600e1cee3 100644
--- a/python/mxnet/contrib/tensorrt.py
+++ b/python/mxnet/contrib/tensorrt.py
@@ -16,95 +16,50 @@
 # under the License.
 """ Module to enable the use of TensorRT optimized graphs."""
-
-import ctypes
-import logging
 import os
-from .. import symbol as sym
-
-from ..base import _LIB, SymbolHandle, MXNetError
-from ..base import check_call
-
-
-def set_use_tensorrt(status):
+def set_use_fp16(status):
     """
-    Set an environment variable which will enable or disable the use of TensorRT in the backend.
-    Note: this is useful for A/B testing purposes.
-    :param status: Boolean, true if TensorRT optimization should be applied, False for legacy
-    behaviour.
+    Set an environment variable which will enable or disable the use of FP16 precision in
+    TensorRT.
+    Note: FP16 mode forces the whole TensorRT node to be executed in FP16.
+    :param status: Boolean, True if TensorRT should run in FP16, False for FP32.
     """
-    os.environ["MXNET_USE_TENSORRT"] = str(int(status))
-
+    os.environ["MXNET_TENSORRT_USE_FP16"] = str(int(status))
 
-def get_use_tensorrt():
+def get_use_fp16():
     """
-    Get an environment variable which describes if TensorRT is currently enabled in the backend.
-    Note: this is useful for A/B testing purposes.
-    :return: Boolean, true if TensorRT optimization should be applied, False for legacy
-    behaviour.
+    Get the environment variable which describes whether TensorRT is currently running in FP16.
+    :return: Boolean, True if TensorRT is running in FP16, False for FP32.
     """
-    return bool(int(os.environ.get("MXNET_USE_TENSORRT", 0)) == 1)
+    return bool(int(os.environ.get("MXNET_TENSORRT_USE_FP16", 1)) == 1)
 
-
-def get_optimized_symbol(executor):
+def init_tensorrt_params(sym, arg_params, aux_params):
     """
-    Take an executor's underlying symbol graph and return its generated optimized version.
-
-    Parameters
-    ----------
-    executor :
-        An executor for which you want to see an optimized symbol. Getting an optimized symbol
-        is useful to compare and verify the work TensorRT has done against a legacy behaviour.
-
-    Returns
-    -------
-    symbol : nnvm::Symbol
-        The nnvm symbol optimized.
-    """
-    handle = SymbolHandle()
-    try:
-        check_call(_LIB.MXExecutorGetOptimizedSymbol(executor.handle, ctypes.byref(handle)))
-        result = sym.Symbol(handle=handle)
-        return result
-    except MXNetError:
-        logging.error('Error while trying to fetch TRT optimized symbol for graph. Please ensure '
-                      'build was compiled with MXNET_USE_TENSORRT enabled.')
-        raise
-
-
-def tensorrt_bind(symbol, ctx, all_params, type_dict=None, stype_dict=None, group2ctx=None,
-                  **kwargs):
-    """Bind current symbol to get an optimized trt executor.
-
-    Parameters
-    ----------
-    symbol : Symbol
-        The symbol you wish to bind, and optimize with TensorRT.
-
-    ctx : Context
-        The device context the generated executor to run on.
-
-    all_params : Dict of str->ndarray
-        A dictionary of mappings from parameter names to parameter NDArrays.
-
-    type_dict : Dict of str->numpy.dtype
-        Input type dictionary, name->dtype
-
-    stype_dict : Dict of str->str
-        Input storage type dictionary, name->storage_type
-
-    group2ctx : Dict of string to mx.Context
-        The dict mapping the `ctx_group` attribute to the context assignment.
-
-    kwargs : Dict of str->shape
-        Input shape dictionary, name->shape
-
-    Returns
-    -------
-    executor : mxnet.Executor
-        An optimized TensorRT executor.
+    Set weights in attributes of TensorRT nodes.
+    :param sym: Symbol, the symbol graph; it should contain some TensorRT nodes
+    :param arg_params: Dict of str->NDArray, argument parameters of the graph
+    :param aux_params: Dict of str->NDArray, auxiliary parameters of the graph
+    :return arg_params, aux_params: the remaining params that are not used by TensorRT nodes
     """
-    kwargs['shared_buffer'] = all_params
-    return symbol.simple_bind(ctx, type_dict=type_dict, stype_dict=stype_dict,
-                              group2ctx=group2ctx, **kwargs)
+    for s in sym.get_internals():
+        new_params_names = ""
+        tensorrt_params = {}
+        if 'subgraph_params_names' in s.list_attr():
+            keys = s.list_attr()['subgraph_params_names'].split(';')
+            for k in keys:
+                if k in arg_params:
+                    new_params_names += k + ";"
+                    tensorrt_params['subgraph_param_' + k] = arg_params[k]
+                    arg_params.pop(k)
+                elif k in aux_params:
+                    new_params_names += k + ";"
+                    tensorrt_params['subgraph_param_' + k] = aux_params[k]
+                    aux_params.pop(k)
+            new_attrs = {}
+            for k, v in tensorrt_params.items():
+                new_attrs[k] = str(v.handle.value)
+            if len(new_attrs) > 0:
+                s._set_attr(**new_attrs)
+                s._set_attr(subgraph_params_names=new_params_names[:-1])
+    return arg_params, aux_params
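# --- Illustrative usage sketch (not part of this patch) --------------------------------
# A minimal end-to-end inference sketch built on the helpers added above. It assumes the
# symbol has already been partitioned into TensorRT subgraph nodes (for example via the
# subgraph backend, which is not shown in this diff) and that a checkpoint with prefix
# 'model' exists; the model prefix, input shape and context are placeholders.
import mxnet as mx
from mxnet.contrib import tensorrt as trt

sym, arg_params, aux_params = mx.model.load_checkpoint('model', 0)
trt_sym = sym.get_backend_symbol('TensorRT')  # assumed partitioning step, outside this diff
trt.set_use_fp16(True)  # optional: FP16 is already the default for the TRT nodes
# Move the weights consumed by TensorRT nodes into node attributes; the remaining
# (non-TensorRT) parameters are returned and bound as usual.
arg_params, aux_params = trt.init_tensorrt_params(trt_sym, arg_params, aux_params)
executor = trt_sym.simple_bind(ctx=mx.gpu(0), data=(1, 3, 224, 224), grad_req='null')
executor.copy_params_from(arg_params, aux_params)
batch = mx.nd.zeros((1, 3, 224, 224), ctx=mx.gpu(0))
outputs = executor.forward(is_train=False, data=batch)
# ----------------------------------------------------------------------------------------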
- """ - handle = SymbolHandle() - try: - check_call(_LIB.MXExecutorGetOptimizedSymbol(executor.handle, ctypes.byref(handle))) - result = sym.Symbol(handle=handle) - return result - except MXNetError: - logging.error('Error while trying to fetch TRT optimized symbol for graph. Please ensure ' - 'build was compiled with MXNET_USE_TENSORRT enabled.') - raise - - -def tensorrt_bind(symbol, ctx, all_params, type_dict=None, stype_dict=None, group2ctx=None, - **kwargs): - """Bind current symbol to get an optimized trt executor. - - Parameters - ---------- - symbol : Symbol - The symbol you wish to bind, and optimize with TensorRT. - - ctx : Context - The device context the generated executor to run on. - - all_params : Dict of str->ndarray - A dictionary of mappings from parameter names to parameter NDArrays. - - type_dict : Dict of str->numpy.dtype - Input type dictionary, name->dtype - - stype_dict : Dict of str->str - Input storage type dictionary, name->storage_type - - group2ctx : Dict of string to mx.Context - The dict mapping the `ctx_group` attribute to the context assignment. - - kwargs : Dict of str->shape - Input shape dictionary, name->shape - - Returns - ------- - executor : mxnet.Executor - An optimized TensorRT executor. + Set weights in attributes of TensorRT nodes + :param sym: Symbol, the symbol graph should contains some TensorRT nodes + :param arg_params: arg_params + :param aux_params: aux_params + :return arg_params, aux_params: remaining params that are not in TensorRT nodes """ - kwargs['shared_buffer'] = all_params - return symbol.simple_bind(ctx, type_dict=type_dict, stype_dict=stype_dict, - group2ctx=group2ctx, **kwargs) + for s in sym.get_internals(): + new_params_names = "" + tensorrt_params = {} + if 'subgraph_params_names' in s.list_attr(): + keys = s.list_attr()['subgraph_params_names'].split(';') + for k in keys: + if k in arg_params: + new_params_names += k + ";" + tensorrt_params['subgraph_param_' + k] = arg_params[k] + arg_params.pop(k) + elif k in aux_params: + new_params_names += k + ";" + tensorrt_params['subgraph_param_' + k] = aux_params[k] + aux_params.pop(k) + new_attrs = {} + for k, v in tensorrt_params.items(): + new_attrs[k] = str(v.handle.value) + if len(new_attrs) > 0: + s._set_attr(**new_attrs) + s._set_attr(subgraph_params_names=new_params_names[:-1]) + return arg_params, aux_params diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index a2e8bb810e6f..0440e7cfb0fc 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -27,9 +27,6 @@ #include #include "./c_api_common.h" #include "../executor/graph_executor.h" -#if MXNET_USE_TENSORRT -#include "../executor/trt_graph_executor.h" -#endif // MXNET_USE_TENSORRT int MXExecutorPrint(ExecutorHandle handle, const char **out_str) { Executor *exec = static_cast(handle); @@ -441,38 +438,12 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, std::vector in_arg_vec; std::vector arg_grad_vec; std::vector aux_state_vec; -#if MXNET_USE_TENSORRT - // If we've built with TensorRT support we by default return an TRTExecutor. - // Users can override this behaviour via env var, which is useful for example for A/B - // performance testing. - if (dmlc::GetEnv("MXNET_USE_TENSORRT", false)) { - *out = exec::TrtGraphExecutor::TensorRTBind(*sym, ctx, ctx_map, &in_arg_ctx_vec, - &arg_grad_ctx_vec, &aux_state_ctx_vec, - &arg_shape_map, &arg_dtype_map, &arg_stype_map, - &grad_req_type_vec, shared_arg_name_set, - &in_arg_vec, &arg_grad_vec, &aux_state_vec, - use_shared_buffer ? 
diff --git a/src/common/serialization.h b/src/common/serialization.h
deleted file mode 100644
index 8192ee210a1c..000000000000
--- a/src/common/serialization.h
+++ /dev/null
@@ -1,318 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors - * \file serialization.h - * \brief Serialization of some STL and nnvm data-structures - * \author Clement Fuji Tsang - */ - -#ifndef MXNET_COMMON_SERIALIZATION_H_ -#define MXNET_COMMON_SERIALIZATION_H_ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace mxnet { -namespace common { - -template -inline size_t SerializedSize(const T &obj); - -template -inline size_t SerializedSize(const nnvm::Tuple &obj); - -template -inline size_t SerializedSize(const std::map &obj); - -template<> -inline size_t SerializedSize(const std::string &obj); - -template -inline size_t SerializedSize(const std::tuple &obj); - -template -inline void Serialize(const T &obj, char **buffer); - -template -inline void Serialize(const nnvm::Tuple &obj, char **buffer); - -template -inline void Serialize(const std::map &obj, char **buffer); - -template<> -inline void Serialize(const std::string &obj, char **buffer); - -template -inline void Serialize(const std::tuple &obj, char **buffer); - -template -inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos); - -template<> -inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos); - -template -inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos); - - -template -struct is_container { - static const bool value = !std::is_pod::value; -}; - -template -inline size_t SerializedSize(const T &obj) { - return sizeof(T); -} - -template -inline size_t SerializedSize(const nnvm::Tuple &obj) { - if (is_container::value) { - size_t sum_val = 4; - for (const auto& el : obj) { - sum_val += SerializedSize(el); - } - return sum_val; - } else { - return 4 + (obj.ndim() * sizeof(T)); - } -} - -template -inline size_t SerializedSize(const std::map &obj) { - size_t sum_val = 4; - if (is_container::value && is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.first) + SerializedSize(p.second); - } - } else if (is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.first); - } - sum_val += sizeof(V) * obj.size(); - } else if (is_container::value) { - for (const auto& p : obj) { - sum_val += SerializedSize(p.second); - } - sum_val += sizeof(K) * obj.size(); - } else { - sum_val += (sizeof(K) + sizeof(V)) * obj.size(); - } - return sum_val; -} - -template<> -inline size_t SerializedSize(const std::string &obj) { - return obj.size() + 4; -} - -template -struct serialized_size_tuple { - template - static inline size_t Compute(const std::tuple &obj) { - return SerializedSize(std::get(obj)) + serialized_size_tuple::Compute(obj); - } -}; - -template<> -struct serialized_size_tuple<0> { - template - static inline size_t Compute(const std::tuple &obj) { - return SerializedSize(std::get<0>(obj)); - } -}; - -template -inline size_t SerializedSize(const std::tuple &obj) { - return serialized_size_tuple::Compute(obj); -} - -// Serializer - -template -inline size_t SerializedContainerSize(const T &obj, char **buffer) { - uint32_t size = obj.size(); - std::memcpy(*buffer, &size, 4); - *buffer += 4; - return (size_t) size; -} - -template -inline void Serialize(const T &obj, char **buffer) { - std::memcpy(*buffer, &obj, sizeof(T)); - *buffer += sizeof(T); 
-} - -template -inline void Serialize(const nnvm::Tuple &obj, char **buffer) { - uint32_t size = obj.ndim(); - std::memcpy(*buffer, &size, 4); - *buffer += 4; - for (auto& el : obj) { - Serialize(el, buffer); - } -} - -template -inline void Serialize(const std::map &obj, char **buffer) { - SerializedContainerSize(obj, buffer); - for (auto& p : obj) { - Serialize(p.first, buffer); - Serialize(p.second, buffer); - } -} - -template<> -inline void Serialize(const std::string &obj, char **buffer) { - auto size = SerializedContainerSize(obj, buffer); - std::memcpy(*buffer, &obj[0], size); - *buffer += size; -} - -template -struct serialize_tuple { - template - static inline void Compute(const std::tuple &obj, char **buffer) { - serialize_tuple::Compute(obj, buffer); - Serialize(std::get(obj), buffer); - } -}; - -template<> -struct serialize_tuple<0> { - template - static inline void Compute(const std::tuple &obj, char **buffer) { - Serialize(std::get<0>(obj), buffer); - } -}; - -template -inline void Serialize(const std::tuple &obj, char **buffer) { - serialize_tuple::Compute(obj, buffer); -} - -// Deserializer - -template -inline size_t DeserializedContainerSize(T *obj, const std::string &buffer, size_t *curr_pos) { - uint32_t size = obj->size(); - std::memcpy(&size, &buffer[*curr_pos], 4); - *curr_pos += 4; - return (size_t) size; -} - -template -inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos) { - std::memcpy(obj, &buffer[*curr_pos], sizeof(T)); - *curr_pos += sizeof(T); -} - -template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos) { - uint32_t size = obj->ndim(); - std::memcpy(&size, &buffer[*curr_pos], 4); - *curr_pos += 4; - obj->SetDim(size); - for (size_t i = 0; i < size; ++i) { - Deserialize((*obj)[i], buffer, curr_pos); - } -} - -template -inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos) { - auto size = DeserializedContainerSize(obj, buffer, curr_pos); - K first; - for (size_t i = 0; i < size; ++i) { - Deserialize(&first, buffer, curr_pos); - Deserialize(&(*obj)[first], buffer, curr_pos); - } -} - -template<> -inline void Deserialize(std::string *obj, const std::string &buffer, size_t *curr_pos) { - auto size = DeserializedContainerSize(obj, buffer, curr_pos); - obj->resize(size); - std::memcpy(&(obj->front()), &buffer[*curr_pos], size); - *curr_pos += size; -} - -template -struct deserialize_tuple { - template - static inline void Compute(std::tuple *obj, - const std::string &buffer, size_t *curr_pos) { - deserialize_tuple::Compute(obj, buffer, curr_pos); - Deserialize(&std::get(*obj), buffer, curr_pos); - } -}; - -template<> -struct deserialize_tuple<0> { - template - static inline void Compute(std::tuple *obj, - const std::string &buffer, size_t *curr_pos) { - Deserialize(&std::get<0>(*obj), buffer, curr_pos); - } -}; - -template -inline void Deserialize(std::tuple *obj, const std::string &buffer, size_t *curr_pos) { - deserialize_tuple::Compute(obj, buffer, curr_pos); -} - - -template -inline void Serialize(const T& obj, std::string* serialized_data) { - serialized_data->resize(SerializedSize(obj)); - char* curr_pos = &(serialized_data->front()); - Serialize(obj, &curr_pos); - CHECK_EQ((int64_t)curr_pos - (int64_t)&(serialized_data->front()), - serialized_data->size()); -} - -template -inline void Deserialize(T* obj, const std::string& serialized_data) { - size_t curr_pos = 0; - Deserialize(obj, serialized_data, &curr_pos); - CHECK_EQ(curr_pos, serialized_data.size()); -} - -} // 
namespace common -} // namespace mxnet -#endif // MXNET_COMMON_SERIALIZATION_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index dd4132301346..acf20de54beb 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -202,18 +202,6 @@ Graph InferStorageType(Graph&& graph, StorageTypeVector&& storage_type_inputs = StorageTypeVector(), const std::string& storage_type_attr_key = ""); -#if MXNET_USE_TENSORRT -/*! - * \brief Replace subgraphs by TRT (forward only) - */ -Graph ReplaceSubgraph(Graph&& g, - const std::unordered_set& set_subgraph, - std::unordered_map* const params_map); - -std::vector> GetTrtCompatibleSubsets(const Graph& g, - std::unordered_map* const params_map); -#endif - } // namespace exec } // namespace mxnet diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 28862a49fae1..1813c0723bb4 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -101,6 +101,16 @@ void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) os << "Total " << 11 << " TempSpace resource requested\n"; } +/*! + * \brief Return the "optimized" symbol contained in the executor graph. + */ +nnvm::Symbol GraphExecutor::GetOptimizedSymbol() { + Symbol ret; + ret.outputs = std::vector(graph_.outputs.begin(), + graph_.outputs.begin() + num_forward_outputs_); + return ret.Copy(); +} + void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback, bool monitor_all) { CHECK(callback) << "invalid callback"; monitor_callback_ = callback; diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index b556a2bd0fe9..9a8660916357 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -68,6 +68,7 @@ class GraphExecutor : public Executor { const std::unordered_map& arg_grad_map() const override; const std::unordered_map& aux_state_map() const override; void Print(std::ostream &os) const override; // NOLINT(*) + nnvm::Symbol GetOptimizedSymbol(); void SetMonitorCallback(const MonitorCallback& callback, bool monitor_all = false) override; // Initialize the rest of attributes // after setting up arguments. diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc deleted file mode 100644 index f847d59a1298..000000000000 --- a/src/executor/tensorrt_pass.cc +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2018 by Contributors - * \file tensorrt_pass.cc - * \brief Replace TRT compatible subgraphs by TRT engines - * \author Clement Fuji Tsang - */ - -#if MXNET_USE_TENSORRT - -#include -#include -#include -#include -#include -#include - -#include "../operator/contrib/nnvm_to_onnx-inl.h" -#include "./exec_pass.h" -#include "./onnx_to_tensorrt.h" - -namespace mxnet { -namespace exec { - -using NodePtr = nnvm::NodePtr; - -/*! - * \brief Custom graph class, which will contain bi-directional nodes - * we need to compute DFS and reverse DFS for graph partitioning - */ -class BidirectionalGraph { - public: - struct Node { - nnvm::Node* nnvmptr; - std::vector inputs; - std::vector outputs; - }; - std::vector nodes; - std::unordered_map nnvm2nid; - std::vector outputs; - static const std::unordered_set unconditionalTRTop; - - explicit BidirectionalGraph(const Graph &g) { - auto& idx = g.indexed_graph(); - auto num_nodes = idx.num_nodes(); - nodes.reserve(num_nodes); - nnvm2nid.reserve(num_nodes); - outputs.reserve(idx.outputs().size()); - DFSVisit(g.outputs, [this](const nnvm::NodePtr& n) { - BidirectionalGraph::Node new_node; - new_node.nnvmptr = n.get(); - nnvm2nid[n.get()] = static_cast(nodes.size()); - nodes.emplace_back(std::move(new_node)); - }); - for (const auto& it : nnvm2nid) { - nnvm::Node* nnvmnode = it.first; - uint32_t nid = it.second; - for (auto& n : nnvmnode->inputs) { - uint32_t input_nid = nnvm2nid[n.node.get()]; - nodes[input_nid].outputs.emplace_back(&nodes[nid]); - nodes[nid].inputs.emplace_back(&nodes[input_nid]); - } - } - for (auto& e : g.outputs) { - uint32_t nid = nnvm2nid[e.node.get()]; - outputs.emplace_back(&nodes[nid]); - } - } - - template - void DFS(const std::vector& heads, bool reverse, FVisit fvisit) { - std::unordered_set visited; - std::vector vec(heads.begin(), heads.end()); - visited.reserve(heads.size()); - while (!vec.empty()) { - Node* vertex = vec.back(); - vec.pop_back(); - if (visited.count(vertex) == 0) { - visited.insert(vertex); - fvisit(vertex); - std::vector nexts = reverse ? 
vertex->inputs : vertex->outputs; - for (Node* node : nexts) { - if (visited.count(node) == 0) { - vec.emplace_back(node); - } - } - } - } - } - - using t_pairset = std::pair, std::unordered_set>; - using t_pairvec = std::pair, std::vector>; - using t_uncomp_map = std::unordered_map>; - - std::unordered_set naive_grow_subgraph(Node* head, - std::unordered_set* set_unused, - t_uncomp_map* uncomp_map) { - std::unordered_set subgraph; - std::unordered_set uncomp_set; - std::deque stack; - stack.emplace_back(head); - while (!stack.empty()) { - Node* vertex = stack.back(); - stack.pop_back(); - if (set_unused->count(vertex) && !uncomp_set.count(vertex)) { - set_unused->erase(vertex); - subgraph.insert(vertex); - uncomp_set.insert((*uncomp_map)[vertex].begin(), (*uncomp_map)[vertex].end()); - for (Node* input : vertex->inputs) { - if (set_unused->count(input) && !uncomp_set.count(input)) { - stack.emplace_back(input); - } - } - for (Node* output : vertex->outputs) { - if (set_unused->count(output) && !uncomp_set.count(output)) { - stack.emplace_back(output); - } - } - } - } - return subgraph; - } - - std::vector> get_subsets( - std::unordered_map* const params_map) { - std::vector> subgraphs; - std::unordered_set set_nonTRTnodes; - std::unordered_set set_allnodes(nodes.size()); - std::vector separation_sets; - for (Node& node : nodes) { - if (!IsTRTCompatible(node.nnvmptr)) { - set_nonTRTnodes.insert(&node); - std::unordered_set in_graph; - std::unordered_set out_graph; - std::vector dummy_head; - dummy_head.emplace_back(&node); - DFS(dummy_head, false, [&out_graph](Node* node) { - out_graph.insert(node); - }); - DFS(dummy_head, true, [&in_graph](Node* node) { - in_graph.insert(node); - }); - separation_sets.emplace_back(std::make_pair(in_graph, out_graph)); - } - set_allnodes.emplace(&node); - } - t_uncomp_map uncomp_map; - std::unordered_set set_TRTnodes; - set_TRTnodes.insert(set_allnodes.begin(), set_allnodes.end()); - for (Node* n : set_nonTRTnodes) { - set_TRTnodes.erase(n); - } - for (Node* n : set_TRTnodes) { - for (t_pairset p : separation_sets) { - if (p.first.count(n)) { - uncomp_map[n].insert(p.second.begin(), p.second.end()); - } else if (p.second.count(n)) { - uncomp_map[n].insert(p.first.begin(), p.first.end()); - } - } - for (Node* nonTRTn : set_nonTRTnodes) { - uncomp_map[n].erase(nonTRTn); - } - } - std::unordered_set set_unused; - set_unused.reserve(set_TRTnodes.size()); - - for (auto& n : set_TRTnodes) { - if (n->nnvmptr->attrs.op != nullptr || params_map->count(n->nnvmptr->attrs.name)) { - set_unused.insert(n); - } - } - std::unordered_set visited; - std::deque stack(outputs.begin(), outputs.end()); - while (!stack.empty()) { - Node* vertex = stack.front(); - stack.pop_front(); - if (!visited.count(vertex)) { - visited.insert(vertex); - if (set_unused.count(vertex)) { - subgraphs.emplace_back(naive_grow_subgraph(vertex, &set_unused, &uncomp_map)); - } - for (Node* input : vertex->inputs) { - stack.emplace_back(input); - } - } - } - - return subgraphs; - } - - - private: - friend class Graph; - - bool IsTRTCompatible(nnvm::Node* nodeptr) { - if (nodeptr->op() == nullptr) { - return true; - } - - const std::string op_name = nodeptr->op()->name; - if (op_name == "Pooling") { - return (nodeptr->attrs.dict.at("pool_type") == "avg" || - nodeptr->attrs.dict.at("pool_type") == "max"); - } - - if (unconditionalTRTop.count(op_name)) { - return true; - } - - if (op_name == "Activation") { - return nodeptr->attrs.dict.at("act_type") == "relu" || - nodeptr->attrs.dict.at("act_type") == 
"tanh" || - nodeptr->attrs.dict.at("act_type") == "sigmoid"; - } - - return false; - } -}; // class BidirectionalGraph - -/*! - * \brief function which transform std::vector back to Attrs (dmlc::any) - */ -const std::unordered_set BidirectionalGraph::unconditionalTRTop = { - "Convolution", - "BatchNorm", - "elemwise_add", - "elemwise_sub", - "elemwise_mul", - "rsqrt", - "pad", - "Pad", - "mean", - "FullyConnected", - "Flatten", - "SoftmaxOutput", -}; - - -using NodeEntrySet = std::unordered_set; - -/*! - * \brief get the output nodes of the subgraph in the main graph - * \return a vector of the output nodes -*/ -std::vector GetSubgraphNodeEntries(Graph g, - std::unordered_set set_subgraph) { - std::vector outputs; - NodeEntrySet _outputs; - for (auto& e : g.outputs) { - if (set_subgraph.count(e.node.get())) { - _outputs.insert(e); - } - } - DFSVisit(g.outputs, [&set_subgraph, &_outputs](const nnvm::NodePtr &node){ - if (!set_subgraph.count(node.get())) { - for (auto& e : node->inputs) { - if (set_subgraph.count(e.node.get())) { - _outputs.insert(e); - } - } - } - }); - outputs.insert(outputs.begin(), _outputs.begin(), _outputs.end()); - return outputs; -} - - -/*! - * \brief get the nodes outside of the subgraph for which outputs are used in the subgraph - * \return a vector the nodes -*/ -std::vector GetSubgraphInterfaceNodes(Graph g, - std::unordered_set set_subgraph) { - std::vector inputs; - NodeEntrySet _inputs; - DFSVisit(g.outputs, [&set_subgraph, &_inputs](const nnvm::NodePtr &node){ - if (set_subgraph.count(node.get())) { - for (auto& e : node->inputs) { - if (!set_subgraph.count(e.node.get())) { - _inputs.insert(e); - } - } - } - }); - inputs.insert(inputs.begin(), _inputs.begin(), _inputs.end()); - return inputs; -} - -std::unordered_map GetGraphInputsMap(const Graph& g) { - std::unordered_map outputs; - auto& idx = g.indexed_graph(); - outputs.reserve(idx.num_nodes()); - std::vector input_nodes = idx.input_nodes(); - for (size_t i = 0; i < input_nodes.size(); ++i) { - outputs[input_nodes[i]] = static_cast(i); - } - return outputs; -} - -/*! - * \brief Dummy function which creates a fake TensorRT Node - */ -nnvm::NodePtr ConvertNnvmGraphToOnnx(const nnvm::Graph &g, - std::unordered_map* const params_map) { - auto p = nnvm::Node::Create(); - p->attrs.op = nnvm::Op::Get("_trt_op"); - op::ONNXParam onnx_param = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(g, params_map); - p->attrs.dict["serialized_output_map"] = onnx_param.serialized_output_map; - p->attrs.dict["serialized_input_map"] = onnx_param.serialized_input_map; - p->attrs.dict["serialized_onnx_graph"] = onnx_param.serialized_onnx_graph; - if (p->op()->attr_parser != nullptr) { - p->op()->attr_parser(&(p->attrs)); - } - return p; -} - -/*! 
- * \brief Update attributes of the graph (such as some inputs properties) - */ -Graph UpdateSubgraphAttrs(Graph&& subgraph, const Graph& g, - const std::unordered_map& old2new, - const nnvm::NodeEntryMap& main_input_entry_to_sub) { - const auto& idx = g.indexed_graph(); - const auto& sub_idx = subgraph.indexed_graph(); - - const auto& shape = g.GetAttr("shape"); - const auto& dtype = g.GetAttr("dtype"); - const auto& storage_type = g.GetAttr("storage_type"); - const auto& shape_inputs = g.GetAttr("shape_inputs"); - const auto& dtype_inputs = g.GetAttr("dtype_inputs"); - const auto& storage_type_inputs = g.GetAttr("storage_type_inputs"); - - mxnet::ShapeVector sub_shape(sub_idx.num_node_entries()); - nnvm::DTypeVector sub_dtype(sub_idx.num_node_entries()); - StorageTypeVector sub_storage_type(sub_idx.num_node_entries()); - mxnet::ShapeVector sub_shape_inputs(sub_idx.input_nodes().size()); - nnvm::DTypeVector sub_dtype_inputs(sub_idx.input_nodes().size()); - StorageTypeVector sub_storage_type_inputs(sub_idx.input_nodes().size()); - - const std::unordered_map inputsindex2pos = GetGraphInputsMap(g); - const std::unordered_map sub_inputsindex2pos = GetGraphInputsMap(subgraph); - // map attributes from graph to subgraph - for (auto& p : old2new) { - const uint32_t nid = idx.node_id(p.first); - const uint32_t sub_nid = sub_idx.node_id(p.second.get()); - const nnvm::Op* op = sub_idx[sub_nid].source->op(); - if (op == nullptr) { // if it's an input node, there is only one output node entry - const uint32_t sub_i = sub_idx.entry_id(sub_nid, 0); - const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); - const uint32_t i = idx.entry_id(nid, 0); - - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - sub_shape_inputs[sub_input_i] = shape_inputs[inputsindex2pos.at(nid)]; - sub_dtype_inputs[sub_input_i] = dtype_inputs[inputsindex2pos.at(nid)]; - sub_storage_type_inputs[sub_input_i] = storage_type_inputs[inputsindex2pos.at(nid)]; - - } else { - for (size_t oi = 0; oi < op->num_outputs; ++oi) { - const uint32_t sub_i = sub_idx.entry_id(sub_nid, oi); - const uint32_t i = idx.entry_id(nid, oi); - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - } - } - } - // old2new doesn't contain placeholder / interfaces - for (auto& p : main_input_entry_to_sub) { - nnvm::NodeEntry main_entry = p.first; - nnvm::NodeEntry sub_entry = p.second; - const uint32_t sub_nid = sub_idx.node_id(sub_entry.node.get()); - const uint32_t sub_i = sub_idx.entry_id(sub_entry); - const uint32_t i = idx.entry_id(main_entry); - const uint32_t sub_input_i = sub_inputsindex2pos.at(sub_nid); - sub_shape[sub_i] = shape[i]; - sub_dtype[sub_i] = dtype[i]; - sub_storage_type[sub_i] = storage_type[i]; - sub_shape_inputs[sub_input_i] = sub_shape[sub_i]; - sub_dtype_inputs[sub_input_i] = sub_dtype[sub_i]; - sub_storage_type_inputs[sub_input_i] = sub_storage_type[sub_i]; - } - subgraph.attrs["shape"] = - std::make_shared(std::move(sub_shape)); - subgraph.attrs["dtype"] = - std::make_shared(std::move(sub_dtype)); - subgraph.attrs["storage_type"] = - std::make_shared(std::move(sub_storage_type)); - subgraph.attrs["shape_inputs"] = - std::make_shared(std::move(sub_shape_inputs)); - subgraph.attrs["dtype_inputs"] = - std::make_shared(std::move(sub_dtype_inputs)); - subgraph.attrs["storage_type_inputs"] = - std::make_shared(std::move(sub_storage_type_inputs)); - - return subgraph; -} - -/*! 
- * \brief Generate a name for a new TRT node, avoid collision if some TRT_nodes are already defined - */ -const std::string GetNewTrtName(const Graph& g, const Graph& subgraph) { - const std::string name_prefix("TRT_node"); - std::unordered_set name_set; - DFSVisit(g.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) { - if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) { - name_set.insert(node->attrs.name); - } - }); - // name inside the subgraph will be avaible as they will be removed - DFSVisit(subgraph.outputs, [&name_set, &name_prefix](const nnvm::NodePtr& node) { - if (node->attrs.name.compare(0, name_prefix.size(), name_prefix) == 0) { - name_set.erase(node->attrs.name); - } - }); - uint32_t name_suffix = 0; - std::string full_name = name_prefix + std::to_string(name_suffix); - while (name_set.count(full_name)) { - full_name = name_prefix + std::to_string(++name_suffix); - } - return full_name; -} - -/*! - * \brief helper function to display what nodes are in a specific subset - */ -void dispNodesSet(Graph g, std::unordered_set s) { - DFSVisit(g.outputs, [&s](const nnvm::NodePtr n){ - if (s.count(n.get())) { - std::cout << " Y " << n->attrs.name << std::endl; - } else { - std::cout << " N " << n->attrs.name << std::endl; - } - }); -} - -/*! - * \brief Replace a set of nodes by a TensorRT node - */ -Graph ReplaceSubgraph(Graph&& g, - const std::unordered_set& set_subgraph, - std::unordered_map* const params_map) { - // Create MXNet subgraph - Graph subgraph; - - const auto sub_outputs_in_main = GetSubgraphNodeEntries(g, set_subgraph); - subgraph.outputs = sub_outputs_in_main; - // old2new will link raw pointer of the nodes in the graph to - // the corresponding shared_ptr of the nodes in the generated subgraph - std::unordered_map old2new; - std::deque stack; - std::unordered_set visited; - int32_t reservation = set_subgraph.size(); - old2new.reserve(reservation); - visited.reserve(reservation); - - // Create the shared_ptr using the same raw pointer don't really matter - for (auto& n : set_subgraph) { - old2new[n] = std::make_shared(*n); - } - - // To generate a subgraph an input have to be replace by data node (no op) - // and it have to be agnostic to the node from which it's an output - // (For exemple even if two inputs are two different outputs from the same node) - nnvm::NodeEntryMap main_input_entry_to_sub; - for (auto& e : GetSubgraphInterfaceNodes(g, set_subgraph)) { - auto node = nnvm::Node::Create(); - node->attrs.name = e.node->attrs.name + "_" + std::to_string(e.index); - auto new_e = nnvm::NodeEntry{node, 0, 0}; - main_input_entry_to_sub[e] = new_e; - } - - for (nnvm::NodeEntry& e : subgraph.outputs) { - e.node = old2new[e.node.get()]; - stack.emplace_back(e.node.get()); - } - // link all nodes in the subgraph to nodes in the subgraph instead of main graph - while (!stack.empty()) { - auto vertex = stack.front(); - stack.pop_front(); - if (!visited.count(vertex)) { - visited.insert(vertex); - for (auto& e : vertex->inputs) { - auto it = main_input_entry_to_sub.find(e); - if (it != main_input_entry_to_sub.end()) { - e = it->second; - } else { - e.node = old2new[e.node.get()]; - } - stack.emplace_back(e.node.get()); - } - } - } - // Remove the control dependencies of the subgraph to nodes that are not in the subgraph - DFSVisit(subgraph.outputs, [&set_subgraph, &old2new](const nnvm::NodePtr& node) { - std::remove_if(node->control_deps.begin(), - node->control_deps.end(), - [&set_subgraph](nnvm::NodePtr n_ptr) { - return 
!set_subgraph.count(n_ptr.get()); - }); - for (nnvm::NodePtr& n_ptr : node->control_deps) { - n_ptr = old2new[n_ptr.get()]; - } - }); - - subgraph = UpdateSubgraphAttrs(std::move(subgraph), g, old2new, main_input_entry_to_sub); - auto& sub_idx = subgraph.indexed_graph(); - - auto trtnodeptr = ConvertNnvmGraphToOnnx(subgraph, params_map); - trtnodeptr->attrs.name = GetNewTrtName(g, subgraph); - - // Insert new trt node and unplug replaced nodes - std::unordered_map sub_input_entryid_to_main; - for (auto& p : main_input_entry_to_sub) { - sub_input_entryid_to_main[sub_idx.entry_id(p.second)] = p.first; - } - - // Plug the nodes from the main graph as inputs of the trt node - trtnodeptr->inputs.resize(main_input_entry_to_sub.size()); - { - uint32_t counter = 0; - for (uint32_t i : sub_idx.input_nodes()) { - auto it = sub_input_entryid_to_main.find(sub_idx.entry_id(i, 0)); - if (it != sub_input_entryid_to_main.end()) { - trtnodeptr->inputs[counter++] = it->second; - } - } - } - nnvm::NodeEntryMap sub_outputs_in_main_to_pos; - for (uint32_t i = 0; i < sub_outputs_in_main.size(); ++i) { - sub_outputs_in_main_to_pos[sub_outputs_in_main[i]] = i; - } - // Plug the trt node as inputs to the main graph nodes - DFSVisit(g.outputs, [&sub_outputs_in_main_to_pos, &trtnodeptr](const nnvm::NodePtr& n) { - for (auto& e : n->inputs) { - auto it = sub_outputs_in_main_to_pos.find(e); - if (it != sub_outputs_in_main_to_pos.end()) { - e.index = it->second; - e.node = trtnodeptr; - } - } - }); - - for (auto& output : g.outputs) { - auto it = sub_outputs_in_main_to_pos.find(output); - if (it != sub_outputs_in_main_to_pos.end()) { - output.index = it->second; - output.node = trtnodeptr; - } - } - - Graph new_graph; - new_graph.outputs = g.outputs; - return new_graph; -} - -std::vector> GetTrtCompatibleSubsets(const Graph& g, - std::unordered_map* const params_map) { - BidirectionalGraph biG = BidirectionalGraph(g); - std::vector> subsets = biG.get_subsets(params_map); - std::vector> nnvm_subsets(subsets.size(), - std::unordered_set()); - for (size_t i = 0; i < subsets.size(); ++i) { - nnvm_subsets[i].reserve(subsets[i].size()); - for (auto& n : subsets[i]) { - nnvm_subsets[i].insert(n->nnvmptr); - } - } - return nnvm_subsets; -} - -} // namespace exec -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.cc b/src/executor/trt_graph_executor.cc deleted file mode 100644 index c923922d5184..000000000000 --- a/src/executor/trt_graph_executor.cc +++ /dev/null @@ -1,443 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#if MXNET_USE_TENSORRT - -#include "trt_graph_executor.h" - -#include -#include -#include "./onnx_to_tensorrt.h" -#include "../operator/contrib/tensorrt-inl.h" -#include "../common/utils.h" -#include "../common/exec_utils.h" - - -namespace mxnet { -namespace exec { - -using namespace mxnet::common; - - /*! - * \brief TrtGraphExecutor initializer for simple bind flow in - * which only certain input shapes and dtypes are provided by users. - * The initializer uses these shapes and dtypes to perform - * shape and dtype inferences, and then create NDArrays - * to populate data entries of the graph. The created NDArrays - * for in_args, arg_grads and aux_states are passed to the - * front end to attach the created executor. - * In front end, if the simple_bind flow is trigger by - * _bind_ith_exec, the shared data arrays of DataParallelExecutorGroup - * and shared executor will be taken into account in creating - * NDArrays for in_args, arg_grads, and aux_states for reusing - * already allocated memory. - * - * This version of an executor exports the computation graph to TensorRT make use of fused - * kernels and other runtime enhancements. TRT will compile the sub-graphs to executable fused - * operators without intervention from the user. Operators in the original graph that are not - * supported by TRT will continue to be executed normally by MXNet. - * - */ -void TrtGraphExecutor::Init(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set& shared_arg_names, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec, - std::unordered_map* shared_buffer, - Executor* shared_exec, - const nnvm::NodeEntryMap& feed_dict) { - symbol = symbol.Copy(); - nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, - *aux_state_ctxes, *grad_req_types); - - if (need_grad_) { - LOG(FATAL) << "You may be attempting to use TensorRT for training. TensorRT is an inference " - "only library. To re-enable legacy MXNet graph execution, which will support " - "training, set the MXNET_USE_TENSORRT environment variable to 0, or call " - "mx.contrib.tensorrt.set_use_tensorrt(False)"; - } - - if (shared_buffer == nullptr || shared_buffer->empty()) { - LOG(FATAL) << "MXNET_USE_TENSORRT = 1 but shared_buffer is empty. " - << "Please provide weights and other parameters, such as " - << "BatchNorm moments, via the shared_buffer, during simple bind call."; - } - - // The following code of shape and dtype inferences and argument - // initialization is for simple_bind only. Regular bind operation - // should do this differently. - - // Initialize arg_shapes and arg_dtypes for shape and type inferences. - // It contains all in_args and aux_states' shapes and types in a certain order. 
- const nnvm::IndexedGraph& idx = g.indexed_graph(); - mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); - nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); - StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const std::string& name = idx[nid].source->attrs.name; - auto it1 = arg_shape_map->find(name); - if (arg_shape_map->end() != it1) { - arg_shapes[i] = it1->second; - } - auto it2 = arg_dtype_map->find(name); - if (arg_dtype_map->end() != it2) { - arg_dtypes[i] = it2->second; - } - auto it3 = arg_stype_map->find(name); - if (arg_stype_map->end() != it3) { - arg_stypes[i] = it3->second; - } - } - g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); - if (g.GetAttr("shape_num_unknown_nodes") != 0U) { - HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("shape")); - } - - g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); - if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { - HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("dtype")); - } - - g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); - if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { - HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("storage_type")); - } - - auto trt_groups = GetTrtCompatibleSubsets(g, shared_buffer); - for (const auto &trt_group : trt_groups) { - if (trt_group.size() > 1) { - g = ReplaceSubgraph(std::move(g), trt_group, shared_buffer); - g = ReinitGraph(std::move(g), default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes, - aux_state_ctxes, grad_req_types, arg_shape_map, arg_dtype_map, - arg_stype_map, shared_buffer); - } - } - - InitArguments(g.indexed_graph(), g.GetAttr("shape"), - g.GetAttr("dtype"), - g.GetAttr("storage_type"), - *in_arg_ctxes, *arg_grad_ctxes, *aux_state_ctxes, - *grad_req_types, shared_arg_names, shared_exec, - shared_buffer, in_arg_vec, arg_grad_vec, aux_state_vec); - - // The above code of shape and dtype inferences and argument - // initialization is for simple_bind only. Regular bind operation - // should do this differently. - - // Initialize the rest attributes of the graph. - // This function can be called by regular bind - // operation flow as well. - FinishInitGraph(symbol, g, shared_exec, feed_dict); -} -/*! - * \brief Initialize in_args, arg_grads, and aux_states - * and their data_entry_ of the executor using - * shared_buffer from DataParallelExecutorGroup - * and shared_exec if available. 
- */ -void TrtGraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, - const mxnet::ShapeVector& inferred_shapes, - const nnvm::DTypeVector& inferred_dtypes, - const StorageTypeVector& inferred_stypes, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - const std::unordered_set& shared_arg_names, - const Executor* shared_exec, - std::unordered_map* shared_buffer, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec) { - // initialize in_args, arg_grads, and aux_states and populate grad_store_ - data_entry_.resize(idx.num_node_entries()); - size_t arg_top = 0, aux_top = 0; - const auto& mutable_nodes = idx.mutable_input_nodes(); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const uint32_t eid = idx.entry_id(nid, 0); - const mxnet::TShape& inferred_shape = inferred_shapes[eid]; - const int inferred_dtype = inferred_dtypes[eid]; - const auto inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; - const std::string& arg_name = idx[nid].source->attrs.name; - // aux_states - if (mutable_nodes.count(nid)) { - if (nullptr != shared_exec) { - const NDArray& aux_nd = shared_exec->aux_state_map().at(arg_name); - CHECK(inferred_stype == kDefaultStorage && aux_nd.storage_type() == kDefaultStorage) - << "Non-default storage type detected when creating auxilliary NDArray. The allocated " - << "memory of shared_exec.aux_array cannot be resued for argument: " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_shape, aux_nd.shape()) - << "Inferred shape does not match shared_exec.aux_array's shape." - " Therefore, the allocated memory for shared_exec.aux_array cannot" - " be resued for creating auxilliary NDArray of the argument: " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_dtype, aux_nd.dtype()) - << "Inferred dtype does not match shared_exec.aux_array's dtype." 
- " Therefore, the allocated memory for shared_exec.aux_array cannot" - " be resued for creating auxilliary NDArray of the argument: " - << arg_name << " for the current executor"; - aux_state_vec->emplace_back(aux_nd); - } else { - auto it = shared_buffer->find(arg_name); - if (it != shared_buffer->end()) { - aux_state_vec->push_back(std::move(it->second.Copy(aux_state_ctxes[aux_top]))); - } else { - aux_state_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, - aux_state_ctxes[aux_top], inferred_dtype))); - } - } // if (has_shared_exec) - data_entry_[eid] = aux_state_vec->back(); - aux_state_map_.emplace(arg_name, aux_state_vec->back()); - ++aux_top; - } else { // in_args and grad for in_args - if (shared_arg_names.count(arg_name)) { // model parameter - // model parameter - if (nullptr != shared_exec) { - const NDArray& in_arg_nd = shared_exec->in_arg_map().at(arg_name); - auto arg_nd_stype = in_arg_nd.storage_type(); - // for model parameter, both default storage and row_sparse storage can be shared - bool shareable_arg_stype = inferred_stype == kDefaultStorage || - inferred_stype == kRowSparseStorage; - // try to reuse memory from shared_exec - CHECK(shareable_arg_stype) << "Inferred storage type " - << common::stype_string(inferred_stype) - << " does not support memory sharing with shared_exec.arg_array"; - CHECK_EQ(inferred_stype, arg_nd_stype) - << "Inferred stype does not match shared_exec.arg_array's stype" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_shape, in_arg_nd.shape()) - << "Inferred shape does not match shared_exec.arg_array's shape" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - CHECK_EQ(inferred_dtype, in_arg_nd.dtype()) - << "Inferred dtype does not match shared_exec.arg_array's dtype" - " Therefore, the allocated memory for shared_exec.arg_array cannot" - " be resued for creating NDArray of the argument " - << arg_name << " for the current executor"; - in_arg_vec->emplace_back(in_arg_nd); - } else { - // doesn't have shared_exec, or non-default storage - EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top], - inferred_dtype, in_arg_vec); - } - // gradient for model parameter - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { - auto grad_oid = grad_store_.size() + num_forward_outputs_; - auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); - auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; - if (nullptr != shared_exec && grad_stype == kDefaultStorage && - shared_exec->arg_grad_map().at(arg_name).storage_type() == kDefaultStorage) { - // try to reuse memory from shared_exec - arg_grad_vec->emplace_back(shared_exec->arg_grad_map().at(arg_name)); - } else { - // no need to reuse memory from shared_exec for gradient of non-default storage - EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top], - inferred_dtype, arg_grad_vec); - } - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } - } else { // !shared_arg_names.count(arg_name) - // model parameter, row_sparse ndarray sharing enabled - auto it = shared_buffer->find(arg_name); - if (it != shared_buffer->end()) { - in_arg_vec->push_back(std::move(it->second.Copy(in_arg_ctxes[arg_top]))); - } else { - 
in_arg_vec->push_back(std::move(InitZeros(inferred_stype, inferred_shape, - in_arg_ctxes[arg_top], inferred_dtype))); - } - // gradient for model parameter, row_sparse ndarray sharing disabled - if (kNullOp == grad_req_types[arg_top]) { - arg_grad_vec->emplace_back(); - } else { - auto grad_oid = grad_store_.size() + num_forward_outputs_; - auto grad_eid = idx.entry_id(idx.outputs()[grad_oid]); - auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid]; - bool enable_row_sparse_sharing = false; - arg_grad_vec->emplace_back(ReshapeOrCreate("grad of " + arg_name, inferred_shape, - inferred_dtype, grad_stype, - arg_grad_ctxes[arg_top], shared_buffer, - enable_row_sparse_sharing)); - grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back()); - } // if (kNullOp == grad_req_types[arg_top]) - } // if (shared_arg_names.count(arg_name)) - in_arg_map_.emplace(arg_name, in_arg_vec->back()); - if (!arg_grad_vec->back().is_none()) { - arg_grad_map_.emplace(arg_name, arg_grad_vec->back()); - } - data_entry_[eid] = in_arg_vec->back(); - ++arg_top; - } - } -} - - - /*! - * \brief This function is triggered after each tensorrt subgraph replacement pass. - * Reset arguments of GraphExecutor::Init(...) as some variables (weights and biases) - * are absorbed into the TRT engine it also it reruns attributes inferences accordingly - * to the new topology. - */ -Graph TrtGraphExecutor::ReinitGraph(Graph&& g, const Context &default_ctx, - const std::map &ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::vector *grad_req_types, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::unordered_map *params_map) { - std::unordered_set to_remove_params; - for (auto& el : *params_map) { - to_remove_params.insert(el.first); - } - - DFSVisit(g.outputs, [&to_remove_params](const nnvm::NodePtr n) { - to_remove_params.erase(n->attrs.name); - }); - - for (auto& el : to_remove_params) { - params_map->erase(el); - arg_shape_map->erase(el); - arg_dtype_map->erase(el); - arg_stype_map->erase(el); - } - const auto &idx = g.indexed_graph(); - num_forward_inputs_ = idx.input_nodes().size(); - in_arg_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - arg_grad_ctxes->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - grad_req_types->resize(num_forward_inputs_ - idx.mutable_input_nodes().size()); - aux_state_ctxes->resize(idx.mutable_input_nodes().size()); - - // create "device" and "context" attrs for the graph - g = AssignContext(g, default_ctx, ctx_map, *in_arg_ctxes, *arg_grad_ctxes, - *aux_state_ctxes, *grad_req_types, num_forward_inputs_, - num_forward_outputs_); - - // get number of nodes used in forward pass - num_forward_nodes_ = 0; - for (size_t i = 0; i < num_forward_outputs_; ++i) { - num_forward_nodes_ = std::max( - num_forward_nodes_, static_cast(idx.outputs()[i].node_id + 1)); - } - mxnet::ShapeVector arg_shapes(idx.input_nodes().size(), mxnet::TShape()); - nnvm::DTypeVector arg_dtypes(idx.input_nodes().size(), -1); - StorageTypeVector arg_stypes(idx.input_nodes().size(), kUndefinedStorage); - for (size_t i = 0; i < num_forward_inputs_; ++i) { - const uint32_t nid = idx.input_nodes().at(i); - const std::string &name = idx[nid].source->attrs.name; - auto it1 = arg_shape_map->find(name); - if (arg_shape_map->end() != it1) { - arg_shapes[i] = it1->second; - } - auto it2 = arg_dtype_map->find(name); - if (arg_dtype_map->end() != it2) { - 
arg_dtypes[i] = it2->second; - } - auto it3 = arg_stype_map->find(name); - if (arg_stype_map->end() != it3) { - arg_stypes[i] = it3->second; - } - } - g = InferShape(std::move(g), std::move(arg_shapes), "__shape__"); - if (g.GetAttr("shape_num_unknown_nodes") != 0U) { - HandleInferShapeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("shape")); - } - - g = InferType(std::move(g), std::move(arg_dtypes), "__dtype__"); - if (g.GetAttr("dtype_num_unknown_nodes") != 0U) { - HandleInferTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("dtype")); - } - - g = InferStorageType(std::move(g), std::move(arg_stypes), "__storage_type__"); - - if (g.GetAttr("storage_type_num_unknown_nodes") != 0U) { - HandleInferStorageTypeError(num_forward_inputs_, g.indexed_graph(), - g.GetAttr("storage_type")); - } - - return g; -} - - -/*! - * \brief Return the "optimized" symbol contained in the graph. - * For optimization pass such as TensorRT pass - */ -nnvm::Symbol TrtGraphExecutor::GetOptimizedSymbol() { - Symbol ret; - ret.outputs = std::vector(graph_.outputs.begin(), - graph_.outputs.begin() + num_forward_outputs_); - return ret.Copy(); -} - -Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol, - const Context &default_ctx, - const std::map &group2ctx, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map - *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set ¶m_names, - std::vector *in_args, - std::vector *arg_grads, - std::vector *aux_states, - std::unordered_map *shared_buffer, - Executor *shared_exec) { - auto exec = new exec::TrtGraphExecutor(); - exec->Init(std::move(symbol), default_ctx, group2ctx, - in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, - arg_shape_map, arg_dtype_map, arg_stype_map, - grad_req_types, param_names, - in_args, arg_grads, aux_states, - shared_buffer, shared_exec); - return exec; -} - -} // namespace exec - -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/executor/trt_graph_executor.h b/src/executor/trt_graph_executor.h deleted file mode 100644 index a4ec5bf657ae..000000000000 --- a/src/executor/trt_graph_executor.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#ifndef MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ -#define MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ - -#if MXNET_USE_TENSORRT - -#include -#include -#include - -#include "./graph_executor.h" - -namespace mxnet { - -namespace exec { - -class TrtGraphExecutor : public GraphExecutor { - public: - static Executor* TensorRTBind(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& group2ctx, - std::vector *in_arg_ctxes, - std::vector* arg_grad_ctxes, - std::vector* aux_state_ctxes, - std::unordered_map* arg_shape_map, - std::unordered_map* arg_dtype_map, - std::unordered_map* arg_stype_map, - std::vector* grad_req_types, - const std::unordered_set& param_names, - std::vector* in_args, - std::vector* arg_grads, - std::vector* aux_states, - std::unordered_map* - shared_data_arrays = nullptr, - Executor* shared_exec = nullptr); - - virtual void Init(nnvm::Symbol symbol, - const Context& default_ctx, - const std::map& ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::vector *grad_req_types, - const std::unordered_set& shared_arg_names, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec, - std::unordered_map* shared_buffer = nullptr, - Executor* shared_exec = nullptr, - const nnvm::NodeEntryMap& feed_dict - = nnvm::NodeEntryMap()); - - // Returns symbol representing the TRT optimized graph for comparison purposes. - nnvm::Symbol GetOptimizedSymbol(); - - protected: - Graph ReinitGraph(Graph&& g, const Context &default_ctx, - const std::map &ctx_map, - std::vector *in_arg_ctxes, - std::vector *arg_grad_ctxes, - std::vector *aux_state_ctxes, - std::vector *grad_req_types, - std::unordered_map *arg_shape_map, - std::unordered_map *arg_dtype_map, - std::unordered_map *arg_stype_map, - std::unordered_map *params_map); - - void InitArguments(const nnvm::IndexedGraph& idx, - const mxnet::ShapeVector& inferred_shapes, - const nnvm::DTypeVector& inferred_dtypes, - const StorageTypeVector& inferred_stypes, - const std::vector& in_arg_ctxes, - const std::vector& arg_grad_ctxes, - const std::vector& aux_state_ctxes, - const std::vector& grad_req_types, - const std::unordered_set& shared_arg_names, - const Executor* shared_exec, - std::unordered_map* shared_buffer, - std::vector* in_arg_vec, - std::vector* arg_grad_vec, - std::vector* aux_state_vec) override; -}; - -} // namespace exec - -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT - -#endif // MXNET_EXECUTOR_TRT_GRAPH_EXECUTOR_H_ diff --git a/src/operator/contrib/tensorrt-inl.h b/src/operator/contrib/tensorrt-inl.h deleted file mode 100644 index 062d22e35795..000000000000 --- a/src/operator/contrib/tensorrt-inl.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ -#define MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2018 by Contributors - * \file tensorrt-inl.h - * \brief TensorRT Operator - * \author Marek Kolodziej, Clement Fuji Tsang -*/ - -#if MXNET_USE_TENSORRT - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "nnvm_to_onnx-inl.h" -#include "../operator_common.h" -#include "../../common/utils.h" -#include "../../common/serialization.h" -#include "../../executor/exec_pass.h" -#include "../../executor/graph_executor.h" -#include "../../executor/onnx_to_tensorrt.h" - -namespace mxnet { -namespace op { - -using namespace nnvm; -using int64 = ::google::protobuf::int64; - - -using trt_name_to_idx = std::map; - - -struct TRTEngineParam { - nvinfer1::IExecutionContext* trt_executor; - std::vector > binding_map; -}; - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT - -#endif // MXNET_OPERATOR_CONTRIB_TENSORRT_INL_H_ diff --git a/src/operator/contrib/tensorrt.cc b/src/operator/contrib/tensorrt.cc deleted file mode 100644 index 5b3df70fd825..000000000000 --- a/src/operator/contrib/tensorrt.cc +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2018 by Contributors - * \file trt.cc - * \brief TensorRT operation registration - * \author Marek Kolodziej, Clement Fuji Tsang -*/ - -#if MXNET_USE_TENSORRT - -#include "./tensorrt-inl.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../../common/serialization.h" -#include "../../common/utils.h" - -namespace mxnet { -namespace op { - -OpStatePtr GetPtrMapping(nvinfer1::ICudaEngine* trt_engine, - nnvm_to_onnx::NameToIdx_t input_map, - nnvm_to_onnx::NameToIdx_t output_map) { - TRTEngineParam param; - for (int b = 0; b < trt_engine->getNbBindings(); ++b) { - const std::string& binding_name = trt_engine->getBindingName(b); - if (trt_engine->bindingIsInput(b)) { - param.binding_map.emplace_back(input_map[binding_name], - nnvm_to_onnx::TypeIO::Inputs); - } else { - param.binding_map.emplace_back(output_map[binding_name], - nnvm_to_onnx::TypeIO::Outputs); - } - } - param.trt_executor = trt_engine->createExecutionContext(); - return OpStatePtr::Create(param); -} - -OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/, - const mxnet::ShapeVector& /*ishape*/, - const std::vector& /*itype*/) { - const auto& node_param = nnvm::get(attrs.parsed); - - ::onnx::ModelProto model_proto; - bool success = model_proto.ParseFromString(node_param.serialized_onnx_graph); - if (!success) { - LOG(FATAL) << "Problems parsing serialized ONNX model."; - } - auto graph = model_proto.graph(); - auto first_input_type = graph.input(0).type().tensor_type(); - auto dim_value = first_input_type.shape().dim(0).dim_value(); - auto batch_size = static_cast(dim_value); - // Need to set up max workspace size based on device properties - nvinfer1::ICudaEngine* const trt_engine = ::onnx_to_tensorrt::onnxToTrtCtx( - node_param.serialized_onnx_graph, batch_size, 1 << 30); - - nnvm_to_onnx::NameToIdx_t output_map; - for (auto& el : node_param.output_map) { - output_map[el.first] = std::get<0>(el.second); - } - return GetPtrMapping(trt_engine, node_param.input_map, output_map); -} - -void TRTParamParser(nnvm::NodeAttrs* attrs) { - ONNXParam param_; - - try { - param_.Init(attrs->dict); - common::Deserialize(¶m_.input_map, param_.serialized_input_map); - common::Deserialize(¶m_.output_map, param_.serialized_output_map); - param_.onnx_pb_graph.ParseFromString(param_.serialized_onnx_graph); - } catch (const dmlc::ParamError& e) { - std::ostringstream os; - os << e.what(); - os << ", in operator " << attrs->op->name << "(" - << "name=\"" << attrs->name << "\""; - for (const auto& k : attrs->dict) { - os << ", " << k.first << "=\"" << k.second << "\""; - } - os << ")"; - throw dmlc::ParamError(os.str()); - } - - attrs->parsed = std::move(param_); -} - -inline bool TRTInferShape(const NodeAttrs& attrs, mxnet::ShapeVector* /*in_shape*/, - mxnet::ShapeVector* out_shape) { - const auto &node_param = nnvm::get(attrs.parsed); - for (auto& el : node_param.output_map) { - (*out_shape)[std::get<0>(el.second)] = std::get<1>(el.second); - } - return true; -} - -inline bool TRTInferStorageType(const NodeAttrs& /*attrs*/, const int /*dev_mask*/, - DispatchMode* dispatch_mode, - std::vector* /*in_storage_type*/, - std::vector* out_storage_type) { - return storage_type_assign(out_storage_type, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); -} - -inline bool TRTInferType(const NodeAttrs& attrs, std::vector* /*in_dtype*/, - std::vector* out_dtype) { - const auto& node_param = nnvm::get(attrs.parsed); - for (auto& el : node_param.output_map) { - 
(*out_dtype)[std::get<0>(el.second)] = std::get<3>(el.second); - } - return true; -} - -inline std::vector TRTListInputNames(const NodeAttrs& attrs) { - std::vector output; - const auto& node_param = nnvm::get(attrs.parsed); - output.resize(node_param.input_map.size()); - for (auto& el : node_param.input_map) { - output[el.second] = el.first; - } - return output; -} - -inline std::vector TRTListOutputNames(const NodeAttrs& attrs) { - std::vector output; - const auto& node_param = nnvm::get(attrs.parsed); - output.resize(node_param.output_map.size()); - for (auto& el : node_param.output_map) { - output[std::get<0>(el.second)] = el.first; - } - return output; -} - -NNVM_REGISTER_OP(_trt_op) - .describe(R"code(TRT operation (one engine) -)code" ADD_FILELINE) - .set_num_inputs([](const NodeAttrs& attrs) { - const auto& node_param = nnvm::get(attrs.parsed); - return node_param.input_map.size(); - }) - .set_num_outputs([](const NodeAttrs& attrs) { - const auto& node_param = nnvm::get(attrs.parsed); - return node_param.output_map.size(); - }) - .set_attr_parser(TRTParamParser) - .set_attr("FInferShape", TRTInferShape) - .set_attr("FInferType", TRTInferType) - .set_attr("FListInputNames", TRTListInputNames) - .set_attr("FListOutputNames", TRTListOutputNames) - .set_attr("FCreateOpState", TRTCreateState) - .set_attr("FInferStorageType", TRTInferStorageType); - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_TENSORRT diff --git a/src/operator/subgraph/partition_graph.cc b/src/operator/subgraph/partition_graph.cc index 4c7552ec9bfe..50a36f6ef335 100644 --- a/src/operator/subgraph/partition_graph.cc +++ b/src/operator/subgraph/partition_graph.cc @@ -589,9 +589,9 @@ void FindOutputEntries(Graph* g, void CutGraphInputs(const std::vector &input_entries, std::vector *orig_entries, const bool skip_var = false) { - orig_entries->resize(input_entries.size()); + orig_entries->reserve(input_entries.size()); // map for creating unique var nodes for deduplicating entries from the same node - std::unordered_map name_count_map; + std::unordered_map new_node_map; for (size_t i = 0; i < input_entries.size(); ++i) { nnvm::NodeEntry *e = input_entries[i]; // If the node is a variable itself, we may want to skip the node. 
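The two CutGraphInputs hunks around this point change how duplicated subgraph inputs are handled: instead of minting a uniquely renamed variable for every occurrence of the same input, one replacement variable node is created per name and reused, and orig_entries now records each original entry only once. A minimal sketch of that dedup logic, using hypothetical string data in place of nnvm node entries (the real code is in the hunk that follows):

```python
# Minimal sketch (hypothetical data, plain strings instead of nnvm node
# entries) of the deduplication performed by CutGraphInputs: every input
# entry referring to the same variable name is rewired to one shared
# replacement node, and orig_entries keeps only the first occurrence of
# each name instead of one renamed copy per occurrence.
input_entries = ["data", "weight", "data", "bias", "data"]

new_node_map = {}   # variable name -> shared replacement node (a tag here)
orig_entries = []   # unique original entries, in first-seen order
rewired = []        # what the subgraph inputs point at after the cut

for name in input_entries:
    if name not in new_node_map:
        orig_entries.append(name)
        new_node_map[name] = "var({})".format(name)
    rewired.append(new_node_map[name])

print(orig_entries)  # ['data', 'weight', 'bias']
print(rewired)       # ['var(data)', 'var(weight)', 'var(data)', 'var(bias)', 'var(data)']
```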
@@ -599,19 +599,17 @@ void CutGraphInputs(const std::vector &input_entries, continue; } - orig_entries->at(i) = *e; nnvm::Symbol sym; sym.outputs.push_back(*e); const auto output_names = sym.ListOutputNames(); CHECK_EQ(output_names.size(), 1U); const std::string& var_name = output_names[0]; - auto it = name_count_map.find(var_name); - if (name_count_map.end() == it) { - name_count_map.emplace(var_name, 0); - } else { - ++(it->second); + auto it = new_node_map.find(var_name); + if (it == new_node_map.end()) { + orig_entries->push_back(*e); + new_node_map[var_name] = nnvm::CreateVariableNode(var_name); } - nnvm::NodePtr n = nnvm::CreateVariableNode(var_name + std::to_string(name_count_map[var_name])); + nnvm::NodePtr n = new_node_map[var_name]; *e = nnvm::NodeEntry{n, 0, 0}; } } diff --git a/src/operator/contrib/nnvm_to_onnx-inl.h b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h similarity index 61% rename from src/operator/contrib/nnvm_to_onnx-inl.h rename to src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h index 052948521ba8..4a88aee886db 100644 --- a/src/operator/contrib/nnvm_to_onnx-inl.h +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h @@ -1,5 +1,5 @@ -#ifndef MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ -#define MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -20,76 +20,23 @@ */ /*! - * Copyright (c) 2018 by Contributors - * \file tensorrt-inl.h - * \brief TensorRT Operator + * Copyright (c) 2019 by Contributors + * \file nnvm_to_onnx-inl.h + * \brief Conversion from NNVM to ONNX for TensorRT * \author Marek Kolodziej, Clement Fuji Tsang */ #if MXNET_USE_TENSORRT -#include -#include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include "../operator_common.h" -#include "../../common/utils.h" -#include "../../common/serialization.h" - namespace mxnet { namespace op { - -namespace nnvm_to_onnx { - enum class TypeIO { Inputs = 0, Outputs = 1 }; - using NameToIdx_t = std::map; - using InferenceTuple_t = std::tuple; - using InferenceMap_t = std::map; -} // namespace nnvm_to_onnx - -struct ONNXParam : public dmlc::Parameter { - std::string serialized_onnx_graph; - std::string serialized_input_map; - std::string serialized_output_map; - nnvm_to_onnx::NameToIdx_t input_map; - nnvm_to_onnx::InferenceMap_t output_map; - ::onnx::ModelProto onnx_pb_graph; - - ONNXParam() = default; - - ONNXParam(const ::onnx::ModelProto& onnx_graph, - const nnvm_to_onnx::InferenceMap_t& input_map, - const nnvm_to_onnx::NameToIdx_t& output_map) { - common::Serialize(input_map, &serialized_input_map); - common::Serialize(output_map, &serialized_output_map); - onnx_graph.SerializeToString(&serialized_onnx_graph); - } - -DMLC_DECLARE_PARAMETER(ONNXParam) { - DMLC_DECLARE_FIELD(serialized_onnx_graph) - .describe("Serialized ONNX graph"); - DMLC_DECLARE_FIELD(serialized_input_map) - .describe("Map from inputs to topological order as input."); - DMLC_DECLARE_FIELD(serialized_output_map) - .describe("Map from outputs to order in g.outputs."); - } -}; - namespace nnvm_to_onnx { using namespace nnvm; @@ -99,24 +46,26 @@ using int64 = ::google::protobuf::int64; std::unordered_map GetPlaceholderShapes(const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig); 
+std::unordered_map GetPlaceholderDTypes(const DTypeVector& +dtype_inputs, + const nnvm::IndexedGraph& ig); + std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); void ConvertPlaceholder( const std::string& node_name, - const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_dtypes, GraphProto* graph_proto); void ConvertConstant(GraphProto* graph_proto, const std::string& node_name, - std::unordered_map* shared_buffer); + const std::unordered_map* const params_map); -void ConvertOutput(op::nnvm_to_onnx::InferenceMap_t* trt_output_map, - GraphProto* graph_proto, +void ConvertOutput(GraphProto* graph_proto, const std::unordered_map::iterator& out_iter, - const std::string& node_name, - const nnvm::Graph& g, - const StorageTypeVector& storage_types, - const DTypeVector& dtypes); + const std::string& node_name, const ShapeVector& shapes, + const DTypeVector& dtypes, const nnvm::IndexedGraph &ig); typedef void (*ConverterFunction)(NodeProto *node_proto, const NodeAttrs &attrs, @@ -137,6 +86,11 @@ void ConvertPooling(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +void ConvertRelu(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertActivation(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -157,6 +111,11 @@ void ConvertFlatten(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); +void ConvertDropout(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + void ConvertBatchNorm(NodeProto *node_proto, const NodeAttrs &attrs, const nnvm::IndexedGraph &ig, @@ -167,19 +126,39 @@ void ConvertElementwiseAdd(NodeProto *node_proto, const nnvm::IndexedGraph &ig, const array_view &inputs); -ONNXParam ConvertNnvmGraphToOnnx( - const nnvm::Graph &g, - std::unordered_map* shared_buffer); +void ConvertConcatenate(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertClip(NodeProto *node_proto, + const NodeAttrs &attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +void ConvertPad(NodeProto* node_proto, + const NodeAttrs & attrs, + const nnvm::IndexedGraph &ig, + const array_view &inputs); + +std::string ConvertNnvmGraphToOnnx(const nnvm::Graph &g, + const std::unordered_map* const params_map); static const std::unordered_map converter_map = { - {"Convolution", ConvertConvolution}, - {"Pooling", ConvertPooling}, {"Activation", ConvertActivation}, - {"FullyConnected", ConvertFullyConnected}, - {"SoftmaxOutput", ConvertSoftmaxOutput}, - {"Flatten", ConvertFlatten}, {"BatchNorm", ConvertBatchNorm}, - {"elemwise_add", ConvertElementwiseAdd}}; + {"clip", ConvertClip}, + {"Convolution", ConvertConvolution}, + {"Concat", ConvertConcatenate}, + {"Dropout", ConvertDropout}, + {"elemwise_add", ConvertElementwiseAdd}, + {"Flatten", ConvertFlatten}, + {"FullyConnected", ConvertFullyConnected}, + {"Pad", ConvertPad}, + {"Pooling", ConvertPooling}, + {"relu", ConvertRelu}, + {"SoftmaxOutput", ConvertSoftmaxOutput} +}; } // namespace nnvm_to_onnx } // namespace op @@ -187,4 +166,4 @@ static const std::unordered_map converter_map = #endif // MXNET_USE_TENSORRT -#endif // MXNET_OPERATOR_CONTRIB_NNVM_TO_ONNX_INL_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_NNVM_TO_ONNX_INL_H_ diff --git a/src/operator/contrib/nnvm_to_onnx.cc 
b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc similarity index 68% rename from src/operator/contrib/nnvm_to_onnx.cc rename to src/operator/subgraph/tensorrt/nnvm_to_onnx.cc index 0417a085616a..70e49d650505 100644 --- a/src/operator/contrib/nnvm_to_onnx.cc +++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc @@ -18,9 +18,9 @@ */ /*! - * Copyright (c) 2018 by Contributors - * \file trt.cc - * \brief TensorRT operation registration + * Copyright (c) 2019 by Contributors + * \file nnvm_to_onnx.cc + * \brief Conversion from NNVM to ONNX for TensorRT * \author Marek Kolodziej, Clement Fuji Tsang */ @@ -32,21 +32,17 @@ #include #include -#include -#include -#include -#include -#include - -#include "../../common/serialization.h" -#include "../../common/utils.h" -#include "../../ndarray/ndarray_function.h" -#include "../../operator/nn/activation-inl.h" -#include "../../operator/nn/batch_norm-inl.h" -#include "../../operator/nn/convolution-inl.h" -#include "../../operator/nn/fully_connected-inl.h" -#include "../../operator/nn/pooling-inl.h" -#include "../../operator/softmax_output-inl.h" +#include "../../../common/utils.h" +#include "../../../ndarray/ndarray_function.h" +#include "../../pad-inl.h" +#include "../../nn/activation-inl.h" +#include "../../nn/batch_norm-inl.h" +#include "../../nn/convolution-inl.h" +#include "../../nn/fully_connected-inl.h" +#include "../../nn/pooling-inl.h" +#include "../../nn/concat-inl.h" +#include "../../softmax_output-inl.h" +#include "../../tensor/matrix_op-inl.h" #if MXNET_USE_TENSORRT_ONNX_CHECKER #include @@ -54,36 +50,21 @@ namespace mxnet { namespace op { - -DMLC_REGISTER_PARAMETER(ONNXParam); - namespace nnvm_to_onnx { -op::ONNXParam ConvertNnvmGraphToOnnx( +std::string ConvertNnvmGraphToOnnx( const nnvm::Graph& g, - std::unordered_map* const shared_buffer) { + const std::unordered_map* const params_map) { static std::atomic_ulong subgraph_count = { 0 }; - op::ONNXParam onnx_param; - op::nnvm_to_onnx::NameToIdx_t onnx_input_map; - op::nnvm_to_onnx::InferenceMap_t onnx_output_map; + std::string serialized_onnx_graph; const nnvm::IndexedGraph& ig = g.indexed_graph(); - const auto& storage_types = g.GetAttr("storage_type"); const auto& dtypes = g.GetAttr("dtype"); - const auto& shape_inputs = g.GetAttr("shape_inputs"); - - // TODO(kellens): At the moment this check always passes no matter the weight dtypes used in your - // graph. We should first iterate over datatypes by name and ensure they're valid types - // (fp16 or fp32) and that they're uniform. Then ensure later conversions set tensor types - // correctly in ONNX. 
- for (auto& e : storage_types) { - if (e != mshadow::kFloat32) { - LOG(FATAL) << "ONNX converter does not support types other than float32 " - "right now."; - } - } + const auto& shapes = g.GetAttr("shape"); + const auto& dtype_inputs = g.GetAttr("dtype_inputs"); + const auto& shape_inputs = g.GetAttr("shape_inputs"); ModelProto model_proto; @@ -104,9 +85,9 @@ op::ONNXParam ConvertNnvmGraphToOnnx( auto subgraph_name_id = subgraph_count.fetch_add(1); graph_proto->set_name("MXNetTRTSubgraph" + std::to_string(subgraph_name_id)); - std::unordered_map placeholder_shapes = - GetPlaceholderShapes(shape_inputs, ig); - std::unordered_map output_lookup = GetOutputLookup(ig); + auto placeholder_shapes = GetPlaceholderShapes(shape_inputs, ig); + auto placeholder_dtypes = GetPlaceholderDTypes(dtype_inputs, ig); + auto output_lookup = GetOutputLookup(ig); uint32_t current_input = 0; // Can't do a foreach over IndexedGraph since it doesn't implement begin(), etc. @@ -121,18 +102,17 @@ op::ONNXParam ConvertNnvmGraphToOnnx( // placeholder if (source->is_variable()) { // Is this a placeholder? - if (shared_buffer->count(node_name) == 0) { + if (params_map->count(node_name) == 0) { // This fixes the problem with a SoftmaxOutput node during inference, but it's hacky. // Need to figure out how to properly fix it. if (node_name.find("label") != std::string::npos) { current_input++; continue; } - onnx_input_map.emplace(node_name, current_input++); - ConvertPlaceholder(node_name, placeholder_shapes, graph_proto); + ConvertPlaceholder(node_name, placeholder_shapes, placeholder_dtypes, graph_proto); } else { // If it's not a placeholder, then by exclusion it's a constant. - ConvertConstant(graph_proto, node_name, shared_buffer); + ConvertConstant(graph_proto, node_name, params_map); } // is_placeholder } else { // It's an op, rather than a "variable" (constant or placeholder) @@ -163,23 +143,18 @@ op::ONNXParam ConvertNnvmGraphToOnnx( auto out_iter = output_lookup.find(node_name); // We found an output if (out_iter != output_lookup.end()) { - ConvertOutput(&onnx_output_map, graph_proto, out_iter, node_name, g, - storage_types, dtypes); + ConvertOutput(graph_proto, out_iter, node_name, shapes, dtypes, ig); } // output found } // conversion function exists } // loop over i from 0 to num_nodes - model_proto.SerializeToString(&onnx_param.serialized_onnx_graph); - common::Serialize(onnx_input_map, - &onnx_param.serialized_input_map); - common::Serialize(onnx_output_map, - &onnx_param.serialized_output_map); + model_proto.SerializeToString(&serialized_onnx_graph); #if MXNET_USE_TENSORRT_ONNX_CHECKER onnx::checker::check_model(model_proto); #endif // MXNET_USE_TENSORRT_ONNX_CHECKER - return onnx_param; + return serialized_onnx_graph; } void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, @@ -225,9 +200,10 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, pads->set_name("pads"); pads->set_type(AttributeProto::INTS); - for (const dim_t kval : pad) { - pads->add_ints(static_cast(kval)); - pads->add_ints(static_cast(kval)); + for (int i =0; i < 2; i++) { + for (dim_t kval : pad) { + pads->add_ints(static_cast(kval)); + } } // strides @@ -295,6 +271,12 @@ void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, // not global pooling } // end ConvertPooling +void ConvertRelu(NodeProto* node_proto, const NodeAttrs& /*attrs*/, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Relu"); +} + void ConvertActivation(NodeProto* 
node_proto, const NodeAttrs& attrs, const nnvm::IndexedGraph& /*ig*/, const array_view& /*inputs*/) { @@ -411,7 +393,41 @@ void ConvertElementwiseAdd(NodeProto* node_proto, const NodeAttrs& /*attrs*/, node_proto->set_op_type("Add"); } -std::unordered_map GetPlaceholderShapes( +void ConvertConcatenate(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto& _param = nnvm::get(attrs.parsed); + node_proto->set_op_type("Concat"); + node_proto->set_name(attrs.name); + // axis + AttributeProto* const axis = node_proto->add_attribute(); + axis->set_name("axis"); + axis->set_type(AttributeProto::INT); + axis->set_i(static_cast(_param.dim)); +} + +inline TensorProto_DataType ConvertDType(int dtype) { + switch (dtype) { + case mshadow::kFloat64: + return TensorProto_DataType_DOUBLE; + case mshadow::kFloat32: + return TensorProto_DataType_FLOAT; + case mshadow::kFloat16: + return TensorProto_DataType_FLOAT16; + case mshadow::kUint8: + return TensorProto_DataType_UINT8; + case mshadow::kInt32: + return TensorProto_DataType_INT32; + case mshadow::kInt8: + return TensorProto_DataType_INT8; + case mshadow::kInt64: + return TensorProto_DataType_INT64; + default: + return TensorProto_DataType_UNDEFINED; + } +} + +std::unordered_map GetPlaceholderShapes( const ShapeVector& shape_inputs, const nnvm::IndexedGraph& ig) { std::unordered_map placeholder_shapes; for (uint32_t i = 0; i < shape_inputs.size(); ++i) { @@ -424,6 +440,17 @@ std::unordered_map GetPlaceholderShapes( return placeholder_shapes; } +std::unordered_map GetPlaceholderDTypes( + const DTypeVector& dtype_inputs, const nnvm::IndexedGraph& ig) { + std::unordered_map placeholder_dtypes; + for (uint32_t i = 0; i < dtype_inputs.size(); ++i) { + std::string name = ig[ig.input_nodes()[i]].source->attrs.name; + int dtype = dtype_inputs[i]; + placeholder_dtypes.emplace(name, dtype); + } + return placeholder_dtypes; +} + std::unordered_map GetOutputLookup( const nnvm::IndexedGraph& ig) { std::unordered_map output_lookup; @@ -441,17 +468,17 @@ std::unordered_map GetOutputLookup( void ConvertPlaceholder( const std::string& node_name, - const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_shapes, + const std::unordered_map& placeholder_dtypes, GraphProto* const graph_proto) { auto val_info_proto = graph_proto->add_input(); auto type_proto = val_info_proto->mutable_type()->mutable_tensor_type(); auto shape_proto = type_proto->mutable_shape(); val_info_proto->set_name(node_name); - // Will support fp16, etc. in the near future - type_proto->set_elem_type(TensorProto_DataType_FLOAT); auto entry_shape = placeholder_shapes.find(node_name)->second; - + auto entry_dtype = placeholder_dtypes.find(node_name)->second; + type_proto->set_elem_type(ConvertDType(entry_dtype)); for (const auto& elem : entry_shape) { TensorShapeProto_Dimension* const tsp_dim = shape_proto->add_dim(); tsp_dim->set_dim_value(static_cast(elem)); @@ -460,38 +487,49 @@ void ConvertPlaceholder( void ConvertConstant( GraphProto* const graph_proto, const std::string& node_name, - std::unordered_map* const shared_buffer) { - TensorProto* const initializer_proto = graph_proto->add_initializer(); + const std::unordered_map* const params_map) { + TensorProto* const initializer_proto = graph_proto->add_initializer(); // Create initializer for constants initializer_proto->set_name(node_name); - // TODO(kellens): convert to fp16 if needed. 
- initializer_proto->set_data_type(TensorProto_DataType_FLOAT); - const NDArray nd = shared_buffer->find(node_name)->second; + const NDArray nd = params_map->find(node_name)->second; const TBlob& blob = nd.data(); - const mxnet::TShape shape = blob.shape_; + const TShape shape = blob.shape_; + const auto dtype = ConvertDType(nd.dtype()); + initializer_proto->set_data_type(dtype); for (auto& dim : shape) { initializer_proto->add_dims(static_cast(dim)); } auto size = shape.Size(); - // TODO(kellens): Note hard coded float32 size assumed. - std::shared_ptr shared_data_ptr(new float[size]); - float* const data_ptr = shared_data_ptr.get(); - nd.SyncCopyToCPU(static_cast(data_ptr), size); - for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { - initializer_proto->add_float_data(data_ptr[blob_idx]); + if (dtype == TensorProto_DataType_FLOAT) { + std::shared_ptr shared_data_ptr(new float[size]); + float* const data_ptr = shared_data_ptr.get(); + nd.SyncCopyToCPU(static_cast(data_ptr), size); + + for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { + initializer_proto->add_float_data(data_ptr[blob_idx]); + } + } else if (dtype == TensorProto_DataType_FLOAT16) { + std::shared_ptr shared_data_ptr(new uint16_t[size]); + uint16_t* const data_ptr = shared_data_ptr.get(); + nd.SyncCopyToCPU(static_cast(data_ptr), size); + for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { + initializer_proto->add_int32_data( + reinterpret_cast(data_ptr)[blob_idx]); + } + } else { + LOG(FATAL) << "dtype not supported for variables: " << node_name; } // Create inputs for constants. ValueInfoProto* const input_proto = graph_proto->add_input(); input_proto->set_name(node_name); - // TODO(kellens): (fp16 support) - input_proto->mutable_type()->mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + input_proto->mutable_type()->mutable_tensor_type()->set_elem_type(dtype); for (auto& dim : shape) { auto new_dim = input_proto->mutable_type()->mutable_tensor_type()->mutable_shape()->add_dim(); new_dim->set_dim_value(static_cast(dim)); @@ -499,37 +537,98 @@ void ConvertConstant( } void ConvertOutput( - op::nnvm_to_onnx::InferenceMap_t* const output_map, GraphProto* const graph_proto, const std::unordered_map::iterator& out_iter, - const std::string& node_name, const nnvm::Graph& g, - const StorageTypeVector& storage_types, const DTypeVector& dtypes) { - const nnvm::IndexedGraph& ig = g.indexed_graph(); + const std::string& node_name, const ShapeVector& shapes, + const DTypeVector& dtypes, const nnvm::IndexedGraph &ig) { uint32_t out_idx = ig.entry_id(ig.outputs()[out_iter->second]); - mxnet::TShape out_shape = g.GetAttr("shape")[out_idx]; - int storage_type = storage_types[out_idx]; int dtype = dtypes[out_idx]; - - // This should work with fp16 as well - op::nnvm_to_onnx::InferenceTuple_t out_tuple{out_iter->second, out_shape, storage_type, - dtype}; - - output_map->emplace(node_name, out_tuple); - auto graph_out = graph_proto->add_output(); auto tensor_type = graph_out->mutable_type()->mutable_tensor_type(); auto tensor_shape_proto = tensor_type->mutable_shape(); graph_out->set_name(node_name); // Also support fp16. 
- tensor_type->set_elem_type(TensorProto_DataType_FLOAT); + tensor_type->set_elem_type(ConvertDType(dtype)); - for (int64_t dim_shp : out_shape) { + for (int64_t dim_shp : shapes[out_idx]) { TensorShapeProto_Dimension* const tsp_dim = tensor_shape_proto->add_dim(); tsp_dim->set_dim_value(static_cast(dim_shp)); } } +void ConvertClip(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto param = nnvm::get(attrs.parsed); + + node_proto->set_op_type("Clip"); + + // max + AttributeProto* const a_max = node_proto->add_attribute(); + a_max->set_name("max"); + a_max->set_type(AttributeProto::FLOAT); + a_max->set_f(static_cast(param.a_max)); + + // min + AttributeProto* const a_min = node_proto->add_attribute(); + a_min->set_name("min"); + a_min->set_type(AttributeProto::FLOAT); + a_min->set_f(static_cast(param.a_min)); +} + +void ConvertPad(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + const auto param = nnvm::get(attrs.parsed); + + node_proto->set_op_type("Pad"); + + // mode + AttributeProto* const mode = node_proto->add_attribute(); + mode->set_name("mode"); + mode->set_type(AttributeProto::STRING); + switch (param.mode) { + case op::pad_enum::kConstant: + mode->set_s("constant"); + break; + case op::pad_enum::kEdge: + mode->set_s("edge"); + break; + case op::pad_enum::kReflect: + mode->set_s("reflect"); + break; + default: + throw dmlc::Error("Such mode of padding doesn't exist doesn't exist"); + } + + // pads + AttributeProto* const pads = node_proto->add_attribute(); + pads->set_name("pads"); + pads->set_type(AttributeProto::INTS); + + std::vector pad_begin; + std::vector pad_end; + for (int st = 0; st < 2; ++st) { + for (auto it = param.pad_width.begin() + st; + it != param.pad_width.end(); it += 2) { + pads->add_ints(static_cast(*it)); + } + } + + // value + AttributeProto* const value = node_proto->add_attribute(); + value->set_name("value"); + value->set_type(AttributeProto::FLOAT); + value->set_f(param.constant_value); +} + +void ConvertDropout(NodeProto* node_proto, const NodeAttrs& attrs, + const nnvm::IndexedGraph& /*ig*/, + const array_view& /*inputs*/) { + node_proto->set_op_type("Dropout"); +} + } // namespace nnvm_to_onnx } // namespace op } // namespace mxnet diff --git a/src/executor/onnx_to_tensorrt.cc b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc similarity index 89% rename from src/executor/onnx_to_tensorrt.cc rename to src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc index f7fbc8f81359..7dbc54bc1a63 100644 --- a/src/executor/onnx_to_tensorrt.cc +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.cc @@ -18,7 +18,7 @@ */ /*! 
- * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file onnx_to_tensorrt.cc * \brief TensorRT integration with the MXNet executor * \author Marek Kolodziej, Clement Fuji Tsang @@ -36,6 +36,9 @@ #include #include #include +#include +#include + #include #include @@ -80,7 +83,7 @@ void PrintVersion() { << NV_TENSORRT_PATCH << endl; } -nvinfer1::ICudaEngine* onnxToTrtCtx( +std::tuple onnxToTrtCtx( const std::string& onnx_model, int32_t max_batch_size, size_t max_workspace_size, @@ -91,14 +94,13 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( TRT_Logger trt_logger(verbosity); auto trt_builder = InferObject(nvinfer1::createInferBuilder(trt_logger)); auto trt_network = InferObject(trt_builder->createNetwork()); - auto trt_parser = InferObject(nvonnxparser::createParser(trt_network.get(), trt_logger)); + auto trt_parser = nvonnxparser::createParser(trt_network.get(), trt_logger); ::ONNX_NAMESPACE::ModelProto parsed_model; // We check for a valid parse, but the main effect is the side effect // of populating parsed_model if (!parsed_model.ParseFromString(onnx_model)) { throw dmlc::Error("Could not parse ONNX from string"); } - if ( !trt_parser->parse(onnx_model.c_str(), onnx_model.size()) ) { size_t nerror = trt_parser->getNbErrors(); for ( size_t i=0; i < nerror; ++i ) { @@ -127,19 +129,18 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( } throw dmlc::Error("Cannot parse ONNX into TensorRT Engine"); } - - bool fp16 = trt_builder->platformHasFastFp16(); - + if (dmlc::GetEnv("MXNET_TENSORRT_USE_FP16", true)) { + if (trt_builder->platformHasFastFp16()) { + trt_builder->setFp16Mode(true); + } else { + LOG(WARNING) << "TensorRT can't use fp16 on this platform"; + } + } trt_builder->setMaxBatchSize(max_batch_size); trt_builder->setMaxWorkspaceSize(max_workspace_size); - if ( fp16 && dmlc::GetEnv("MXNET_TENSORRT_USE_FP16_FOR_FP32", false) ) { - LOG(INFO) << "WARNING: TensorRT using fp16 given original MXNet graph in fp32 !!!"; - trt_builder->setHalf2Mode(true); - } - trt_builder->setDebugSync(debug_builder); nvinfer1::ICudaEngine* trt_engine = trt_builder->buildCudaEngine(*trt_network.get()); - return trt_engine; + return std::make_tuple(trt_engine, trt_parser); } } // namespace onnx_to_tensorrt diff --git a/src/executor/onnx_to_tensorrt.h b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h similarity index 88% rename from src/executor/onnx_to_tensorrt.h rename to src/operator/subgraph/tensorrt/onnx_to_tensorrt.h index 259cfce7c332..3e8ea1bf9ee1 100644 --- a/src/executor/onnx_to_tensorrt.h +++ b/src/operator/subgraph/tensorrt/onnx_to_tensorrt.h @@ -1,5 +1,5 @@ -#ifndef MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ -#define MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file @@ -20,7 +20,7 @@ */ /*! 
- * Copyright (c) 2018 by Contributors + * Copyright (c) 2019 by Contributors * \file onnx_to_tensorrt.h * \brief TensorRT integration with the MXNet executor * \author Marek Kolodziej, Clement Fuji Tsang @@ -28,13 +28,15 @@ #if MXNET_USE_TENSORRT +#include +#include + #include #include -#include #include #include - -#include "../operator/contrib/tensorrt-inl.h" +#include +#include namespace onnx_to_tensorrt { @@ -64,7 +66,7 @@ class TRT_Logger : public nvinfer1::ILogger { } }; -nvinfer1::ICudaEngine* onnxToTrtCtx( +std::tuple onnxToTrtCtx( const std::string& onnx_model, int32_t max_batch_size = 32, size_t max_workspace_size = 1L << 30, @@ -74,4 +76,4 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( #endif // MXNET_USE_TENSORRT -#endif // MXNET_EXECUTOR_ONNX_TO_TENSORRT_H_ +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_ONNX_TO_TENSORRT_H_ diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h new file mode 100644 index 000000000000..e258d892aaba --- /dev/null +++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h @@ -0,0 +1,240 @@ +#ifndef MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ +#define MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file tensorrt-inl.h + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include + +#include +#include +#include + +#include "../common.h" +#include "../subgraph_property.h" +#include "nnvm_to_onnx-inl.h" +#include "./onnx_to_tensorrt.h" + +namespace mxnet { +namespace op { + +using int64 = ::google::protobuf::int64; + +struct TRTParam { + std::unordered_map inputs_to_idx; + std::unordered_map outputs_to_idx; + std::unordered_map params_map; +}; + +struct TRTEngineParam { + TRTEngineParam(nvinfer1::ICudaEngine* trt_engine, + nvonnxparser::IParser* _parser, + const std::unordered_map input_map, + const std::unordered_map output_map) { + binding_order = std::make_shared > >(); + bindings = std::make_shared >(); + binding_order->reserve(trt_engine->getNbBindings()); + bindings->resize(trt_engine->getNbBindings()); + for (int b = 0; b < trt_engine->getNbBindings(); ++b) { + const std::string& binding_name = trt_engine->getBindingName(b); + if (trt_engine->bindingIsInput(b)) { + binding_order->emplace_back(input_map.at(binding_name), true); + } else { + binding_order->emplace_back(output_map.at(binding_name), false); + } + } + trt_executor = trt_engine->createExecutionContext(); + trt_parser = _parser; + } + + ~TRTEngineParam() { + trt_parser->destroy(); + trt_executor->destroy(); + } + nvinfer1::IExecutionContext* trt_executor; + nvonnxparser::IParser* trt_parser; + std::shared_ptr > > binding_order; + std::shared_ptr > bindings; +}; + +class TensorrtSelector : public SubgraphSelector { + public: + const std::unordered_set unconditionalTRTops = { + "BatchNorm", + "clip", + "Concat", + "Convolution", + "Dropout", + "elemwise_add", + "elemwise_sub", + "elemwise_mul", + "Flatten", + "FullyConnected", + "mean", + "Pad", + "relu", + "rsqrt", + "SoftmaxOutput" + }; + + const std::unordered_set withWeightsOps = { + "BatchNorm", + "Convolution", + "FullyConnected" + }; + + bool isTRTCompatible(const nnvm::Node &n) { + const std::string op_name = n.op()->name; + if (op_name == "Pooling") { + return (n.attrs.dict.at("pool_type") == "avg" || + n.attrs.dict.at("pool_type") == "max"); + } + + if (unconditionalTRTops.count(op_name)) { + return true; + } + + if (op_name == "Activation") { + return n.attrs.dict.at("act_type") == "relu" || + n.attrs.dict.at("act_type") == "tanh" || + n.attrs.dict.at("act_type") == "sigmoid"; + } + + return false; + } + + bool Select(const nnvm::Node &n) override { + return !n.is_variable() && isTRTCompatible(n); + } + + bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override { + if (new_node.is_variable()) { + if (withWeightsOps.count(n.op()->name)) { + return n.inputs[0].node->attrs.name != new_node.attrs.name; + } else { + return false; + } + } + return isTRTCompatible(new_node); + } + + bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override { + return isTRTCompatible(new_node); + } + + std::vector Filter(const std::vector& candidates) override { + bool found_one = false; + // TensorRT is interesting with at least 2 operations + for (auto& n : candidates) { + if (!n->is_variable()) { + if (found_one) { + return candidates; + } else { + found_one = true; + } + } + } + return std::vector(); + } +}; + +class TensorrtProperty : public SubgraphProperty { + public: + static SubgraphPropertyPtr Create() { + return std::make_shared(); + } + + nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym, + const int subgraph_id) const 
override { + nnvm::NodePtr n = nnvm::Node::Create(); + nnvm::Symbol new_sym; + std::unique_copy(sym.outputs.begin(), sym.outputs.end(), + std::back_inserter(new_sym.outputs), []( + nnvm::NodeEntry lhs, nnvm::NodeEntry rhs) { + return lhs.index == rhs.index && lhs.node.get() == rhs.node.get(); + }); + n->attrs.name = "TensorRT" + std::to_string(subgraph_id); + n->attrs.op = Op::Get("_TensorRT"); + CHECK(n->attrs.op); + n->attrs.subgraphs.emplace_back(std::make_shared(new_sym)); + std::ostringstream params_oss; + for (auto &e : new_sym.ListInputNames(nnvm::Symbol::kAll)) { + params_oss << e << ";"; + } + auto tensorrt_params_names = params_oss.str(); + tensorrt_params_names.pop_back(); + n->attrs.dict["subgraph_params_names"] = tensorrt_params_names; + TRTParam param; + n->attrs.parsed = param; + n->op()->attr_parser(&(n->attrs)); + return n; + } + + SubgraphSelectorPtr CreateSubgraphSelector() const override { + return std::make_shared(); + } + + void ConnectSubgraphOutputs(const nnvm::NodePtr subgraph_node, \ + std::vector* output_entries) const override { + std::vector& outputs = subgraph_node->attrs.subgraphs[0]->outputs; + TRTParam& _params = nnvm::get(subgraph_node->attrs.parsed); + for (size_t i = 0; i < outputs.size(); i++) { + auto& o = outputs[i]; + for (auto& e : *output_entries) { + if (o.index == e->index && o.node.get() == e->node.get()) { + e->index = i; + e->node = subgraph_node; + // TODO(cfujitsang): For future support this would fail + // if the node have multiple outputs + _params.outputs_to_idx[o.node->attrs.name] = i; + } + } + } + subgraph_node->attrs.parsed = std::move(_params); + } + + void ConnectSubgraphInputs(const nnvm::NodePtr subgraph_node, + std::vector* input_entries, + std::vector* orig_input_entries) const override { + TRTParam& _params = nnvm::get(subgraph_node->attrs.parsed); + subgraph_node->inputs.clear(); + subgraph_node->inputs.resize(orig_input_entries->size()); + for (size_t i = 0; i < orig_input_entries->size(); ++i) { + subgraph_node->inputs[i] = orig_input_entries->at(i); + _params.inputs_to_idx[input_entries->at(i)->node->attrs.name] = i; + } + subgraph_node->attrs.parsed = std::move(_params); + } +}; + + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT + +#endif // MXNET_OPERATOR_SUBGRAPH_TENSORRT_TENSORRT_INL_H_ diff --git a/src/operator/subgraph/tensorrt/tensorrt.cc b/src/operator/subgraph/tensorrt/tensorrt.cc new file mode 100644 index 000000000000..25c7bc8e7fde --- /dev/null +++ b/src/operator/subgraph/tensorrt/tensorrt.cc @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file tensorrt.cc + * \brief TensorRT operation registration + * \author Marek Kolodziej, Clement Fuji Tsang +*/ + +#if MXNET_USE_TENSORRT + +#include "./tensorrt-inl.h" + +namespace mxnet { +namespace op { + +inline uint32_t TRTNumInputs(const nnvm::NodeAttrs& attrs) { + const TRTParam& param = nnvm::get(attrs.parsed); + const auto inputs_to_idx = param.inputs_to_idx; + return inputs_to_idx.size(); +} + +inline std::vector TRTListInputNames(const nnvm::NodeAttrs& attrs) { + std::vector outputs; + const TRTParam& param = nnvm::get(attrs.parsed); + const auto inputs_to_idx = param.inputs_to_idx; + for (auto& p : inputs_to_idx) { + outputs[p.second] = p.first; + } + return outputs; +} + +inline bool TRTInferShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shapes, + std::vector *out_shapes) { + using namespace exec; + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_shapes->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_shapes->size()); + + // Put the input and output shapes to the shape vector. + nnvm::ShapeVector shapes(idx_g.num_node_entries()); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_shapes->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + shapes[eid] = it_params->second.shape(); + } else if (it_inputs != inputs_to_idx.end()) { + shapes[eid] = in_shapes->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name << " shape information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_shapes->size()); + for (size_t i = 0; i < out_shapes->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + shapes[eid] = out_shapes->at(i); + } + + // Infer shape of the graph. + g.attrs["shape"] = std::make_shared(std::move(shapes)); + g = exec::InferShape(std::move(g)); + // Copy the inferred shape back to the input shapes and the output shapes. + shapes = g.GetAttr("shape"); + // assign to in_shapes + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + SHAPE_ASSIGN_CHECK(*in_shapes, it->second, shapes[eid]); + } + } + // assign to out_shapes + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + SHAPE_ASSIGN_CHECK(*out_shapes, i, shapes[eid]); + } + // Check if we have inferred the shapes correctly. 
+ return g.GetAttr("shape_num_unknown_nodes") == 0; +} + +inline bool TRTInferType(const nnvm::NodeAttrs& attrs, + std::vector *in_types, + std::vector *out_types) { + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_types->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_types->size()); + + // Put the input and output data types to the dtype vector. + nnvm::DTypeVector types(idx_g.num_node_entries(), -1); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_types->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + types[eid] = it_params->second.dtype(); + } else if (it_inputs != inputs_to_idx.end()) { + types[eid] = in_types->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name + << " dtype information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_types->size()); + for (size_t i = 0; i < out_types->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + types[eid] = out_types->at(i); + } + + // Infer data type of the graph. + g.attrs["dtype"] = std::make_shared(std::move(types)); + g = exec::InferType(std::move(g)); + + types = g.GetAttr("dtype"); + // assign to in_types + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + TYPE_ASSIGN_CHECK(*in_types, it->second, types[eid]); + } + } + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + TYPE_ASSIGN_CHECK(*out_types, i, types[eid]); + } + + // Check if we have inferred the dtypes correctly. + return g.GetAttr("dtype_num_unknown_nodes") == 0; +} + +inline bool TRTInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_stypes, + std::vector* out_stypes) { + const nnvm::Symbol subgraph_sym = *(attrs.subgraphs[0]); + const TRTParam& param = nnvm::get(attrs.parsed); + auto params_map = param.params_map; + auto inputs_to_idx = param.inputs_to_idx; + nnvm::Graph g; + g.outputs = subgraph_sym.outputs; + const auto& idx_g = g.indexed_graph(); + CHECK_EQ(idx_g.input_nodes().size(), in_stypes->size() + params_map.size()); + CHECK_EQ(idx_g.outputs().size(), out_stypes->size()); + exec::DevMaskVector dev_masks(idx_g.num_node_entries(), dev_mask); + + // Put the input and output storages to the storage vector. 
+ StorageTypeVector stypes(idx_g.num_node_entries(), kUndefinedStorage); + const auto &input_nids = idx_g.input_nodes(); + CHECK_EQ(input_nids.size(), in_stypes->size() + params_map.size()); + for (size_t i = 0; i < input_nids.size(); i++) { + auto node = idx_g[input_nids[i]].source; + auto eid = idx_g.entry_id(input_nids[i], 0); + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + stypes[eid] = it_params->second.storage_type(); + } else if (it_inputs != inputs_to_idx.end()) { + stypes[eid] = in_stypes->at(it_inputs->second); + } else { + LOG(FATAL) << node->attrs.name + << " storage type information is missing for attributes inference"; + } + } + CHECK_EQ(g.outputs.size(), out_stypes->size()); + for (size_t i = 0; i < out_stypes->size(); i++) { + auto eid = idx_g.entry_id(g.outputs[i]); + stypes[eid] = out_stypes->at(i); + } + + // Infer storage type of the graph. + bool dev_match = g.attrs.count("dev_mask") && + g.GetAttr("dev_mask") == dev_masks; + if (!dev_match) { + g.attrs["dev_mask"] = std::make_shared(std::move(dev_masks)); + } + g.attrs["storage_type"] = std::make_shared(std::move(stypes)); + g = exec::InferStorageType(std::move(g)); + + stypes = g.GetAttr("storage_type"); + // assign to in_types + for (size_t i = 0; i < input_nids.size(); ++i) { + const auto node = idx_g[input_nids[i]].source; + const auto eid = idx_g.entry_id(input_nids[i], 0); + auto it = inputs_to_idx.find(node->attrs.name); + if (it != inputs_to_idx.end()) { + STORAGE_TYPE_ASSIGN_CHECK(*in_stypes, it->second, stypes[eid]); + } + } + + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + // assign to out_types + for (size_t i = 0; i < g.outputs.size(); ++i) { + const auto eid = idx_g.entry_id(g.outputs[i]); + STORAGE_TYPE_ASSIGN_CHECK(*out_stypes, i, stypes[eid]); + } + // Check if we have inferred the storages correctly. 
+ return g.GetAttr("storage_type_num_unknown_nodes") == 0; +} + +void TRTParamParser(nnvm::NodeAttrs* attrs) { + TRTParam& _param = nnvm::get(attrs->parsed); + std::string prefix = "subgraph_param_"; + std::string str_dtype, str_shape, str_pointer, _tmp; + for (auto it = attrs->dict.begin(); it != attrs->dict.end();) { + std::string attrs_name = it->first; + if (std::equal(prefix.begin(), prefix.end(), attrs_name.begin())) { + std::string param_name = attrs_name.substr(prefix.size(), + attrs_name.size() - prefix.size()); + // TODO(cfujitsang): find a less dirty way to give weights + NDArray *cache = reinterpret_cast(stol(it->second)); + _param.params_map.emplace(param_name, cache->Copy(Context())); + _param.params_map[param_name].WaitToRead(); + it = attrs->dict.erase(it); + } else { + ++it; + } + } + attrs->parsed = std::move(_param); +} + +OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context ctx, + const std::vector& in_shape, + const std::vector& in_type) { + const auto& node_param = nnvm::get(attrs.parsed); + nnvm::Graph graph; + graph.outputs = attrs.subgraphs[0]->outputs; + uint32_t max_batch_size = dmlc::GetEnv("MXNET_TENSORRT_MAX_BATCH_SIZE", in_shape[0][0]); + if (max_batch_size < in_shape[0][0]) { + LOG(INFO) << "Warning: max batch size changed to be is: " << in_shape[0][0] + << " instead of: " << max_batch_size; + max_batch_size = in_shape[0][0]; + } + const auto& params_map = node_param.params_map; + const auto& inputs_to_idx = node_param.inputs_to_idx; + const auto& outputs_to_idx = node_param.outputs_to_idx; + const auto& idx_g = graph.indexed_graph(); + const auto& input_nids = idx_g.input_nodes(); + nnvm::ShapeVector shape_inputs(input_nids.size()); + nnvm::DTypeVector dtype_inputs(input_nids.size()); + for (int i = 0; i < input_nids.size(); ++i) { + auto node = idx_g[input_nids[i]].source; + auto it_params = params_map.find(node->attrs.name); + auto it_inputs = inputs_to_idx.find(node->attrs.name); + if (it_params != params_map.end()) { + shape_inputs[i] = it_params->second.shape(); + dtype_inputs[i] = it_params->second.dtype(); + } else if (it_inputs != inputs_to_idx.end()) { + shape_inputs[i] = in_shape[it_inputs->second]; + dtype_inputs[i] = in_type[it_inputs->second]; + } else { + LOG(FATAL) << node->attrs.name << " attribute is missing for attributes inference"; + } + } + std::vector out_shape(graph.outputs.size()); + std::vector out_type(graph.outputs.size(), -1); + std::vector _in_shape(in_shape.begin(), in_shape.end()); + std::vector _in_type(in_type.begin(), in_type.end()); + TRTInferShape(attrs, &_in_shape, &out_shape); + TRTInferType(attrs, &_in_type, &out_type); + nnvm::DTypeVector dtypes(idx_g.num_node_entries()); + nnvm::ShapeVector shapes(idx_g.num_node_entries()); + for (int i = 0; i < graph.outputs.size(); ++i) { + auto eid = idx_g.entry_id(graph.outputs[i]); + dtypes[eid] = out_type[i]; + shapes[eid] = out_shape[i]; + } + graph.attrs["dtype_inputs"] = std::make_shared(std::move(dtype_inputs)); + graph.attrs["shape_inputs"] = std::make_shared(std::move(shape_inputs)); + graph.attrs["dtype"] = std::make_shared(std::move(dtypes)); + graph.attrs["shape"] = std::make_shared(std::move(shapes)); + auto onnx_graph = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(graph, ¶ms_map); + auto trt_tuple = ::onnx_to_tensorrt::onnxToTrtCtx(onnx_graph, max_batch_size, 1 << 30); + return OpStatePtr::Create(std::get<0>(trt_tuple), std::get<1>(trt_tuple), + inputs_to_idx, outputs_to_idx); +} + +NNVM_REGISTER_OP(_TensorRT) + .describe(R"code(TRT operation (one engine) 
+)code" ADD_FILELINE) + .set_num_inputs(TRTNumInputs) + .set_num_outputs(DefaultSubgraphOpNumOutputs) + .set_attr_parser(TRTParamParser) + .set_attr("FInferShape", TRTInferShape) + .set_attr("FInferType", TRTInferType) + .set_attr("FListInputNames", TRTListInputNames) + .set_attr("FListOutputNames", DefaultSubgraphOpListOutputs) + .set_attr("FCreateOpState", TRTCreateState) + .set_attr("FInferStorageType", TRTInferStorageType); + +MXNET_REGISTER_SUBGRAPH_PROPERTY(TensorRT, TensorrtProperty); +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_TENSORRT diff --git a/src/operator/contrib/tensorrt.cu b/src/operator/subgraph/tensorrt/tensorrt.cu similarity index 69% rename from src/operator/contrib/tensorrt.cu rename to src/operator/subgraph/tensorrt/tensorrt.cu index 9a9c3c024366..4a5b23b3a9f7 100644 --- a/src/operator/contrib/tensorrt.cu +++ b/src/operator/subgraph/tensorrt/tensorrt.cu @@ -19,8 +19,8 @@ /*! * Copyright (c) 2018 by Contributors - * \file trt.cu - * \brief TensorRT GPU operation + * \file tensorrt.cu + * \brief TensorRT GPU operation registration * \author Marek Kolodziej, Clement Fuji Tsang */ @@ -41,30 +41,26 @@ namespace op { } while (0) void TRTCompute(const OpStatePtr& state, const OpContext& ctx, - const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { using namespace mshadow; using namespace mshadow::expr; - - Stream* s = ctx.get_stream(); - cudaStream_t cuda_s = Stream::GetStream(s); + cudaStream_t cuda_s = Stream::GetStream(ctx.get_stream()); const auto& param = state.get_state(); - std::vector bindings; - bindings.reserve(param.binding_map.size()); - for (auto& p : param.binding_map) { - if (p.second == nnvm_to_onnx::TypeIO::Inputs) { - bindings.emplace_back(inputs[p.first].dptr_); + for (size_t i = 0; i < param.binding_order->size(); ++i) { + auto& p = param.binding_order->at(i); + if (p.second == true) { + param.bindings->at(i) = inputs[p.first].dptr_; } else { - bindings.emplace_back(outputs[p.first].dptr_); + param.bindings->at(i) = outputs[p.first].dptr_; } } - const int batch_size = static_cast(inputs[0].shape_[0]); - param.trt_executor->enqueue(batch_size, bindings.data(), cuda_s, nullptr); - CHECK_CUDART(cudaStreamSynchronize(cuda_s)); + param.trt_executor->enqueue(batch_size, param.bindings->data(), cuda_s, nullptr); } -NNVM_REGISTER_OP(_trt_op) +NNVM_REGISTER_OP(_TensorRT) .set_attr("FStatefulCompute", TRTCompute); } // namespace op diff --git a/tests/cpp/misc/serialization.cc b/tests/cpp/misc/serialization.cc deleted file mode 100644 index 77014238c2fa..000000000000 --- a/tests/cpp/misc/serialization.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include -#include <../../../src/common/serialization.h> - -using namespace mxnet; -using namespace std; - -/* - * Test that used datastruct are properly serialized and deserialized - */ - -TEST(SerializerTest, InputMapCorrect) { - std::map input_map; - input_map.emplace("input_0", 2); - input_map.emplace("another_input", 0); - input_map.emplace("last_input", 1); - std::string serialized_data; - common::Serialize(input_map, &serialized_data); - std::map deserialized_input_map; - common::Deserialize(&deserialized_input_map, serialized_data); - ASSERT_EQ(input_map.size(), deserialized_input_map.size()); - for (auto& p : input_map) { - auto it = deserialized_input_map.find(p.first); - ASSERT_NE(it, deserialized_input_map.end()); - ASSERT_EQ(it->second, p.second); - } -} - -TEST(SerializerTest, OutputMapCorrect) { - std::map > output_map; - output_map.emplace("output_0", std::make_tuple(1, mxnet::TShape({23, 12, 63, 432}), 0, 1)); - output_map.emplace("another_output", std::make_tuple(2, mxnet::TShape({23, 123}), 14, -23)); - output_map.emplace("last_output", std::make_tuple(0, mxnet::TShape({0}), -1, 0)); - std::string serialized_data; - common::Serialize(output_map, &serialized_data); - std::map > deserialized_output_map; - common::Deserialize(&deserialized_output_map, serialized_data); - ASSERT_EQ(output_map.size(), deserialized_output_map.size()); - for (auto& p : output_map) { - auto it = deserialized_output_map.find(p.first); - ASSERT_NE(it, deserialized_output_map.end()); - auto lhs = it->second; - auto rhs = p.second; - ASSERT_EQ(std::get<0>(lhs), std::get<0>(rhs)); - ASSERT_EQ(std::get<1>(lhs), std::get<1>(rhs)); - ASSERT_EQ(std::get<2>(lhs), std::get<2>(rhs)); - ASSERT_EQ(std::get<3>(lhs), std::get<3>(rhs)); - } -} - diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py index 8edd9abf70e7..e679c05894a3 100644 --- a/tests/python/tensorrt/lenet5_train.py +++ b/tests/python/tensorrt/lenet5_train.py @@ -24,6 +24,7 @@ def lenet5(): """LeNet-5 Symbol""" #pylint: disable=no-member data = mx.sym.Variable('data') + data = mx.sym.Cast(data, 'float16') conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", @@ -39,6 +40,7 @@ def lenet5(): tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") # second fullc fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) + fc2 = mx.sym.Cast(fc2, 'float32') # loss lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') #pylint: enable=no-member diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py index 4fdd522341bc..9282bc6f0de6 100644 --- a/tests/python/tensorrt/test_cvnets.py +++ b/tests/python/tensorrt/test_cvnets.py @@ -27,28 +27,22 @@ def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) + mx.contrib.tensorrt.set_use_fp16(False) h, w = 32, 32 net = gluoncv.model_zoo.get_model(model_name, pretrained=True) - data = mx.sym.var('data') - + net.hybridize() + net.forward(mx.nd.zeros((batch_size, 3, h, w))) + net.export(model_name) + _sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) if use_tensorrt: - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - all_params = dict([(k, v.data()) for k, v in net.collect_params().items()]) - executor = mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, - data=(batch_size,3, h, w), - 
softmax_label=(batch_size,), grad_req='null', - force_rebind=True) + sym = _sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(sym, arg_params, aux_params) else: - # Convert gluon model to Symbolic - net.hybridize() - net.forward(mx.ndarray.zeros((batch_size, 3, h, w))) - net.export(model_name) - symbol, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) - executor = symbol.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), - softmax_label=(batch_size,)) - executor.copy_params_from(arg_params, aux_params) + sym = _sym + executor = sym.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), + softmax_label=(batch_size,), + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) return executor @@ -126,7 +120,7 @@ def run_experiment_for(model_name, batch_size, num_workers): def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): - original_try_value = mx.contrib.tensorrt.get_use_tensorrt() + original_use_fp16 = mx.contrib.tensorrt.get_use_fp16() try: models = [ 'cifar_resnet20_v1', @@ -170,7 +164,7 @@ def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): print("Test duration: %.2f seconds" % test_duration) finally: - mx.contrib.tensorrt.set_use_tensorrt(original_try_value) + mx.contrib.tensorrt.set_use_fp16(original_use_fp16) if __name__ == '__main__': diff --git a/tests/python/tensorrt/test_cycle.py b/tests/python/tensorrt/test_cycle.py deleted file mode 100644 index 25f515a106a6..000000000000 --- a/tests/python/tensorrt/test_cycle.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -from common import * - - -def detect_cycle_from(sym, visited, stack): - visited.add(sym.handle.value) - stack.add(sym.handle.value) - for s in sym.get_children(): - if s.handle.value not in visited: - if detect_cycle_from(sym, visited, stack): - return True - elif s.handle.value in stack: - return True - stack.remove(sym.handle.value) - return False - - -def has_no_cycle(sym): - visited = set() - stack = set() - all_nodes = sym.get_internals() - for s in all_nodes: - if s.handle.value in visited: - if detect_cycle_from(s, visited, stack): - return False - return True - - -def test_simple_cycle(): - inp = mx.sym.Variable('input', shape=[1,10]) - A = mx.sym.FullyConnected(data=inp, num_hidden=10, no_bias=False, name='A') - B = mx.sym.FullyConnected(data=A, num_hidden=10, no_bias=False, name='B') - D = mx.sym.sin(data=A, name='D') - C = mx.sym.elemwise_add(lhs=B, rhs=D, name='C') - arg_params = { - 'I_weight': mx.nd.zeros([10,10]), - 'I_bias': mx.nd.zeros([10]), - 'A_weight': mx.nd.zeros([10,10]), - 'A_bias': mx.nd.zeros([10]), - 'B_weight': mx.nd.zeros([10,10]), - 'B_bias': mx.nd.zeros([10]), - } - - executor = C.simple_bind(ctx=mx.gpu(0), data=(1,10), softmax_label=(1,), - shared_buffer=arg_params, grad_req='null', force_rebind=True) - optimized_graph = mx.contrib.tensorrt.get_optimized_symbol(executor) - assert has_no_cycle(optimized_graph), "The graph optimized by TRT contains a cycle" - - -if __name__ == '__main__': - import nose - nose.runmodule() diff --git a/tests/python/tensorrt/test_resnet18.py b/tests/python/tensorrt/test_resnet18.py index fff3ac5dd768..36a7f33fe6a0 100644 --- a/tests/python/tensorrt/test_resnet18.py +++ b/tests/python/tensorrt/test_resnet18.py @@ -25,7 +25,6 @@ url = 'https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true' model_file_name = 'resnet18_v2_trt_test' - def get_image(image_url): fname = mx.test_utils.download(image_url, fname=image_url.split('/')[-1].split('?')[0]) img = mx.image.imread(fname) @@ -33,8 +32,7 @@ def get_image(image_url): img = img.transpose((2, 0, 1)) # Channel first img = img.expand_dims(axis=0) # Batchify img = mx.nd.cast(img, dtype=np.float32) - return img/255.0 - + return img / 255.0 def test_tensorrt_resnet18_feature_vect(): print("downloading sample input") @@ -45,24 +43,32 @@ def test_tensorrt_resnet18_feature_vect(): gluon_resnet18.export(model_file_name) sym, arg_params, aux_params = mx.model.load_checkpoint(model_file_name, 0) - os.environ['MXNET_USE_TENSORRT'] = '0' - executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, grad_req='null', force_rebind=True) + executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) executor.copy_params_from(arg_params, aux_params) y = executor.forward(is_train=False, data=input_data) - - os.environ['MXNET_USE_TENSORRT'] = '1' - all_params = arg_params - all_params.update(aux_params) - executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(), all_params=all_params, data=batch_shape, - grad_req='null', force_rebind=True) - y_trt = executor.forward(is_train=False, data=input_data) - - no_trt_output = y[0].asnumpy()[0] - trt_output = y_trt[0].asnumpy()[0] - assert_almost_equal(no_trt_output, trt_output, 1e-4, 1e-4) - + trt_sym = sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params) + original_precision_value = mx.contrib.tensorrt.get_use_fp16() + try: + mx.contrib.tensorrt.set_use_fp16(True) + executor = 
trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + y_trt = executor.forward(is_train=False, data=input_data) + mx.contrib.tensorrt.set_use_fp16(False) + executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape, + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + y_trt_fp32 = executor.forward(is_train=False, data=input_data) + no_trt_output = y[0].asnumpy()[0] + trt_output = y_trt[0].asnumpy()[0] + trt_fp32_output = y_trt_fp32[0].asnumpy()[0] + assert_almost_equal(no_trt_output, trt_output, 1e-1, 1e-2) + assert_almost_equal(no_trt_output, trt_fp32_output, 1e-4, 1e-4) + finally: + mx.contrib.tensorrt.set_use_fp16(original_precision_value) if __name__ == '__main__': import nose - nose.runmodule() diff --git a/tests/python/tensorrt/test_tensorrt_lenet5.py b/tests/python/tensorrt/test_tensorrt_lenet5.py index 258686428a45..bdc306c0b297 100644 --- a/tests/python/tensorrt/test_tensorrt_lenet5.py +++ b/tests/python/tensorrt/test_tensorrt_lenet5.py @@ -24,24 +24,25 @@ def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_size, use_tensorrt): """Run inference with either MXNet or TensorRT""" - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) data_size = (batch_size,) + mnist['test_data'].shape[1:] + type_dict = {'data': 'float32', 'softmax_label': 'float32'} if use_tensorrt: - all_params = merge_dicts(arg_params, aux_params) - executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(0), all_params=all_params, - data=data_size, - softmax_label=(batch_size,), - grad_req='null', - force_rebind=True) + _sym = sym.get_backend_symbol('TensorRT') + mx.contrib.tensorrt.init_tensorrt_params(_sym, arg_params, aux_params) else: - executor = sym.simple_bind(ctx=mx.gpu(0), - data=data_size, - softmax_label=(batch_size,), - grad_req='null', - force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - + _sym = sym + for k, v in arg_params.items(): + type_dict[k] = v.dtype + for k, v in aux_params.items(): + type_dict[k] = v.dtype + executor = _sym.simple_bind(ctx=mx.gpu(0), + type_dict=type_dict, + data=data_size, + softmax_label=(batch_size,), + grad_req='null', + force_rebind=True) + executor.copy_params_from(arg_params, aux_params) # Get this value from all_test_labels # Also get classes from the dataset num_ex = 10000 @@ -68,39 +69,35 @@ def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_siz def test_tensorrt_inference(): """Run LeNet-5 inference comparison between MXNet and TensorRT.""" - original_try_value = mx.contrib.tensorrt.get_use_tensorrt() - try: - check_tensorrt_installation() - mnist = mx.test_utils.get_mnist() - num_epochs = 10 - batch_size = 128 - model_name = 'lenet5' - model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") - model_file = '%s/%s-symbol.json' % (model_dir, model_name) - params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) - - _, _, _, all_test_labels = get_iters(mnist, batch_size) - - # Load serialized MXNet model (model-symbol.json + model-epoch.params) - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) - - print("LeNet-5 test") - print("Running inference in MXNet") - mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - batch_size=batch_size, use_tensorrt=False) - - print("Running inference in MXNet-TensorRT") - trt_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - 
batch_size=batch_size, use_tensorrt=True) - - print("MXNet accuracy: %f" % mx_pct) - print("MXNet-TensorRT accuracy: %f" % trt_pct) - - assert abs(mx_pct - trt_pct) < 1e-2, \ - """Diff. between MXNet & TensorRT accuracy too high: - MXNet = %f, TensorRT = %f""" % (mx_pct, trt_pct) - finally: - mx.contrib.tensorrt.set_use_tensorrt(original_try_value) + check_tensorrt_installation() + mnist = mx.test_utils.get_mnist() + num_epochs = 10 + batch_size = 128 + model_name = 'lenet5' + model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") + model_file = '%s/%s-symbol.json' % (model_dir, model_name) + params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) + + _, _, _, all_test_labels = get_iters(mnist, batch_size) + + # Load serialized MXNet model (model-symbol.json + model-epoch.params) + sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) + + print("LeNet-5 test") + print("Running inference in MXNet") + mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=False) + + print("Running inference in MXNet-TensorRT") + trt_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, + batch_size=batch_size, use_tensorrt=True) + + print("MXNet accuracy: %f" % mx_pct) + print("MXNet-TensorRT accuracy: %f" % trt_pct) + + assert abs(mx_pct - trt_pct) < 1e-2, \ + """Diff. between MXNet & TensorRT accuracy too high: + MXNet = %f, TensorRT = %f""" % (mx_pct, trt_pct) if __name__ == '__main__': diff --git a/tests/python/tensorrt/test_training_warning.py b/tests/python/tensorrt/test_training_warning.py deleted file mode 100644 index fdac859aef6f..000000000000 --- a/tests/python/tensorrt/test_training_warning.py +++ /dev/null @@ -1,70 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import gluoncv -import mxnet as mx - -from tests.python.unittest.common import assertRaises - - -def test_training_without_trt(): - run_resnet(is_train=True, use_tensorrt=False) - - -def test_inference_without_trt(): - run_resnet(is_train=False, use_tensorrt=False) - - -def test_training_with_trt(): - assertRaises(RuntimeError, run_resnet, is_train=True, use_tensorrt=True) - - -def test_inference_with_trt(): - run_resnet(is_train=False, use_tensorrt=True) - - -def run_resnet(is_train, use_tensorrt): - original_trt_value = mx.contrib.tensorrt.get_use_tensorrt() - try: - mx.contrib.tensorrt.set_use_tensorrt(use_tensorrt) - ctx = mx.gpu(0) - batch_size = 1 - h = 32 - w = 32 - model_name = 'cifar_resnet20_v1' - resnet = gluoncv.model_zoo.get_model(model_name, pretrained=True) - data = mx.sym.var('data') - out = resnet(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - if is_train: - grad_req = 'write' - else: - grad_req = 'null' - if use_tensorrt: - all_params = dict([(k, v.data()) for k, v in resnet.collect_params().items()]) - mx.contrib.tensorrt.tensorrt_bind(softmax, ctx=ctx, all_params=all_params, - data=(batch_size, 3, h, w), softmax_label=(batch_size,), - force_rebind=True, grad_req=grad_req) - else: - softmax.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), softmax_label=(batch_size,), - force_rebind=True, grad_req=grad_req) - finally: - mx.contrib.tensorrt.set_use_tensorrt(original_trt_value) - - -if __name__ == '__main__': - import nose - nose.runmodule()
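The updated tests above (tests/python/tensorrt/test_resnet18.py, test_cvnets.py and test_tensorrt_lenet5.py) all drive TensorRT through the same sequence: load a checkpoint, partition the symbol with get_backend_symbol('TensorRT'), hand the relevant weights to the TensorRT nodes with init_tensorrt_params, then simple_bind and copy the remaining parameters. A minimal sketch of that flow, assuming a checkpoint prefix 'model' and a 224x224 RGB input, both placeholders rather than names taken from this patch:

    # Sketch only: 'model', the input shape and the batch size are illustrative.
    import mxnet as mx

    batch_shape = (1, 3, 224, 224)
    sym, arg_params, aux_params = mx.model.load_checkpoint('model', 0)

    # Partition supported operators into _TensorRT subgraph nodes.
    trt_sym = sym.get_backend_symbol('TensorRT')
    # Attach the weights consumed inside the TensorRT nodes, as the tests do;
    # arg_params/aux_params are left holding only the parameters that stay outside.
    mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params)

    # Run the TensorRT engines in FP16, or pass False to keep FP32.
    mx.contrib.tensorrt.set_use_fp16(True)

    executor = trt_sym.simple_bind(ctx=mx.gpu(0), data=batch_shape,
                                   grad_req='null', force_rebind=True)
    executor.copy_params_from(arg_params, aux_params)
    out = executor.forward(is_train=False, data=mx.nd.zeros(batch_shape))[0]

The resulting executor can then be compared against a plain simple_bind of the unpartitioned symbol, which is exactly what test_resnet18.py does, with a looser tolerance for the FP16 engine than for FP32.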
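The lenet5_train.py change above brackets the network body with Cast operators: the input is cast to float16 before the first convolution, and the last FullyConnected output is cast back to float32 before the SoftmaxOutput loss. Reduced to its essentials, with layer sizes copied from that script (the helper name is illustrative, not part of the patch):

    import mxnet as mx

    def fp16_body_lenet():
        data = mx.sym.Variable('data')
        data = mx.sym.Cast(data, 'float16')    # network body runs in FP16
        conv = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
        act = mx.sym.Activation(data=conv, act_type='tanh')
        fc = mx.sym.FullyConnected(data=mx.sym.Flatten(data=act), num_hidden=10)
        fc = mx.sym.Cast(fc, 'float32')        # back to FP32 before the loss
        return mx.sym.SoftmaxOutput(data=fc, name='softmax')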