diff --git a/docs/api/python/contrib/tensorrt.md b/docs/api/python/contrib/tensorrt.md
index 2b7427dad2a5..4bac8ff706b1 100644
--- a/docs/api/python/contrib/tensorrt.md
+++ b/docs/api/python/contrib/tensorrt.md
@@ -87,7 +87,7 @@ sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, epoch)
 all_params = merge_dicts(arg_params, aux_params)
 ```
-This `all_params` dictionary cn be seem in use in the `simple_bind` call in `#2`.
+This `all_params` dictionary can be seen in use in the `simple_bind` call in `#2`.
 
 4. Once the symbol is bound, we need to feed the data and run the `forward()` method. Let's say we're using a test set data iterator called `test_iter`. We can run inference as follows:
 ```python
 for idx, dbatch in enumerate(test_iter):
diff --git a/example/image-classification/tensorrt/test_tensorrt_resnet50.py b/example/image-classification/tensorrt/test_tensorrt_resnet50.py
index b79557717950..cfcf5d0ae6a9 100644
--- a/example/image-classification/tensorrt/test_tensorrt_resnet50.py
+++ b/example/image-classification/tensorrt/test_tensorrt_resnet50.py
@@ -18,13 +18,10 @@
 from __future__ import print_function
 
 import os.path
-import subprocess
 import mxnet as mx
 import numpy as np
 from time import time
 import sys
-import urllib
-
 
 def get_use_tensorrt():
     return int(os.environ.get("MXNET_USE_TENSORRT", 0))
diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h
index 20b5deff9ed3..11d3c5b0127b 100644
--- a/include/mxnet/executor.h
+++ b/include/mxnet/executor.h
@@ -164,7 +164,7 @@ class Executor {
                               std::vector<NDArray>* arg_grads,
                               std::vector<NDArray>* aux_states,
                               std::unordered_map<std::string, NDArray>*
-                                shared_data_arrays = nullptr,
+                              shared_data_arrays = nullptr,
                               Executor* shared_exec = nullptr);
   /*!
    * \brief the prototype of user-defined monitor callback
diff --git a/python/mxnet/base.py b/python/mxnet/base.py
index 0fb73b3c7dda..579a70a19fec 100644
--- a/python/mxnet/base.py
+++ b/python/mxnet/base.py
@@ -680,3 +680,19 @@ def write_all_str(module_file, module_all_list):
     module_op_file.close()
     write_all_str(module_internal_file, module_internal_all)
     module_internal_file.close()
+
+def cint(init_val=0):
+    """create a C int with an optional initial value"""
+    return C.c_int(init_val)
+
+def int_addr(x):
+    """given a c_int, return its address as an int ptr"""
+    x_addr = C.addressof(x)
+    int_p = C.POINTER(C.c_int)
+    x_int_addr = C.cast(x_addr, int_p)
+    return x_int_addr
+
+def checked_call(f, *args):
+    """call a cuda function and check for success"""
+    error_t = f(*args)
+    assert error_t == 0, "Failing cuda call %s returns %s." % (f.__name__, error_t)
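The three ctypes helpers added to `base.py` above are shared with `cuda_utils.py` below. A minimal, hedged sketch of the intended call pattern, assuming `libcudart` can be located on the system (the `cudaGetDeviceCount` call is illustrative; any CUDA runtime entry point that fills an output pointer is used the same way):

```python
# Sketch: using the helpers now exported from mxnet.base to call the CUDA runtime.
# Assumes libcudart is installed; cuda_utils.py locates it the same way via find_library.
import ctypes as C
from ctypes.util import find_library

from mxnet.base import cint, int_addr, checked_call

cudart = C.CDLL(find_library('cudart'))

count = cint()                                             # fresh c_int(0) to receive the result
checked_call(cudart.cudaGetDeviceCount, int_addr(count))   # asserts the returned cudaError_t is 0
print("CUDA devices visible:", count.value)
```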
diff --git a/python/mxnet/cuda_utils.py b/python/mxnet/cuda_utils.py
index 11f8fa439952..89adf19f6e98 100644
--- a/python/mxnet/cuda_utils.py
+++ b/python/mxnet/cuda_utils.py
@@ -25,22 +25,7 @@
 # As a stand-alone program, it prints a list of unique cuda SM architectures
 import ctypes as C
 from ctypes.util import find_library
-
-def cint(init_val=0):
-    """create a C int with an optional initial value"""
-    return C.c_int(init_val)
-
-def int_addr(x):
-    """given a c_int, return it's address as an int ptr"""
-    x_addr = C.addressof(x)
-    INTP = C.POINTER(C.c_int)
-    x_int_addr = C.cast(x_addr, INTP)
-    return x_int_addr
-
-def checked_call(f, *args):
-    """call a cuda function and check for success"""
-    error_t = f(*args)
-    assert error_t == 0, "Failing cuda call %s returns %s." % (f.__name__, error_t)
+from .base import cint, int_addr, checked_call
 
 def find_cuda_lib(candidates):
     for candidate in candidates:
@@ -84,7 +69,3 @@ def unique_sm_arches():
     for device_id in range(device_count):
         archs.add(get_sm_arch(device_id))
     return sorted(archs)
-
-# print a list of unique cuda SM architectures on the system
-if __name__ == '__main__':
-    print(' '.join(str(x) for x in unique_sm_arches()))
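With its stand-alone `__main__` entry point removed, `cuda_utils` is now import-only. A short, hedged usage sketch (the function names are taken from the hunk above; `get_sm_arch` is only referenced there, so its exact behaviour is assumed):

```python
# Sketch: querying SM architectures through the module instead of running it as a script.
from mxnet import cuda_utils

# Sorted list of distinct SM architectures across all visible GPUs, e.g. [52, 70].
print(cuda_utils.unique_sm_arches())

# Architecture of a single device; assumed to take a device ordinal, as in the loop above.
print(cuda_utils.get_sm_arch(0))
```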
diff --git a/src/common/serialization.h b/src/common/serialization.h
index 5fb72ad3c999..54a4cda70dde 100644
--- a/src/common/serialization.h
+++ b/src/common/serialization.h
@@ -138,7 +138,7 @@ inline void deserialize(std::tuple<Args...>* obj, const std::string& buffer, siz
 
 template<typename T>
-struct is_cont {
+struct is_container {
   static const bool value = !std::is_pod<T>::value;
 };
 
@@ -149,7 +149,7 @@ inline size_t serialized_size(const T& obj) {
 template<typename T>
 inline size_t serialized_size(const nnvm::Tuple<T>& obj) {
-  if (is_cont<T>::value) {
+  if (is_container<T>::value) {
     size_t sum_val = 4;
     for (auto& el : obj) {
       sum_val += serialized_size(el);
     }
@@ -162,7 +162,7 @@ inline size_t serialized_size(const nnvm::Tuple<T>& obj) {
 template<typename T>
 inline size_t serialized_size(const std::vector<T>& obj) {
-  if (is_cont<T>::value) {
+  if (is_container<T>::value) {
     size_t sum_val = 4;
     for (T i : obj) {
       sum_val += serialized_size(i);
     }
@@ -181,16 +181,16 @@ inline size_t serialized_size(const std::pair<A, B>& obj) {
 template<typename K, typename V>
 inline size_t serialized_size(const std::map<K, V>& obj) {
   size_t sum_val = 4;
-  if (is_cont<K>::value && is_cont<V>::value) {
+  if (is_container<K>::value && is_container<V>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.first) + serialized_size(p.second);
     }
-  } else if (is_cont<K>::value) {
+  } else if (is_container<K>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.first);
     }
     sum_val += sizeof(V) * obj.size();
-  } else if (is_cont<V>::value) {
+  } else if (is_container<V>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.second);
     }
@@ -204,16 +204,16 @@ inline size_t serialized_size(const std::map<K, V>& obj) {
 template<typename K, typename V>
 inline size_t serialized_size(const std::unordered_map<K, V>& obj) {
   size_t sum_val = 4;
-  if (is_cont<K>::value && is_cont<V>::value) {
+  if (is_container<K>::value && is_container<V>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.first) + serialized_size(p.second);
     }
-  } else if (is_cont<K>::value) {
+  } else if (is_container<K>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.first);
     }
     sum_val += sizeof(V) * obj.size();
-  } else if (is_cont<V>::value) {
+  } else if (is_container<V>::value) {
     for (auto p : obj) {
       sum_val += serialized_size(p.second);
     }
@@ -226,7 +226,7 @@ inline size_t serialized_size(const std::unordered_map<K, V>& obj) {
 template<typename T>
 inline size_t serialized_size(const std::set<T>& obj) {
-  if (is_cont<T>::value) {
+  if (is_container<T>::value) {
     size_t sum_val = 4;
     for (auto& el : obj) {
       sum_val += serialized_size(el);
     }
@@ -239,7 +239,7 @@ inline size_t serialized_size(const std::set<T>& obj) {
 template<typename T>
 inline size_t serialized_size(const std::unordered_set<T>& obj) {
-  if (is_cont<T>::value) {
+  if (is_container<T>::value) {
     size_t sum_val = 4;
     for (auto& el : obj) {
       sum_val += serialized_size(el);
     }
@@ -279,7 +279,7 @@ inline size_t serialized_size(const std::tuple<Args...>& obj) {
 
 // SERIALIZE
 template<typename T>
-inline size_t serialize_cont_size(const T& obj, char** buffer) {
+inline size_t serialize_container_size(const T& obj, char** buffer) {
   uint32_t size = obj.size();
   std::memcpy(*buffer, &size, 4);
   *buffer += 4;
@@ -304,8 +304,8 @@ inline void serialize(const nnvm::Tuple<T>& obj, char** buffer) {
 template<typename T>
 inline void serialize(const std::vector<T>& obj, char** buffer) {
-  auto size = serialize_cont_size(obj, buffer);
-  if (is_cont<T>::value) {
+  auto size = serialize_container_size(obj, buffer);
+  if (is_container<T>::value) {
     for (auto& el : obj) {
       serialize(el, buffer);
     }
@@ -323,7 +323,7 @@ inline void serialize(const std::pair<A, B>& obj, char** buffer) {
 template<typename K, typename V>
 inline void serialize(const std::map<K, V>& obj, char** buffer) {
-  serialize_cont_size(obj, buffer);
+  serialize_container_size(obj, buffer);
   for (auto& p : obj) {
     serialize(p.first, buffer);
     serialize(p.second, buffer);
@@ -332,7 +332,7 @@ inline void serialize(const std::map<K, V>& obj, char** buffer) {
 template<typename K, typename V>
 inline void serialize(const std::unordered_map<K, V>& obj, char** buffer) {
-  serialize_cont_size(obj, buffer);
+  serialize_container_size(obj, buffer);
   for (auto& p : obj) {
     serialize(p.first, buffer);
     serialize(p.second, buffer);
@@ -341,7 +341,7 @@ inline void serialize(const std::unordered_map<K, V>& obj, char** buffer) {
 template<typename T>
 inline void serialize(const std::set<T>& obj, char** buffer) {
-  serialize_cont_size(obj, buffer);
+  serialize_container_size(obj, buffer);
   for (auto& el : obj) {
     serialize(el, buffer);
   }
@@ -349,7 +349,7 @@ inline void serialize(const std::set<T>& obj, char** buffer) {
 template<typename T>
 inline void serialize(const std::unordered_set<T>& obj, char** buffer) {
-  serialize_cont_size(obj, buffer);
+  serialize_container_size(obj, buffer);
   for (auto& el : obj) {
     serialize(el, buffer);
   }
@@ -357,7 +357,7 @@ inline void serialize(const std::unordered_set<T>& obj, char** buffer) {
 template<>
 inline void serialize(const std::string& obj, char** buffer) {
-  auto size = serialize_cont_size(obj, buffer);
+  auto size = serialize_container_size(obj, buffer);
   std::memcpy(*buffer, &obj[0], size);
   *buffer += size;
 }
@@ -387,7 +387,7 @@ inline void serialize(const std::tuple<Args...>& obj, char** buffer) {
 
 // Deserializer
 template<typename T>
-inline size_t deserialize_cont_size(T* obj, const std::string& buffer, size_t* curr_pos) {
+inline size_t deserialize_container_size(T* obj, const std::string& buffer, size_t* curr_pos) {
   uint32_t size = obj->size();
   std::memcpy(&size, &buffer[*curr_pos], 4);
   *curr_pos += 4;
@@ -413,9 +413,9 @@ inline void deserialize(nnvm::Tuple<T>* obj, const std::string& buffer, size_t*
 template<typename T>
 inline void deserialize(std::vector<T>* obj, const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   obj->resize(size);
-  if (is_cont<T>::value) {
+  if (is_container<T>::value) {
     for (size_t i = 0; i < size; ++i) {
       deserialize((*obj)[i], buffer, curr_pos);
     }
@@ -433,7 +433,7 @@ inline void deserialize(std::pair<A, B>* obj, const std::string& buffer, size_t*
 template<typename K, typename V>
 inline void deserialize(std::map<K, V>* obj, const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   K first;
   for (size_t i = 0; i < size; ++i) {
     deserialize(&first, buffer, curr_pos);
@@ -444,7 +444,7 @@ inline void deserialize(std::map<K, V>* obj, const std::string& buffer, size_t*
 template<typename K, typename V>
 inline void deserialize(std::unordered_map<K, V>* obj,
                         const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   K first;
   for (size_t i = 0; i < size; ++i) {
     deserialize(first, buffer, curr_pos);
@@ -454,7 +454,7 @@ inline void deserialize(std::unordered_map<K, V>* obj,
 template<typename K>
 inline void deserialize(std::set<K>* obj, const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   K first;
   for (size_t i = 0; i < size; ++i) {
     deserialize(first, buffer, curr_pos);
@@ -464,7 +464,7 @@ inline void deserialize(std::set<K>* obj, const std::string& buffer, size_t* cur
 template<typename K>
 inline void deserialize(std::unordered_set<K>* obj, const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   K first;
   for (size_t i = 0; i < size; ++i) {
     deserialize(first, buffer, curr_pos);
@@ -474,7 +474,7 @@ inline void deserialize(std::unordered_set<K>* obj, const std::string& buffer, s
 template<>
 inline void deserialize(std::string* obj, const std::string& buffer, size_t* curr_pos) {
-  auto size = deserialize_cont_size(obj, buffer, curr_pos);
+  auto size = deserialize_container_size(obj, buffer, curr_pos);
   obj->resize(size);
   std::memcpy(&(obj->front()), &buffer[*curr_pos], size);
   *curr_pos += size;
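The renames above are mechanical, but the header's call pattern is easiest to see end to end. A hedged round-trip sketch using only the signatures visible in these hunks (the `mxnet::common` namespace and the include path are assumptions):

```cpp
// Sketch: size a buffer, serialize into it through a moving write cursor,
// then deserialize from the resulting string while tracking the read position.
#include <map>
#include <string>
#include <vector>

#include "common/serialization.h"  // include path inside the MXNet source tree (assumed)

void RoundTripExample() {
  std::map<std::string, std::vector<int>> original = {{"a", {1, 2, 3}}, {"b", {4}}};

  // serialized_size() reports how many bytes serialize() will write.
  std::string buffer(mxnet::common::serialized_size(original), ' ');
  char* write_ptr = &buffer[0];
  mxnet::common::serialize(original, &write_ptr);

  // deserialize() reads back from the string, advancing the position cursor.
  std::map<std::string, std::vector<int>> restored;
  size_t read_pos = 0;
  mxnet::common::deserialize(&restored, buffer, &read_pos);
}
```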
diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h
index b136cca03b79..8a00b2e8cd24 100644
--- a/src/executor/exec_pass.h
+++ b/src/executor/exec_pass.h
@@ -213,7 +213,7 @@ bool DefaultStorageType(const nnvm::NodeAttrs& attrs,
  * \brief Replace subgraphs by TRT (forward only)
  */
 Graph ReplaceSubgraph(Graph&& g,
-                      std::unordered_set<nnvm::Node*> set_subgraph,
+                      const std::unordered_set<nnvm::Node*>& set_subgraph,
                       std::unordered_map<std::string, NDArray>* const params_map);
 
 std::vector<std::unordered_set<nnvm::Node*>> GetTrtCompatibleSubsets(const Graph& g,
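These two declarations are the whole public surface of the pass. The caller in the graph executor presumably stitches them together roughly as follows; the second argument of `GetTrtCompatibleSubsets`, the `mxnet::exec` namespace, and the include paths are assumptions based only on the signatures shown here:

```cpp
// Sketch: find the TensorRT-compatible subgraphs, then fold each one into a single
// TRT node, moving the graph through ReplaceSubgraph as its rvalue signature requires.
#include <unordered_map>
#include <utility>

#include <nnvm/graph.h>
#include "executor/exec_pass.h"  // assumed include path within the MXNet tree

nnvm::Graph ApplyTensorRTPass(nnvm::Graph g,
                              std::unordered_map<std::string, mxnet::NDArray>* params_map) {
  auto subsets = mxnet::exec::GetTrtCompatibleSubsets(g, params_map);
  for (const auto& subset : subsets) {
    g = mxnet::exec::ReplaceSubgraph(std::move(g), subset, params_map);
  }
  return g;
}
```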
diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc
index 5b9ce5c7f6ae..bb29f8f362fc 100644
--- a/src/executor/tensorrt_pass.cc
+++ b/src/executor/tensorrt_pass.cc
@@ -88,18 +88,18 @@ class BidirectionalGraph {
   template <typename FVisit>
   void DFS(const std::vector<Node*>& heads, bool reverse, FVisit fvisit) {
     std::unordered_set<Node*> visited;
-    std::deque<Node*> stack(heads.begin(), heads.end());
+    std::vector<Node*> vec(heads.begin(), heads.end());
     visited.reserve(heads.size());
-    while (!stack.empty()) {
-      Node* vertex = stack.back();
-      stack.pop_back();
+    while (!vec.empty()) {
+      Node* vertex = vec.back();
+      vec.pop_back();
       if (visited.count(vertex) == 0) {
         visited.insert(vertex);
         fvisit(vertex);
         std::vector<Node*> nexts = reverse ? vertex->inputs : vertex->outputs;
         for (Node* node : nexts) {
           if (visited.count(node) == 0) {
-            stack.emplace_back(node);
+            vec.emplace_back(node);
           }
         }
       }
@@ -337,12 +337,12 @@ Graph UpdateSubgraphAttrs(Graph&& subgraph, const Graph& g,
   const auto& idx = g.indexed_graph();
   const auto& sub_idx = subgraph.indexed_graph();
 
-  const auto shape = g.GetAttr<nnvm::ShapeVector>("shape");
-  const auto dtype = g.GetAttr<nnvm::DTypeVector>("dtype");
-  const auto storage_type = g.GetAttr<StorageTypeVector>("storage_type");
-  const auto shape_inputs = g.GetAttr<nnvm::ShapeVector>("shape_inputs");
-  const auto dtype_inputs = g.GetAttr<nnvm::DTypeVector>("dtype_inputs");
-  const auto storage_type_inputs = g.GetAttr<StorageTypeVector>("storage_type_inputs");
+  const auto& shape = g.GetAttr<nnvm::ShapeVector>("shape");
+  const auto& dtype = g.GetAttr<nnvm::DTypeVector>("dtype");
+  const auto& storage_type = g.GetAttr<StorageTypeVector>("storage_type");
+  const auto& shape_inputs = g.GetAttr<nnvm::ShapeVector>("shape_inputs");
+  const auto& dtype_inputs = g.GetAttr<nnvm::DTypeVector>("dtype_inputs");
+  const auto& storage_type_inputs = g.GetAttr<StorageTypeVector>("storage_type_inputs");
 
   nnvm::ShapeVector sub_shape(sub_idx.num_node_entries());
   nnvm::DTypeVector sub_dtype(sub_idx.num_node_entries());
@@ -453,7 +453,7 @@ void dispNodesSet(Graph g, std::unordered_set<nnvm::Node*> s) {
  * \brief Replace a set of nodes by a TensorRT node
  */
 Graph ReplaceSubgraph(Graph&& g,
-                      std::unordered_set<nnvm::Node*> set_subgraph,
+                      const std::unordered_set<nnvm::Node*>& set_subgraph,
                       std::unordered_map<std::string, NDArray>* const params_map) {
   // Create MXNet subgraph
   Graph subgraph;
diff --git a/src/operator/contrib/tensorrt-inl.h b/src/operator/contrib/tensorrt-inl.h
index be4248fc762b..be335ab1208f 100644
--- a/src/operator/contrib/tensorrt-inl.h
+++ b/src/operator/contrib/tensorrt-inl.h
@@ -105,33 +105,6 @@ struct TRTEngineParam {
   std::vector > binding_map;
 };
 
 
-OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context ctx,
-                          const std::vector<TShape>& ishape,
-                          const std::vector<int>& itype);
-
-template <typename xpu>
-void TRTCompute(const OpStatePtr& state, const OpContext& ctx,
-                const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
-                const std::vector<TBlob>& outputs);
-
-inline bool TRTInferShape(const NodeAttrs& attrs,
-                          std::vector<TShape> *in_shape,
-                          std::vector<TShape> *out_shape);
-
-inline bool TRTInferStorageType(const NodeAttrs& attrs,
-                                const int dev_mask,
-                                DispatchMode* dispatch_mode,
-                                std::vector<int> *in_storage_type,
-                                std::vector<int> *out_storage_type);
-
-inline bool TRTInferType(const NodeAttrs& attrs,
-                         std::vector<int> *in_dtype,
-                         std::vector<int> *out_dtype);
-
-inline std::vector<std::string> TRTListInputNames(const NodeAttrs& attrs);
-
-inline std::vector<std::string> TRTListOutputNames(const NodeAttrs& attrs);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/contrib/tensorrt.cc b/src/operator/contrib/tensorrt.cc
index 535c7875e333..9ee0b7c337e6 100644
--- a/src/operator/contrib/tensorrt.cc
+++ b/src/operator/contrib/tensorrt.cc
@@ -116,14 +116,6 @@ void TRTParamParser(nnvm::NodeAttrs* attrs) {
   attrs->parsed = std::move(param_);
 }
 
-template <>
-void TRTCompute<cpu>(const OpStatePtr& state, const OpContext& ctx,
-                     const std::vector<TBlob>& inputs,
-                     const std::vector<OpReqType>& req,
-                     const std::vector<TBlob>& outputs) {
-  LOG(FATAL) << "TRTCompute not implemented on the CPU";
-}
-
 inline bool TRTInferShape(const NodeAttrs& attrs, std::vector<TShape>* in_shape,
                           std::vector<TShape>* out_shape) {
   const auto node_param = nnvm::get<TRTParam>(attrs.parsed);
@@ -187,7 +179,6 @@ NNVM_REGISTER_OP(_trt_op)
 .set_attr<nnvm::FListInputNames>("FListInputNames", TRTListInputNames)
 .set_attr<nnvm::FListOutputNames>("FListOutputNames", TRTListOutputNames)
 .set_attr<FCreateOpState>("FCreateOpState", TRTCreateState)
-.set_attr<FStatefulCompute>("FStatefulCompute<cpu>", TRTCompute<cpu>)
 .set_attr<FInferStorageType>("FInferStorageType", TRTInferStorageType);
 
 }  // namespace op
diff --git a/src/operator/contrib/tensorrt.cu b/src/operator/contrib/tensorrt.cu
index 5211b0b9b039..2fe8727b73e4 100644
--- a/src/operator/contrib/tensorrt.cu
+++ b/src/operator/contrib/tensorrt.cu
@@ -40,8 +40,7 @@ namespace op {
   }                                                    \
 } while (0)
 
-template<>
-void TRTCompute<gpu>(const OpStatePtr& state, const OpContext& ctx,
+void TRTCompute(const OpStatePtr& state, const OpContext& ctx,
                 const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
                 const std::vector<TBlob>& outputs) {
   using namespace mshadow;
@@ -66,7 +65,7 @@ void TRTCompute<gpu>(const OpStatePtr& state, const OpContext& ctx,
 }
 
 NNVM_REGISTER_OP(_trt_op)
-.set_attr<FStatefulCompute>("FStatefulCompute<gpu>", TRTCompute<gpu>);
+.set_attr<FStatefulCompute>("FStatefulCompute<gpu>", TRTCompute);
 
 }  // namespace op
 }  // namespace mxnet
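Taken together, the patch is exercised from Python roughly as the documentation hunk at the top describes: enable the TensorRT pass, bind with the merged parameter dictionary, and run forward passes on a GPU (the `_trt_op` registered above keeps only a GPU compute function). A hedged end-to-end sketch; the checkpoint prefix, batch shape, and the stand-in iterator are placeholders:

```python
import os
import mxnet as mx

os.environ["MXNET_USE_TENSORRT"] = "1"   # same switch test_tensorrt_resnet50.py reads

# Placeholder checkpoint; any classification symbol with 'data'/'softmax_label' works.
sym, arg_params, aux_params = mx.model.load_checkpoint("resnet50", 0)
all_params = dict(arg_params, **aux_params)   # equivalent of merge_dicts() from the docs

batch_size = 32
executor = sym.simple_bind(ctx=mx.gpu(0), data=(batch_size, 3, 224, 224),
                           softmax_label=(batch_size,), grad_req='null',
                           shared_buffer=all_params)

# Stand-in for the test_iter used in the documentation example.
test_iter = mx.io.NDArrayIter(mx.nd.zeros((batch_size, 3, 224, 224)),
                              label=mx.nd.zeros((batch_size,)), batch_size=batch_size)

for dbatch in test_iter:
    executor.arg_dict['data'][:] = dbatch.data[0]
    executor.forward(is_train=False)
    probs = executor.outputs[0].asnumpy()
```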