This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Pointwise fusion for GPU #15167

Merged: 112 commits, Nov 1, 2019
Commits (112)
9653b67
Beginning of RTC of pointwise ops
ptrendx Apr 9, 2019
0e1774f
Code generation from the given JSON
ptrendx Apr 29, 2019
8bf2945
add initial simple_partition_pass and use it for pointwise fusion
Caenorst May 5, 2019
5cbb50d
fix the fusion, use a symbol.Copy() at the beginning of binding funct…
Caenorst May 5, 2019
fcf23c7
Fixes
Caenorst May 5, 2019
892c18f
Adding support for attribute inference for backward nodes when fusing
ptrendx May 23, 2019
0a342a0
keep proper input ordering for fused Op
Caenorst May 24, 2019
07de800
instantiate the indexed_graph before starting the subgraph replacemen…
Caenorst May 24, 2019
975e8a6
Fuse backward
ptrendx May 27, 2019
6d9c0bf
fix ordering of subgraph node inputs using subgraph topological order…
Caenorst May 29, 2019
384fbb0
exclude forward node fusion during the fusion of the nodes in the bac…
Caenorst May 29, 2019
b9506ff
Dealing with fused backward nodes inferattr
ptrendx Jun 3, 2019
f30fbbb
use subgraph.indexed_graph() instead of main for _FusedOpHelper nodes…
Caenorst Jun 4, 2019
1a2e30d
Adding support for other reqs in codegen
ptrendx Jun 4, 2019
15fbed5
Fix
ptrendx Jun 4, 2019
506b126
Cleaning
ptrendx Jun 4, 2019
cf88753
Change the TVM submodule
ptrendx Jun 4, 2019
b861af9
More cleaning
ptrendx Jun 4, 2019
d001b5d
Making linter happy
ptrendx Jun 5, 2019
48f1b94
Do fusion only if default context is GPU
ptrendx Jun 5, 2019
37d4bbf
Fixes for tests
ptrendx Jun 5, 2019
616b932
Fix the TVM commit
ptrendx Jun 6, 2019
56303c8
Fix lint
ptrendx Jun 6, 2019
00e61cf
Guard fusion with MXNET_USE_CUDA
ptrendx Jun 6, 2019
204ab30
Fix
ptrendx Jun 6, 2019
0e89f8c
Fix clang-tidy
ptrendx Jun 10, 2019
73a2a5c
Add erf and erfinv backward
ptrendx Jun 12, 2019
4d0f1c9
Gluon support for fusion
ptrendx Jun 10, 2019
3dddad7
Cleaning
ptrendx Jun 13, 2019
5067fa6
Cleaning and allow shape/type change in FusedOp
ptrendx Jun 13, 2019
b27a369
Fixing Gluon bugs
ptrendx Jun 13, 2019
f18847c
Fixing after rebase
ptrendx Jun 14, 2019
9a05327
Fixing race condition and guarding against races when using NVRTC
ptrendx Jun 14, 2019
309f9a7
Cleaning and renaming FusedOp to _FusedOp
ptrendx Jun 14, 2019
9617b03
Going easy on Windows compiler
ptrendx Jun 14, 2019
d730027
Merge branch 'upstream' into pr_fusion
ptrendx Jun 19, 2019
de9027b
Disable fusion on Windows for now
ptrendx Jun 19, 2019
3d2d715
Refactor InferAttr and InferShapeAttr
ptrendx Jun 19, 2019
5221677
Added slice and half2 support to FusedOp
nvchai Jun 17, 2019
f3e4f7a
Fix lint errors
nvchai Jun 17, 2019
84822e1
Added multiple types support for vector loading/storing
nvchai Jun 18, 2019
2896258
add slice fusion when it's at the beginning of subgraphs
Caenorst Jun 18, 2019
eb0151c
Removed constant ndim assumption in fused op
nvchai Jun 18, 2019
935342f
Fix memory alignment issue in slice for FusedOp
nvchai Jun 18, 2019
ffa6c63
Fixes
nvchai Jun 19, 2019
803fd2a
Fix lint errors
nvchai Jun 19, 2019
3ed3aef
Do not include cuda_fp16.h
ptrendx Jun 19, 2019
84c2df5
Refactor fused op op lists
ptrendx Jun 19, 2019
1d94365
Make linter happy
ptrendx Jun 19, 2019
844cb9f
Changes from review
ptrendx Jun 19, 2019
204b127
Fixes after rebase
ptrendx Jun 20, 2019
56eb99d
Expand FusedOp support for slice
nvchai Jun 19, 2019
e31b586
Fix for fp16 _zeros and _ones
ptrendx Jun 20, 2019
c611b56
Fix
ptrendx Jun 20, 2019
d0d0fcf
Moving aux functions to unnamed namespace and detail namespace -> fusion
ptrendx Jun 21, 2019
39e309f
Merge branch 'upstream' into pr_fusion
ptrendx Jun 25, 2019
7f12eac
Disabling fusion if it alters topological order of inputs
ptrendx Jun 25, 2019
654a358
Print code only when env variable is set
ptrendx Jun 25, 2019
32b690a
Fix
ptrendx Jun 25, 2019
39bfcf6
Fix lint and 2 tests that specify the same names for multiple inputs
ptrendx Jun 25, 2019
b109a38
Fixes from review and disabling fusion of slice with non-default step
ptrendx Jun 26, 2019
f1a14fd
Add amp_cast to fusion, fixes
ptrendx Jun 27, 2019
a72b980
Add amp_multicast and its backward to the list of supported ops
ptrendx Jul 2, 2019
e4e674e
Apply wording suggestions from code review
ptrendx Jul 2, 2019
5766481
Apply wording suggestions from code review
ptrendx Jul 2, 2019
62513e6
Make clearer comment
ptrendx Jul 2, 2019
dd651d3
Adding punctuation and capitalization to \brief descriptions
ptrendx Jul 2, 2019
7974888
Fix
ptrendx Jul 2, 2019
2aa8950
Fix
ptrendx Jul 3, 2019
a96e778
Add backward_cast to fusion
ptrendx Jul 8, 2019
9ea5464
Adding unittests for fusion. Fix for erfinv_grad
ptrendx Jul 8, 2019
6c3a75a
Adding slice ops and add_n to tests
ptrendx Jul 9, 2019
6d0eaf3
Fixes from review
ptrendx Jul 10, 2019
70735f2
Setting inplace option
ptrendx Jul 11, 2019
9049086
Fix lint
ptrendx Jul 12, 2019
6f56a8b
Storing double in half
ptrendx Jul 12, 2019
171c24f
Retrigger CI
ptrendx Jul 19, 2019
26b19ed
Slight relaxing of the relative tolerance in the test
ptrendx Jul 23, 2019
551c3b7
Merge branch 'upstream' into pr_fusion
ptrendx Jul 23, 2019
912e831
Move the env variable check to the end
ptrendx Jul 24, 2019
052576e
Fix a race condition between InferShape and scheduled Forward
ptrendx Jul 25, 2019
0e1918f
Fix flakey test_fusion test involving fp32 erfinv op.
DickJC123 Jul 26, 2019
1bbdba6
Merge branch 'upstream' into pr_fusion
ptrendx Jul 26, 2019
7e1df6a
Fix from review
ptrendx Jul 29, 2019
7a92738
Added broadcast_like and slice_like to fused op
nvchai Jul 11, 2019
a1dee58
Minor fix and cleanup
nvchai Jul 11, 2019
36201fe
Added negative axis support in slice_axis, temporarily disabled fusio…
nvchai Aug 1, 2019
c077e97
Added axes support to slice_like
nvchai Aug 2, 2019
3f0bfb4
Added axis support to broadcast_like
nvchai Aug 2, 2019
1e20339
Add fast_load_slice function to fused op code
nvchai Aug 9, 2019
13b3076
Added runtime switch for choosing fast and slow slice kernel
nvchai Aug 14, 2019
e5649e1
Fix lint and warning
ptrendx Aug 20, 2019
868bcf6
Going easy on Windows compiler (again)
ptrendx Aug 21, 2019
1608d6a
Fix slice_like
ptrendx Sep 4, 2019
037a5de
Debug broadcast_like fusion
ptrendx Sep 4, 2019
e501bc9
Fix lint
ptrendx Sep 5, 2019
e0ca7d0
Fix lint
ptrendx Sep 10, 2019
8d3dc77
Trigger CI
ptrendx Sep 11, 2019
786b071
Get rid of the initializer list
ptrendx Sep 11, 2019
0720f66
Fix backward calls with different gradient type
ptrendx Sep 16, 2019
da8bfe3
avoid cycle when adding node specific for inputs of subgraph for poin…
Caenorst Sep 19, 2019
ed03595
Fix lint
ptrendx Sep 20, 2019
69facdc
Add namespace to the fusion implementations
ptrendx Sep 23, 2019
a5ee989
Merge branch 'upstream' into pr_fusion
ptrendx Oct 21, 2019
e26770b
Set launch bounds on the fused kernel
ptrendx Oct 21, 2019
80e36ba
Fix NumPy tests
ptrendx Oct 21, 2019
36e5ce8
Test showcasing an issue fixed in PR #16553
ptrendx Oct 22, 2019
f77fe5b
Cast scalars to FP32 and perform (a*1.0/b) instead of (a/b)
MoisesHer Oct 23, 2019
fdf710e
Merge branch 'upstream' into pr_fusion
ptrendx Oct 24, 2019
76aa154
Fix a bug in cycle detection for inputs only op in pointwise fusion
Caenorst Oct 28, 2019
929b8e9
Merge branch 'upstream' into pr_fusion
ptrendx Oct 29, 2019
3d1b5af
Add comments to simple_partition_pass.h file
ptrendx Oct 31, 2019
25 changes: 18 additions & 7 deletions docs/static_site/src/pages/api/faq/env_var.md
@@ -200,12 +200,12 @@ The following environments can be used to profile the application without changi

* MXNET_PROFILER_AUTOSTART
- Values: 0(false) or 1(true) ```(default=0)```
- Set to 1, MXNet starts the profiler automatically. The profiling result is stored into profile.json in the working directory.

* MXNET_PROFILER_MODE
- Values: 0(false) or 1(true) ```(default=0)```
- If set to '0', profiler records the events of the symbolic operators.
- If set to '1', profiler records the events of all operators.

## Interface between Python and the C API

@@ -241,14 +241,14 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.

* MXNET_CUDA_ALLOW_TENSOR_CORE
- 0(false) or 1(true) ```(default=1)```
- If set to '0', disallows Tensor Core use in CUDA ops.
- If set to '1', allows Tensor Core use in CUDA ops.
- This variable can only be set once in a session.

* MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
- 0(false) or 1(true) ```(default=0)```
- If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores
- If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.

* MXNET_CUDA_LIB_CHECKING
- 0(false) or 1(true) ```(default=1)```
@@ -328,6 +328,17 @@ If ctypes is used, it must be `mxnet._ctypes.ndarray.NDArrayBase`.
with float32.
- Model accuracies do not necessarily improve with this environment variable turned on.

* MXNET_USE_FUSION
- Values: 0(false) or 1(true) ```(default=1)```
- If set to '1', MXNet will try to fuse some operations (currently pointwise operations only).
- It works in symbolic execution as well as in Gluon models hybridized with the ```static_alloc=True``` option.
- Only applies to MXNet that has been compiled with CUDA (```pip install mxnet-cuXX``` or built from source with ```USE_CUDA=1```) and is running on a GPU.

* MXNET_FUSION_VERBOSE
- Values: 0(false) or 1(true) ```(default=0)```
- Only applies to MXNet that has been compiled with CUDA and when the ```MXNET_USE_FUSION``` option is enabled.
- If set to '1', MXNet will print the generated code for the fused operators (a minimal sketch of how these variables are read follows this list).
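
For quick reference, here is a minimal C++ sketch of how these flags can be read with `dmlc::GetEnv`. The GPU/`MXNET_USE_FUSION` check mirrors the gating code added to `GraphExecutor::InitGraph` later in this diff; reading `MXNET_FUSION_VERBOSE` the same way is an assumption based on the entry above, and the function names are invented for this sketch.

```cpp
#include <dmlc/parameter.h>  // dmlc::GetEnv
#include <mxnet/base.h>      // mxnet::Context

// Sketch only: mirrors the check added to GraphExecutor::InitGraph below.
// The MXNET_FUSION_VERBOSE lookup is an assumption based on the entry above.
inline bool FusionEnabled(const mxnet::Context& default_ctx) {
  return default_ctx.dev_mask() == mxnet::Context::kGPU &&
         dmlc::GetEnv("MXNET_USE_FUSION", true);
}

inline bool FusionVerbose() {
  return dmlc::GetEnv("MXNET_FUSION_VERBOSE", false);
}
```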

Settings for Minimum Memory Usage
---------------------------------
- Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
79 changes: 79 additions & 0 deletions src/common/exec_utils.cc
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

/*!
* \file exec_utils.cc
* \brief Implementation of executor util functions.
*/

#include "exec_utils.h"
#include <unordered_set>
#include <unordered_map>
#include <string>

namespace mxnet {
namespace common {

void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables) {
using nnvm::Node;
using nnvm::NodePtr;
using nnvm::NodeEntry;
std::unordered_map<Node*, NodePtr> old_new;
// use DFSVisit to copy all the nodes
DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) {
NodePtr np;
if (copy_variables || !node->is_variable()) {
np = Node::Create();
np->attrs = node->attrs;
} else {
np = node;
}
old_new[node.get()] = std::move(np);
});
// connect nodes of new graph
for (const auto &kv : old_new) {
for (const NodeEntry& e : kv.first->inputs) {
Node *ptr = e.node.get();
kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version});
}
for (const NodePtr& p : kv.first->control_deps) {
kv.second->control_deps.emplace_back(old_new[p.get()]);
}
}
// set the head
for (const NodeEntry &e : src.outputs) {
(*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version});
}
}

bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) {
std::unordered_set<std::string> names;
for (const auto& nid : idx.input_nodes()) {
const std::string &name = idx[nid].source->attrs.name;
if (names.count(name)) {
LOG(WARNING) << "Variable name " << name << " is used more than once!";
return false;
}
names.insert(name);
}
return true;
}

} // namespace common
} // namespace mxnet
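
The intended call pattern for these two helpers appears in the `graph_executor.cc` hunk further below; here is a brief self-contained sketch of that pattern, with `OptimizeWithFallback`, `RunSomePass`, and the include path invented for illustration.

```cpp
#include <utility>
#include <nnvm/graph.h>
#include "common/exec_utils.h"  // include path assumed for illustration

// Keep an untouched copy of the graph so an optimization pass can be skipped
// or rolled back; RunSomePass stands in for a pass such as pointwise fusion.
nnvm::Graph OptimizeWithFallback(nnvm::Graph g,
                                 nnvm::Graph (*RunSomePass)(nnvm::Graph&&)) {
  nnvm::Graph backup;
  mxnet::common::CopyGraph(&backup, g, /* copy_variables = */ false);
  if (!mxnet::common::CheckForInputNameDuplicates(backup.indexed_graph())) {
    return g;  // duplicated input names: leave the graph unmodified
  }
  g = RunSomePass(std::move(g));
  // A real caller (see GraphExecutor::InitGraph below) compares the input
  // order of g against backup here and restores backup on a mismatch.
  return g;
}
```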
19 changes: 19 additions & 0 deletions src/common/exec_utils.h
@@ -621,6 +621,25 @@ inline nnvm::Graph AssignContext(nnvm::Graph g,
return g;
}

/*!
* \brief Copy the graph, optionally leaving original Variable nodes.
*
* \param dst destination graph
* \param src source graph being copied
* \param copy_variables whether to copy or reuse Variable nodes from the
* source graph
*/
void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables);

/*!
* \brief Check whether graph contains any duplicated names in its inputs.
*
* \param idx Indexed graph being checked
*
* \return true if there are no duplicates, false otherwise
*/
bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx);

} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_EXEC_UTILS_H_
42 changes: 42 additions & 0 deletions src/executor/exec_pass.h
@@ -34,10 +34,34 @@
#include <vector>
#include <memory>
#include <string>
#include <utility>
#include <tuple>

namespace mxnet {
namespace exec {

template <typename Attr>
using FAccessSubgraphAttr = std::function<std::tuple<const nnvm::NodePtr,
std::vector<Attr>,
std::vector<Attr>>
(const NodeAttrs& attrs)>;

using FAccessSubgraphShape = FAccessSubgraphAttr<mxnet::TShape>;
using FAccessSubgraphType = FAccessSubgraphAttr<int>;
using FAccessSubgraphStorageType = FAccessSubgraphAttr<int>;

template <typename Attr>
using FProvideSubgraphAttr = std::function<void (const NodeAttrs& attrs,
const std::vector<nnvm::NodePtr> &nodes,
const std::vector<std::vector<Attr>> &in_attrs,
const std::vector<std::vector<Attr>> &out_attrs)>;
using FProvideSubgraphShape = FProvideSubgraphAttr<mxnet::TShape>;
using FProvideSubgraphType = FProvideSubgraphAttr<int>;
using FProvideSubgraphStorageType = FProvideSubgraphAttr<int>;

using TIsFusion = bool;
using TIsFusionHelper = bool;
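
As an illustration of how these signatures are meant to be satisfied, a hypothetical functor matching `FAccessSubgraphShape` could look like the sketch below. `HypotheticalFusedState` and its members are invented here, and the intended semantics (returning the subgraph node plus per-input and per-output shapes) are an assumption based on the type names, not taken from the FusedOp implementation.

```cpp
#include <tuple>
#include <vector>
#include <mxnet/tuple.h>  // mxnet::TShape
#include <nnvm/node.h>    // nnvm::NodePtr, nnvm::NodeAttrs

// Invented for this sketch; the real fused-op state lives in the FusedOp code.
struct HypotheticalFusedState {
  nnvm::NodePtr subgraph_node;
  std::vector<mxnet::TShape> in_shapes;
  std::vector<mxnet::TShape> out_shapes;
};

// A function matching the FAccessSubgraphShape signature: given the fused
// node's attributes, return the subgraph node plus per-input and per-output
// shape vectors (semantics assumed from the names).
std::tuple<const nnvm::NodePtr, std::vector<mxnet::TShape>, std::vector<mxnet::TShape>>
AccessSubgraphShapeExample(const nnvm::NodeAttrs& attrs) {
  const auto& state = nnvm::get<HypotheticalFusedState>(attrs.parsed);
  return std::make_tuple(state.subgraph_node, state.in_shapes, state.out_shapes);
}
```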

/*! \brief reuse graph definition */
using nnvm::Graph;

@@ -170,6 +194,24 @@ void AttachOpResources(const Graph& g,
*/
Graph DetectInplaceAddTo(Graph g);

/*!
* \brief Fuse pointwise operations in the forward pass.
*
* \param g input graph (needs to be entire graph, not just forward part)
*
* \return graph with fused pointwise operations in the forward pass
*/
Graph FusePointwiseForward(Graph&& g);

/*!
* \brief Fuse pointwise operations in the backward pass.
*
* \param g input graph (needs to be entire graph, not just forward part)
*
* \return graph with fused pointwise operations in the backward pass
*/
Graph FusePointwiseBackward(Graph&& g);
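
For reference, a condensed sketch of how these two passes are invoked, based on the `GraphExecutor::InitGraph` hunk later in this diff; the wrapper name and include path are invented for the sketch.

```cpp
#include <memory>
#include <utility>
#include <nnvm/graph.h>
#include "exec_pass.h"  // include path assumed for illustration

// Condensed from GraphExecutor::InitGraph below: both passes take the whole
// graph and expect "num_forward_outputs" to be set as a graph attribute.
nnvm::Graph ApplyPointwiseFusion(nnvm::Graph g, size_t num_forward_outputs) {
  g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs);
  g = mxnet::exec::FusePointwiseForward(std::move(g));
  g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs);
  g = mxnet::exec::FusePointwiseBackward(std::move(g));
  return g;
}
```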

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
48 changes: 44 additions & 4 deletions src/executor/graph_executor.cc
@@ -26,6 +26,7 @@
#include <nnvm/graph.h>
#include <nnvm/pass_functions.h>
#include <vector>
#include <set>
#include <algorithm>

#include "./exec_pass.h"
@@ -337,6 +338,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
if (!need_grad_) return g;
for (size_t i = 0; i < g.outputs.size(); ++i) {
NodeEntry ngrad(nnvm::Node::Create(), 0, 0);
ngrad.node->attrs.name = "_head_grad_" + std::to_string(i);
head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i]));
head_grad_map_[ngrad.node.get()] = i;
}
@@ -377,6 +379,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
for (const auto &e : g_grad.outputs) {
g.outputs.push_back(e);
}

return g;
}

@@ -796,6 +799,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol,
const nnvm::NodeEntryMap<NDArray>& feed_dict) {
nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes,
aux_state_ctxes, grad_req_types);

// The following code of shape and dtype inferences and argument
// initialization is for simple_bind only. Regular bind operation
// should do this differently.
@@ -976,6 +980,7 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
this);
return exec;
}

/*!
* \brief This function is triggered by both simple_bind
* and bind flows.
@@ -993,6 +998,41 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
if (default_ctx.dev_mask() == Context::kGPU && dmlc::GetEnv("MXNET_USE_FUSION", true)) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);

if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) {
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseForward(std::move(g));
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseBackward(std::move(g));
// Check the topological order of inputs
const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes();
const auto &new_inputs = g.indexed_graph().input_nodes();
if (original_inputs.size() != new_inputs.size()) {
LOG(WARNING)
<< "Number of inputs after fusion does not match original number of inputs. "
<< "This is most probably a bug. Disabling fusion for this run.";
g = unoptimized_graph;
} else {
for (size_t i = 0; i < new_inputs.size(); ++i) {
if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name !=
g.indexed_graph()[new_inputs[i]].source->attrs.name) {
LOG(WARNING) << "Disabling fusion due to altered topological order of inputs.";
g = unoptimized_graph;
break;
}
}
}
} else {
LOG(WARNING)
<< "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
in_arg_ctxes,
@@ -1946,7 +1986,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
symbol = exec::BuildSubgraph(symbol, backend, arg_shape_map, arg_dtype_map, arg_stype_map,
default_ctx, group2ctx, &tmp_in_arg_ctxes, &tmp_arg_grad_ctxes,
&tmp_grad_req_types, &tmp_aux_state_ctxes, verbose);
exec->Init(symbol, default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map,
tmp_grad_req_types, shared_arg_names, &tmp_in_args, &tmp_arg_grads,
&tmp_aux_states, shared_buffer, shared_exec);
@@ -1985,7 +2025,7 @@ }
}
if (!init) {
// init without subgraph
exec->Init(symbol, default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes,
exec->Init(symbol.Copy(), default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes,
arg_shape_map, arg_dtype_map, arg_stype_map, grad_req_types, shared_arg_names,
in_args, arg_grads, aux_states, shared_buffer, shared_exec);
}
@@ -2017,8 +2057,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
verbose);
}
}
exec->Init(symbol, default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, tmp_grad_req_type,
tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
tmp_grad_req_type, tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
return exec;
}
} // namespace mxnet