From 778b150c039794276b326fd278e57bc8e01a69d6 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Dec 2019 10:15:32 -0800 Subject: [PATCH 1/5] Debug the long startup time --- src/executor/pointwise_fusion_pass.cc | 29 ++++++++- src/executor/simple_partition_pass.h | 94 ++++++++++++++++++++------- 2 files changed, 97 insertions(+), 26 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 6a0d5f4efe87..5492aa6728e7 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -35,6 +35,7 @@ #include "../operator/fusion/fused_op-inl.h" #include "../operator/fusion/fused_op.h" #include "../operator/operator_common.h" +#include namespace mxnet { namespace exec { @@ -286,21 +287,42 @@ void AddInputsOnlyCompatible(const Graph &g, } } +#define TIMEIT(x) \ +{\ + auto time_start = std::chrono::high_resolution_clock::now();\ + x;\ + auto time_end = std::chrono::high_resolution_clock::now();\ + std::chrono::duration diff = time_end - time_start;\ + std::cout << #x << " took " << diff.count() << std::endl;\ +} + + Graph FusePointwiseForward(Graph &&g) { + std::cout << "Fuse forward" << std::endl; + auto start_time = std::chrono::high_resolution_clock::now(); Graph ret; g.indexed_graph(); const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_outputs); + auto subsets_start = std::chrono::high_resolution_clock::now(); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); - AddInputsOnlyCompatible(fg, &subsets, IsInputsOnlyCompatible); - g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); + auto subsets_end = std::chrono::high_resolution_clock::now(); + std::chrono::duration subsets_diff = subsets_end - subsets_start; + std::cout << "subsets took " << subsets_diff.count() << std::endl; + TIMEIT(AddInputsOnlyCompatible(fg, &subsets, IsInputsOnlyCompatible)); + TIMEIT(g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode)); ret.outputs = g.outputs; + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end_time - start_time; + std::cout << "END Fuse forward: " << diff.count() << std::endl; return ret; } Graph FusePointwiseBackward(Graph &&g) { + std::cout << "Fuse backward" << std::endl; + auto start_time = std::chrono::high_resolution_clock::now(); Graph ret; g.indexed_graph(); const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); @@ -318,6 +340,9 @@ Graph FusePointwiseBackward(Graph &&g) { }); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end_time - start_time; + std::cout << "END Fuse backward: " << diff.count() << std::endl; return ret; } #endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 5b26a4523c13..9de7fc0d0fc3 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -34,12 +34,24 @@ #include #include #include +#include #include "exec_pass.h" namespace mxnet { namespace exec { +#define TIME_START(x) \ + auto start ## x = std::chrono::high_resolution_clock::now(); + +#define TIME_END(x) \ +{\ + auto end ## x = std::chrono::high_resolution_clock::now();\ + std::chrono::duration diff = end ## x - start ## x;\ + std::cout << #x << " took " << diff.count() << std::endl;\ +} + + /*! * \brief Custom graph class, which contains bi-directional nodes @@ -102,12 +114,13 @@ class BidirectionalGraph { std::vector> get_subsets(FCompatible is_compatible) { std::vector> subgraphs; std::unordered_set incomp_set; - std::unordered_set all_set(nodes.size()); - std::vector separation_sets; + std::vector> separation_sets; + std::cout << nodes.size() << std::endl; // Check each node for compatibility // and, if it is incompatible, mark nodes // on each side of it as not possible to be // in the same subset + TIME_START(compute_separation); for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); @@ -116,47 +129,71 @@ class BidirectionalGraph { std::vector dummy_head; dummy_head.emplace_back(&node); DFS(dummy_head, false, [&out_graph, &is_compatible](Node* node) { - if (is_compatible(node->nnvmptr)) + //if (is_compatible(node->nnvmptr)) out_graph.insert(node); }); DFS(dummy_head, true, [&in_graph, is_compatible](Node* node) { - if (is_compatible(node->nnvmptr)) + //if (is_compatible(node->nnvmptr)) in_graph.insert(node); }); - if (!(in_graph.empty() || out_graph.empty())) - separation_sets.push_back(std::make_pair(in_graph, out_graph)); + if (!(in_graph.empty() || out_graph.empty())) { + separation_sets.push_back(std::make_pair(true, + std::make_pair(in_graph, out_graph))); + } else { + separation_sets.push_back(std::make_pair(false, PairSet())); + } + } else { + separation_sets.push_back(std::make_pair(false, PairSet())); } - all_set.emplace(&node); } + TIME_END(compute_separation); IncompMap incomp_map; - std::unordered_set comp_set; - comp_set.insert(all_set.begin(), all_set.end()); - for (Node* n : incomp_set) { - comp_set.erase(n); - } // For each node construct the map of nodes that cannot be in // the same subset - for (Node* n : comp_set) { - for (PairSet p : separation_sets) { - if (p.first.count(n)) { - incomp_map[n].insert(p.second.begin(), p.second.end()); - } else if (p.second.count(n)) { - incomp_map[n].insert(p.first.begin(), p.first.end()); + TIME_START(map_construct); + index_t num_nodes = nodes.size(); + for (index_t i = 0; i < num_nodes; ++i) { + const auto n = &(nodes[i]); + if (incomp_set.count(n) == 0) { + for (index_t j = i + 1; j < num_nodes; ++j) { + const auto& sep_set_pair = separation_sets[j]; + if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) { + const auto& p = sep_set_pair.second; + if (p.first.count(n)) { + incomp_map[n].insert(p.second.begin(), p.second.end()); + } else if (p.second.count(n)) { + incomp_map[n].insert(p.first.begin(), p.first.end()); + } + } + } + for (index_t j = i - 1; j >= 0; --j) { + const auto& sep_set_pair = separation_sets[j]; + if (sep_set_pair.first && incomp_map[n].count(&nodes[j]) == 0) { + const auto& p = sep_set_pair.second; + if (p.first.count(n)) { + incomp_map[n].insert(p.second.begin(), p.second.end()); + } else if (p.second.count(n)) { + incomp_map[n].insert(p.first.begin(), p.first.end()); + } + } + } + for (Node* incomp_n : incomp_set) { + incomp_map[n].erase(incomp_n); } - } - for (Node* incomp_n : incomp_set) { - incomp_map[n].erase(incomp_n); } } + TIME_END(map_construct); std::unordered_set unused_set; - unused_set.reserve(comp_set.size()); - for (auto& n : comp_set) { - unused_set.insert(n); + for (auto& n : nodes) { + if (incomp_set.count(&n) == 0) { + unused_set.insert(&n); + } } std::unordered_set visited; std::deque stack(outputs.begin(), outputs.end()); // Create subsets + TIME_START(create_subsets); while (!stack.empty()) { Node* vertex = stack.front(); stack.pop_front(); @@ -170,6 +207,7 @@ class BidirectionalGraph { } } } + TIME_END(create_subsets); return subgraphs; } @@ -420,13 +458,20 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set * which identifies which nodes should be included in * subsets. */ + template std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_compatible) { + TIME_START(big_construction); BidirectionalGraph biG = BidirectionalGraph(g); + TIME_END(big_construction); + + TIME_START(get_subsets); std::vector> subsets = biG.get_subsets(is_compatible); + TIME_END(get_subsets); std::vector nnvm_subsets; nnvm_subsets.reserve(subsets.size()); + TIME_START(prepare_subsets); for (auto& subset : subsets) { if (subset.size() > 1) { NodeRawPtrSet node_set; @@ -437,6 +482,7 @@ std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_c nnvm_subsets.push_back(node_set); } } + TIME_END(prepare_subsets); return nnvm_subsets; } From 04f47faf1725ef1f554e01a0fba3b948a22679d9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Dec 2019 14:39:59 -0800 Subject: [PATCH 2/5] Optimize backward fusion --- src/executor/simple_partition_pass.h | 47 +++++++++++++++++++--------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 9de7fc0d0fc3..149648816047 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -124,21 +124,38 @@ class BidirectionalGraph { for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); - std::unordered_set in_graph; - std::unordered_set out_graph; - std::vector dummy_head; - dummy_head.emplace_back(&node); - DFS(dummy_head, false, [&out_graph, &is_compatible](Node* node) { - //if (is_compatible(node->nnvmptr)) - out_graph.insert(node); - }); - DFS(dummy_head, true, [&in_graph, is_compatible](Node* node) { - //if (is_compatible(node->nnvmptr)) - in_graph.insert(node); - }); - if (!(in_graph.empty() || out_graph.empty())) { - separation_sets.push_back(std::make_pair(true, - std::make_pair(in_graph, out_graph))); + } + } + for (Node& node : nodes) { + if (incomp_set.count(&node) != 0) { + // Check if all your inputs and outputs are incompatible too. + // If so, then your separation set does not matter + bool inside_node = true; + for (Node* input : node.inputs) { + if (incomp_set.count(input) == 0) { + inside_node = false; + } + } + if (inside_node) { + for (Node* output : node.outputs) { + if (incomp_set.count(output) == 0) { + inside_node = false; + } + } + } + if (!inside_node) { + std::unordered_set in_graph; + std::unordered_set out_graph; + std::vector dummy_head; + dummy_head.emplace_back(&node); + DFS(dummy_head, false, [&out_graph](Node* node) { + out_graph.insert(node); + }); + DFS(dummy_head, true, [&in_graph](Node* node) { + in_graph.insert(node); + }); + separation_sets.push_back(std::make_pair(true, + std::make_pair(in_graph, out_graph))); } else { separation_sets.push_back(std::make_pair(false, PairSet())); } From f09220703f2c9774a3ef5b5ebe0f377c0b0e53ad Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Dec 2019 15:00:55 -0800 Subject: [PATCH 3/5] Figure out why the fusion pass is called twice --- src/imperative/cached_op.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index ec5a79a2e675..1edd9897ec82 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -1032,17 +1032,19 @@ OpStatePtr CachedOp::Forward( CHECK_EQ(inputs.size(), num_inputs()); Context default_ctx = inputs[0]->ctx(); - auto state_ptr = GetCachedOpState(default_ctx); - auto& state = state_ptr.get_state(); + { + auto state_ptr = GetCachedOpState(default_ctx); + auto& state = state_ptr.get_state(); - const auto& idx = state.info.fwd_graph.indexed_graph(); - for (size_t i = 0; i < inputs.size(); ++i) { - CHECK_EQ(inputs[i]->ctx(), default_ctx) - << "CachedOp requires all inputs to live on the same context. But " - << idx[idx.input_nodes()[0]].source->attrs.name - << " is on " << default_ctx << " while " - << idx[idx.input_nodes()[i]].source->attrs.name - << " is on " << inputs[i]->ctx(); + const auto& idx = state.info.fwd_graph.indexed_graph(); + for (size_t i = 0; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i]->ctx(), default_ctx) + << "CachedOp requires all inputs to live on the same context. But " + << idx[idx.input_nodes()[0]].source->attrs.name + << " is on " << default_ctx << " while " + << idx[idx.input_nodes()[i]].source->attrs.name + << " is on " << inputs[i]->ctx(); + } } int prev_bulk_size = Engine::Get()->set_bulk_size(config_.forward_bulk_size); From 0c8393d20e617fd37cd6f27cc4d8b72643474ab9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Dec 2019 15:56:32 -0800 Subject: [PATCH 4/5] Cleaning --- src/executor/pointwise_fusion_pass.cc | 29 ++------------------------- src/executor/simple_partition_pass.h | 27 ------------------------- 2 files changed, 2 insertions(+), 54 deletions(-) diff --git a/src/executor/pointwise_fusion_pass.cc b/src/executor/pointwise_fusion_pass.cc index 5492aa6728e7..6a0d5f4efe87 100644 --- a/src/executor/pointwise_fusion_pass.cc +++ b/src/executor/pointwise_fusion_pass.cc @@ -35,7 +35,6 @@ #include "../operator/fusion/fused_op-inl.h" #include "../operator/fusion/fused_op.h" #include "../operator/operator_common.h" -#include namespace mxnet { namespace exec { @@ -287,42 +286,21 @@ void AddInputsOnlyCompatible(const Graph &g, } } -#define TIMEIT(x) \ -{\ - auto time_start = std::chrono::high_resolution_clock::now();\ - x;\ - auto time_end = std::chrono::high_resolution_clock::now();\ - std::chrono::duration diff = time_end - time_start;\ - std::cout << #x << " took " << diff.count() << std::endl;\ -} - - Graph FusePointwiseForward(Graph &&g) { - std::cout << "Fuse forward" << std::endl; - auto start_time = std::chrono::high_resolution_clock::now(); Graph ret; g.indexed_graph(); const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); Graph fg; fg.outputs.insert(fg.outputs.begin(), g.outputs.begin(), g.outputs.begin() + num_forward_outputs); - auto subsets_start = std::chrono::high_resolution_clock::now(); auto subsets = GetCompatibleSubsets(fg, IsFusionCompatible); - auto subsets_end = std::chrono::high_resolution_clock::now(); - std::chrono::duration subsets_diff = subsets_end - subsets_start; - std::cout << "subsets took " << subsets_diff.count() << std::endl; - TIMEIT(AddInputsOnlyCompatible(fg, &subsets, IsInputsOnlyCompatible)); - TIMEIT(g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode)); + AddInputsOnlyCompatible(fg, &subsets, IsInputsOnlyCompatible); + g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = end_time - start_time; - std::cout << "END Fuse forward: " << diff.count() << std::endl; return ret; } Graph FusePointwiseBackward(Graph &&g) { - std::cout << "Fuse backward" << std::endl; - auto start_time = std::chrono::high_resolution_clock::now(); Graph ret; g.indexed_graph(); const auto& num_forward_outputs = g.GetAttr("num_forward_outputs"); @@ -340,9 +318,6 @@ Graph FusePointwiseBackward(Graph &&g) { }); g = ReplaceSubgraphsPointwise(std::move(g), subsets, CreateSubgraphNode); ret.outputs = g.outputs; - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration diff = end_time - start_time; - std::cout << "END Fuse backward: " << diff.count() << std::endl; return ret; } #endif // MXNET_USE_CUDA && MXNET_ENABLE_CUDA_RTC diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 149648816047..814a26e178e6 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -34,24 +34,12 @@ #include #include #include -#include #include "exec_pass.h" namespace mxnet { namespace exec { -#define TIME_START(x) \ - auto start ## x = std::chrono::high_resolution_clock::now(); - -#define TIME_END(x) \ -{\ - auto end ## x = std::chrono::high_resolution_clock::now();\ - std::chrono::duration diff = end ## x - start ## x;\ - std::cout << #x << " took " << diff.count() << std::endl;\ -} - - /*! * \brief Custom graph class, which contains bi-directional nodes @@ -115,12 +103,10 @@ class BidirectionalGraph { std::vector> subgraphs; std::unordered_set incomp_set; std::vector> separation_sets; - std::cout << nodes.size() << std::endl; // Check each node for compatibility // and, if it is incompatible, mark nodes // on each side of it as not possible to be // in the same subset - TIME_START(compute_separation); for (Node& node : nodes) { if (!is_compatible(node.nnvmptr)) { incomp_set.insert(&node); @@ -163,11 +149,9 @@ class BidirectionalGraph { separation_sets.push_back(std::make_pair(false, PairSet())); } } - TIME_END(compute_separation); IncompMap incomp_map; // For each node construct the map of nodes that cannot be in // the same subset - TIME_START(map_construct); index_t num_nodes = nodes.size(); for (index_t i = 0; i < num_nodes; ++i) { const auto n = &(nodes[i]); @@ -199,7 +183,6 @@ class BidirectionalGraph { } } } - TIME_END(map_construct); std::unordered_set unused_set; for (auto& n : nodes) { @@ -210,7 +193,6 @@ class BidirectionalGraph { std::unordered_set visited; std::deque stack(outputs.begin(), outputs.end()); // Create subsets - TIME_START(create_subsets); while (!stack.empty()) { Node* vertex = stack.front(); stack.pop_front(); @@ -224,7 +206,6 @@ class BidirectionalGraph { } } } - TIME_END(create_subsets); return subgraphs; } @@ -475,20 +456,13 @@ Graph ReplaceSubgraphs(Graph&& g, const std::vector& subgraph_set * which identifies which nodes should be included in * subsets. */ - template std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_compatible) { - TIME_START(big_construction); BidirectionalGraph biG = BidirectionalGraph(g); - TIME_END(big_construction); - - TIME_START(get_subsets); std::vector> subsets = biG.get_subsets(is_compatible); - TIME_END(get_subsets); std::vector nnvm_subsets; nnvm_subsets.reserve(subsets.size()); - TIME_START(prepare_subsets); for (auto& subset : subsets) { if (subset.size() > 1) { NodeRawPtrSet node_set; @@ -499,7 +473,6 @@ std::vector GetCompatibleSubsets(const Graph& g, FCompatible is_c nnvm_subsets.push_back(node_set); } } - TIME_END(prepare_subsets); return nnvm_subsets; } From 277ebc4be6b49927a91692f8f11feb38f81edf6e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Wed, 18 Dec 2019 16:19:19 -0800 Subject: [PATCH 5/5] Small optimization --- src/executor/simple_partition_pass.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/executor/simple_partition_pass.h b/src/executor/simple_partition_pass.h index 814a26e178e6..ea1dcf39b8ba 100644 --- a/src/executor/simple_partition_pass.h +++ b/src/executor/simple_partition_pass.h @@ -114,21 +114,15 @@ class BidirectionalGraph { } for (Node& node : nodes) { if (incomp_set.count(&node) != 0) { - // Check if all your inputs and outputs are incompatible too. - // If so, then your separation set does not matter + // Check if all your inputs are incompatible too. + // If so, then your separation set does not matter, + // because it will covered by the sets of your inputs bool inside_node = true; for (Node* input : node.inputs) { if (incomp_set.count(input) == 0) { inside_node = false; } } - if (inside_node) { - for (Node* output : node.outputs) { - if (incomp_set.count(output) == 0) { - inside_node = false; - } - } - } if (!inside_node) { std::unordered_set in_graph; std::unordered_set out_graph;