From f50495bf7219d681be7072138a1ad818681b2f5a Mon Sep 17 00:00:00 2001 From: electriclilies Date: Fri, 3 Sep 2021 11:27:44 -0700 Subject: [PATCH 01/12] Clean up LowerTEPass Add attrs to IRModule equal and hash Make LowerTEPass opt_level 0 Copy IRModule attrs to per_target_modules Add ShallowCopy to IRmodule --- include/tvm/ir/module.h | 20 ++++++- src/ir/module.cc | 11 +++- src/relay/backend/aot_executor_codegen.cc | 10 ++-- src/relay/backend/graph_executor_codegen.cc | 13 ++--- src/relay/backend/interpreter.cc | 52 +++++++------------ src/relay/backend/te_compiler.cc | 27 ++-------- src/relay/backend/te_compiler.h | 7 --- src/relay/ir/transform.cc | 4 +- src/relay/transforms/partition_graph.cc | 6 ++- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/type_infer.cc | 17 +++--- .../relay/test_backend_compile_engine.py | 2 + 12 files changed, 84 insertions(+), 87 deletions(-) diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index fefb08f878ef..852c7d0d8a98 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -122,6 +122,7 @@ class IRModuleNode : public Object { v->Visit("global_var_map_", &global_var_map_); v->Visit("global_type_var_map_", &global_type_var_map_); v->Visit("source_map", &source_map); + v->Visit("attrs", &attrs); } TVM_DLL bool SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const; @@ -277,6 +278,12 @@ class IRModuleNode : public Object { */ TVM_DLL void Update(const IRModule& other); + /*! + * \brief Create a shallow copy of this IRModule. + * \returns The shallow copy of the IRModule. + */ + TVM_DLL IRModule ShallowCopy(); + /*! * \brief Import Relay code from the file at path. * \param path The path of the Relay code to import. @@ -348,12 +355,14 @@ class IRModule : public ObjectRef { * \brief constructor * \param functions Functions in the module. * \param type_definitions Type definitions in the module. - * \param import_set Set of imported files in the module + * \param import_set Set of imported files in the module. * \param map The module source map. + * \param attrs The module attributes. */ TVM_DLL explicit IRModule(Map functions, Map type_definitions = {}, - std::unordered_set import_set = {}, parser::SourceMap map = {}); + std::unordered_set import_set = {}, parser::SourceMap map = {}, + DictAttrs attrs = {}); /*! \brief default constructor */ IRModule() : IRModule(Map({})) {} @@ -415,6 +424,13 @@ class IRModule : public ObjectRef { */ TVM_DLL static IRModule FromText(const String& text, const String& source_path); + /*! + * \brief Create a shallow copy of an IRModule. + * \param mod The module to copy. + * \return The copied module. + */ + IRModule ShallowCopyIRModule(IRModule mod); + /*! \brief Declare the container type. 
*/ using ContainerType = IRModuleNode; diff --git a/src/ir/module.cc b/src/ir/module.cc index d4129c84ccf5..15c441d61a23 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -43,7 +43,8 @@ namespace tvm { IRModule::IRModule(tvm::Map functions, tvm::Map type_definitions, - std::unordered_set import_set, parser::SourceMap source_map) { + std::unordered_set import_set, parser::SourceMap source_map, + DictAttrs attrs) { auto n = make_object(); n->functions = std::move(functions); n->type_definitions = std::move(type_definitions); @@ -52,6 +53,7 @@ IRModule::IRModule(tvm::Map functions, n->constructor_tag_map_ = {}; n->import_set_ = std::move(import_set); n->source_map = source_map; + n->attrs = std::move(attrs); for (const auto& kv : n->functions) { // set global var map @@ -72,6 +74,7 @@ IRModule::IRModule(tvm::Map functions, bool IRModuleNode::SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const { if (functions.size() != other->functions.size()) return false; + if (!equal(this->attrs, other->attrs)) return false; for (const auto& kv : this->functions) { if (!other->ContainGlobalVar(kv.first->name_hint)) return false; if (!equal(kv.second, other->Lookup(kv.first->name_hint))) return false; @@ -112,6 +115,7 @@ void IRModuleNode::SHashReduce(SHashReducer hash_reduce) const { temp.emplace_back(kv.first->name_hint, kv.second); } reduce_temp(); + hash_reduce(this->attrs); } bool IRModuleNode::ContainGlobalVar(const String& name) const { @@ -361,6 +365,11 @@ void IRModuleNode::Update(const IRModule& mod) { } } +IRModule IRModuleNode::ShallowCopy() { + return IRModule(this->functions, this->type_definitions, this->Imports(), this->source_map, + this->attrs); +} + std::pair IRModule::FromExprInContext( const RelayExpr& expr, const tvm::Map& global_funcs, const tvm::Map& type_definitions, diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 0c094cb1fa2c..e2fd502fc632 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -598,8 +598,12 @@ class AOTExecutorCodegen : public MixedModeVisitor { tec::UpdateFunctionMetadata(func, this->function_metadata_); })(mod); - auto lowered_main = lowered_mod->Lookup("main"); - auto lowered_main_func = GetRef(lowered_main.as()); + Optional main_func_info = + lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + + Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); // Post-lowering storage map for writing main func - this should be the same map as previously // created, just referencing the new expressions created from lowering @@ -676,7 +680,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { Optional> external_modules = lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point."; + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; // This is the point where we separate the functions in the module by target ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index b7b388431ca1..aca95db34c4e 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -241,26 +241,23 @@ class GraphExecutorCodegen : 
public backend::MemoizedExprTranslator(main_module->Lookup("main")); + Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); // Now that we have lowered all operators to TIR code, we can proceed with compilation. // // We need to unfortunately re-plan as the previous results have been invalidated by lowering // we will fix this in future refactors. - memory_plan_ = GraphPlanMemory(main_func); + memory_plan_ = GraphPlanMemory(lowered_main_func); // The graph planner also can not handle planning calls to global variables to we must remap // First we convert all the parameters into input nodes. - for (auto param : main_func->params) { + for (auto param : lowered_main_func->params) { auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs()); var_map_[param.get()] = AddNode(node_ptr, param); } - heads_ = VisitExpr(main_func->body); + heads_ = VisitExpr(lowered_main_func->body); std::ostringstream os; dmlc::JSONWriter writer(&os); @@ -277,7 +274,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator> external_modules = lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point."; + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; // This is the point where we separate the functions in the module by target ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 82455bdf925c..df14b9e078b6 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -292,14 +292,8 @@ InterpreterState::InterpreterState(Expr current_expr, InterpreterState::Stack st class Interpreter : public ExprFunctor, PatternFunctor { public: - // TODO(mbs, electriclilies): Collapse mod and per_target_module once IRModule subsumes - // LoweredModule. - Interpreter(IRModule mod, Map per_target_module, Device device, Target target) - : mod_(mod), - per_target_module_(per_target_module), - device_(device), - target_(target), - debug_op_(Op::Get("debug")) {} + Interpreter(IRModule unified_mod, Device device, Target target) + : unified_mod_(unified_mod), device_(device), target_(target), debug_op_(Op::Get("debug")) {} template T WithFrame(const Frame& fr, const std::function& f) { @@ -316,7 +310,7 @@ class Interpreter : public ExprFunctor, ObjectRef VisitExpr_(const VarNode* var_node) final { return Lookup(GetRef(var_node)); } ObjectRef VisitExpr_(const GlobalVarNode* op) final { - return Eval(mod_->Lookup(GetRef(op))); + return Eval(unified_mod_->Lookup(GetRef(op))); } ObjectRef VisitExpr_(const OpNode* id) override { @@ -387,9 +381,9 @@ class Interpreter : public ExprFunctor, // Project out just the function(s) we need. IRModule lowered_projected_mod; + Map per_target_module = tec::GetPerTargetModules(unified_mod_); std::unordered_map - per_target_module_std_map = - backend::TargetModuleMapToTargetStrModuleMap(per_target_module_); + per_target_module_std_map = backend::TargetModuleMapToTargetStrModuleMap(per_target_module); auto mod_itr = per_target_module_std_map.find(target); ICHECK(mod_itr != per_target_module_std_map.end()) << "No target module for target '" << target->str() << "'"; @@ -876,13 +870,11 @@ class Interpreter : public ExprFunctor, } private: - // Main module. All expressions are eval'ed w.r.t. the definitions in this module. This module - // may contain calls to TIR functions bound in a per_target_module_ below. 
- IRModule mod_; - // Map from target key to lowered TIR functions derived from mod_. - // Note that primitives are implicitly executed on target_, while shape functions are implicitly - // executed on the default 'cpu' host. Thus this map has at most two entries. - Map per_target_module_; + // Unified module. Functions are annotated with their target. + // All expressions are eval'ed w.r.t. the definitions in this module. + // This module contains functions that used to be in main_module and the per_target_module (TIR + // functions) in one module. + IRModule unified_mod_; // Cached packed functions for the primitives and shape functions, keyed by target and // global var name. std::unordered_map, PackedFunc, PairHash> compiled_packed_funcs_; @@ -902,7 +894,7 @@ class Interpreter : public ExprFunctor, * rewritten \p mod and target-specific modules containing bindings for all TIR primitive * functions needed by the rewritten module. */ -std::pair> Prepare(IRModule mod, Device device, Target target) { +IRModule Prepare(IRModule mod, Device device, Target target) { // Things to initialize to pass into tec::LowerTEPass // We only have one device-specific target. tec::TargetMap targets = {{device.device_type, target}}; @@ -930,8 +922,7 @@ std::pair> Prepare(IRModule mod, Device device, With ctx(pass_ctx); mod = seq(mod); - // Lower all primitive functions reachable from expr. - return {tec::GetMainModule(mod), tec::GetPerTargetModules(mod)}; + return mod; } /*! \brief Check if an expression could be changed by \p Prepare. @@ -1020,11 +1011,9 @@ TypedPackedFunc)> EvalFunction(IRModule mod, Expr expr, De // and can just eval it directly. expr_to_eval = expr; } - std::pair> main_and_lowered = - Prepare(mod_with_expr, device, target); - std::shared_ptr intrp = std::make_shared( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); + IRModule lowered_mod = Prepare(mod_with_expr, device, target); + + std::shared_ptr intrp = std::make_shared(lowered_mod, device, target); // // Step 2: Evaluate target function to a closure. @@ -1063,12 +1052,11 @@ ObjectRef Eval(Expr expr, Map type_definitions, std::unordered_set import_set, Device device, Target target) { std::pair mod_and_global = IRModule::FromExprInContext(expr, /*global_funcs=*/{}, type_definitions, import_set); - std::pair> main_and_lowered = - Prepare(mod_and_global.first, device, target); - Interpreter intrp( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); - Expr expr_to_eval = main_and_lowered.first->GetGlobalVar(mod_and_global.second->name_hint); + + IRModule mod = Prepare(mod_and_global.first, device, target); + + Interpreter intrp(mod, device, target); + Expr expr_to_eval = mod->GetGlobalVar(mod_and_global.second->name_hint); if (expr.as() == nullptr) { // TODO(mbs): IRModule::FromExpr will implicitly close over the free vars of expr // unless it is a function, so we must reverse that in the expression to eval. 
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 4d7f50b4f3a0..0393fdfec70d 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -900,8 +900,9 @@ Map GetPerTargetModules(IRModule mod) { // Put the function in per_target_modules if (!per_target_modules.count(target.value())) { - // Initialize the IRModule for this target and add the function - IRModule target_module; + // Initialize the IRModule for this target with the attributes from the input IRModule + IRModule target_module = IRModule({}, {}, {}, {}, mod->attrs); + // Add the function to the IRModule target_module->Add(var, func); per_target_modules[target.value()] = target_module; } else { @@ -918,23 +919,6 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -IRModule GetMainModule(IRModule mod) { - IRModule main_module; - // Copy the type defs - for (const auto& kv : mod->type_definitions) { - main_module->AddTypeDef(kv.first, kv.second); - } - // Copy all Relay functions (we don't include PrimFuncs) - for (auto kv : mod->functions) { - const GlobalVar& var = kv.first; - const BaseFunc& func = kv.second; - if (func->IsInstance()) { - main_module->Add(var, func); - } - } - return main_module; -} - Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, backend::StaticMemoryPlan memory_plan, const String& module_name, std::function process_fn) { @@ -942,9 +926,8 @@ Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, PassContext ctx) { return LowerTE(module, targets, device_context_map, memory_plan, module_name, process_fn); }; - // TODO(@electriclilies, mbs): Fold InferType() pass into LowerTEPass since it will always need to - // be called afterwards - return tvm::transform::CreateModulePass(pass_func, 1, "LowerTE", {}); + return tvm::transform::Sequential( + {tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), InferType()}); } } // namespace tec } // namespace relay diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 082cd8c4491a..9d0eb1078ee0 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -165,13 +165,6 @@ Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets); */ Map GetPerTargetModules(IRModule mod); -/*! - * \brief Utility to extract all the Relay functions from an IRModule, with no PrimFuncs. - * \param mod The IRModule to extract the Relay functions from - * \return An IRModule containing only the Relay functions that are in the input mod (no PrimFuncs) - */ -IRModule GetMainModule(IRModule mod); - /*! \brief Lower an IRModule's primitive functions to TIR. * * This is the "back half" of the Relay compiler which lowers "primitive functions" diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc index 4a7974cae5ae..344d1cae7823 100644 --- a/src/relay/ir/transform.cc +++ b/src/relay/ir/transform.cc @@ -133,9 +133,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx) DLOG(INFO) << "Executing function pass : " << pass_info->name << " with opt level: " << pass_info->opt_level; - // Execute the pass function and return a new module. 
- IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); std::vector > updates; for (const auto& it : updated_mod->functions) { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index b48fbe44bd11..019659b3166e 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -30,6 +30,7 @@ */ #include +#include #include #include #include @@ -509,7 +510,10 @@ class NameMangleExtFuncs : public MixedModeMutator { // Walk the tree and mangle the functions. Then replace compiler functions // with mangled functions in the module - IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports()); + IRModule new_module = module_->ShallowCopy(); + new_module->functions = {}; + // IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports(), + // module_->source_map, module_->attrs); for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index d03fc1488aea..04158aa02f64 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -52,7 +52,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { DLOG(INFO) << "ToBBlock:" << std::endl << mod; // Create a new module by shallow copy. - auto mod_ = IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + auto mod_ = mod->ShallowCopy(); tvm::Map updates; auto funcs = mod_->functions; diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index f29087dcc049..6c2371716b16 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -205,13 +205,17 @@ class TypeInferencer : private ExprFunctor, this->EmitFatal(Diagnostic::Error(op->span) << "Cannot do type inference on global variables " << "without a module"); } - if (mod_->ContainGlobalVar(var->name_hint)) { - relay::Function e = Downcast(mod_->Lookup(var)); - return e->checked_type(); - } else { - return op->checked_type_; + BaseFunc func = mod_->Lookup(var->name_hint); + + if (func->IsInstance()) { + relay::Function relay_func = Downcast(func); + return relay_func->checked_type(); + } } + // Return op->checked_type if the module doesn't contain the GlobalVar or the function is a + // PrimFunc (we don't typecheck PrimFuncs) + return op->checked_type_; } Type VisitExpr_(const ConstantNode* op) final { return op->tensor_type(); } @@ -822,8 +826,7 @@ Pass InferType() { [=](IRModule mod, const PassContext& pass_ctx) { DLOG(INFO) << "tvm::relay::transform::InferType"; // Execute the pass function and return a new module. - IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); pass_ctx->diag_ctx = DiagnosticContext::Default(updated_mod); diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index b90bce548a5e..092cae01f568 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -194,6 +194,8 @@ def get_func(shape): engine.dump() +# Note: Once compile engine is removed, we should keep this test so that +# we make sure that opt_level=0 passes are being called correctly. 
def test_compile_placeholder_bypass(): engine = relay.backend.compile_engine.get() x = relay.var("x", shape=(2, 3)) From 6d00a9c4729f945837ae521c087d4f4d3a8f4025 Mon Sep 17 00:00:00 2001 From: electriclilies Date: Tue, 7 Sep 2021 15:53:52 -0700 Subject: [PATCH 02/12] Fix rebase --- src/relay/backend/aot_executor_codegen.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index e2fd502fc632..70779ac58abf 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -598,12 +598,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { tec::UpdateFunctionMetadata(func, this->function_metadata_); })(mod); - Optional main_func_info = - lowered_mod->GetAttr("main_func_info"); - ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); - - Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); + auto lowered_main = lowered_mod->Lookup("main"); + auto lowered_main_func = GetRef(lowered_main.as()); // Post-lowering storage map for writing main func - this should be the same map as previously // created, just referencing the new expressions created from lowering From 3d2ba1ba7602fb97995e7ae46b187edb0a3a10be Mon Sep 17 00:00:00 2001 From: electriclilies Date: Tue, 7 Sep 2021 22:57:13 -0700 Subject: [PATCH 03/12] Remove comment --- src/relay/transforms/partition_graph.cc | 3 +-- src/relay/transforms/to_basic_block_normal_form.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 019659b3166e..f74cf983ccae 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -512,8 +512,7 @@ class NameMangleExtFuncs : public MixedModeMutator { // with mangled functions in the module IRModule new_module = module_->ShallowCopy(); new_module->functions = {}; - // IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports(), - // module_->source_map, module_->attrs); + for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 04158aa02f64..8e952d60b8b7 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -52,7 +52,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { DLOG(INFO) << "ToBBlock:" << std::endl << mod; // Create a new module by shallow copy. 
- auto mod_ = mod->ShallowCopy(); + IRModule mod_ = mod->ShallowCopy(); tvm::Map updates; auto funcs = mod_->functions; From ee52039cdc81a229477985ec2039f7a2b71d404f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 8 Sep 2021 16:29:54 +0300 Subject: [PATCH 04/12] [TEC] Remove memory plan from LowerTEPass --- src/relay/backend/aot_executor_codegen.cc | 172 ++++++++++++++++++- src/relay/backend/graph_executor_codegen.cc | 2 +- src/relay/backend/interpreter.cc | 21 ++- src/relay/backend/te_compiler.cc | 178 +------------------- src/relay/backend/te_compiler.h | 4 +- 5 files changed, 190 insertions(+), 187 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 70779ac58abf..a928a76c6df1 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -45,6 +45,13 @@ namespace tvm { namespace relay { namespace backend { +struct EnumClassHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + using IntegerArray = Array; using StorageMap = std::unordered_map; @@ -277,6 +284,132 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } + /*! + * \brief Update the "main" control function's metadata + * + * \param mod The module + * \param targets Map of targets + * \return function_infos Function info for each function in the module + */ + + backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map) { + CHECK_EQ(mod->functions.size(), 1) + << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; + Function func = Downcast(mod->Lookup("main")); + + // This is a Map> + std::unordered_map, EnumClassHash> sid_workspace; + // This is a Map + std::unordered_map device_io; + // This is a Map + std::unordered_map device_consts; + + // Initialize the mapping from all storage identifiers to workspace sizes, + // the amount of device io, and the device constants. + for (const auto& kv : storage_info_map) { + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + CHECK_EQ(storage_ids.size(), devices.size()); + for (uint32_t i = 0; i < devices.size(); i++) { + sid_workspace[devices[i]][storage_ids[i]] = 0; + device_io[devices[i]] = 0; + device_consts[devices[i]] = 0; + } + } + + // Iterate the storage map to compute all the tensor sizes in the program. + // There are 3 cases in this code: + // + // First we need to compute the sizes of all + // inline constants. + // + // Second we compute the size of any bound variable as these are input and output + // sizes of the program. + // + // Finally for all other expressions we check which storage identifier they have + // been assigned and we compute the maximal size of the storage, as tensors can + // share storage with other tensors which are the same size or larger. + // + // In this final case there is only one allocation for all tensors which share storage + // which will be the maximal size of all tensors which were assigned to it. 
+ for (const auto& kv : storage_info_map) { + Expr expr = kv.first; + int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + if (expr->IsInstance()) { + for (const auto& dev : devices) { + device_consts[dev] += size_bytes; + } + continue; + } else if (expr->IsInstance() || expr.same_as(func->body)) { + CHECK_GE(devices.size(), 1) << "must be at least one device"; + for (const auto& dev : devices) { + device_io[dev] += size_bytes; + } + continue; + } + + // TODO(@electriclilies): This code is never being called which means sid_workspace is not + // updated.. This means that storage info is probably not being created correctly. Or is not + // equivalent to what was here previously + for (uint32_t i = 0; i < storage_ids.size(); i++) { + // Here we record the largest size of the tensor + // that share the same storage id, because storage_id will + // be shared between multiple tensors that are not live simultaneously. + if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { + sid_workspace[devices[i]][storage_ids[i]] = size_bytes; + } + } + } + + // This is a Map + std::unordered_map device_workspace; + // Once we know the sizes of sids, we need to accumulate per device + for (const auto& dev_sid_size : sid_workspace) { + auto dev = dev_sid_size.first; + device_workspace[dev] = 0; + for (const auto& sid_size : dev_sid_size.second) { + device_workspace[dev] += sid_size.second; + } + } + + Map workspace_sizes; + Map io_sizes; + Map constant_sizes; + Map tir_primfuncs; + Map relay_primfuncs; + + // Initialize all target workspaces to zero + for (const auto& kv : targets) { + auto tgt = kv.second; + workspace_sizes.Set(tgt, 0); + } + + for (const auto& dev_and_size : device_workspace) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + workspace_sizes.Set(tgt, dev_and_size.second); + relay_primfuncs.Set(tgt, func); + } + for (const auto& dev_and_size : device_io) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + io_sizes.Set(tgt, dev_and_size.second); + } + + for (const auto& dev_and_size : device_consts) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + constant_sizes.Set(tgt, dev_and_size.second); + } + + return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, + relay_primfuncs); + } + /*! * brief Call a function with a given name */ @@ -314,6 +447,36 @@ class AOTExecutorCodegen : public MixedModeVisitor { stmts_.push_back(body); } + Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { + if (targets.size() == 1) { + // The homogeneous execution case, return the only target. + const auto& it = targets.begin(); + return (*it).second; + } else { + // The heterogeneous execution case, return the target associated with the + // given device type. + // If "dev_type" equals to 0, the device name only can be got from + // "targets", and it may not be "llvm", so here just set it to "unknown". 
+ std::string dev_name = "unknown"; + if (dev_type != 0) { + dev_name = runtime::DeviceName(dev_type); + } + + if (targets.count(dev_type) == 0) { + std::stringstream msg; + msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" + << dev_name << " mapped to device type (" << dev_type + << ") which was not found in the target map.\n" + << "Availible targets: \n"; + for (auto target : targets) { + msg << " " << target.first << "-> " << target.second << "\n"; + } + LOG(FATAL) << msg.str(); + } + return targets[dev_type]; + } + } + /*! * brief Copy a variable to the output. This function is mainly used in edge cases * when we want to return an input or a parameter. @@ -583,8 +746,15 @@ class AOTExecutorCodegen : public MixedModeVisitor { // performing the preexisting AOT executor code generation phase. IRModule mod = IRModule::FromExpr(func); + backend::FunctionInfo func_info; + + if (memory_plan.defined()) { + // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize + func_info = UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + } + IRModule lowered_mod = - LowerTEPass(targets_, device_context_map, memory_plan, mod_name, [this](Function func) { + LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index aca95db34c4e..3e7c952589d5 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -222,7 +222,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator ctx(pass_ctx); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 0393fdfec70d..5807ade08479 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -596,8 +596,7 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; -Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, +Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, TECompiler compiler, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](Function func, IRModule module, PassContext ctx) { @@ -608,162 +607,6 @@ Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); } -Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets) { - if (targets.size() == 1) { - // The homogeneous execution case, return the only target. - const auto& it = targets.begin(); - return (*it).second; - } else { - // The heterogeneous execution case, return the target associated with the - // given device type. - // If "dev_type" equals to 0, the device name only can be got from - // "targets", and it may not be "llvm", so here just set it to "unknown". 
- std::string dev_name = "unknown"; - if (dev_type != 0) { - dev_name = runtime::DeviceName(dev_type); - } - - if (targets.count(dev_type) == 0) { - std::stringstream msg; - msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" - << dev_name << " mapped to device type (" << dev_type - << ") which was not found in the target map.\n" - << "Availible targets: \n"; - for (auto target : targets) { - msg << " " << target.first << "-> " << target.second << "\n"; - } - LOG(FATAL) << msg.str(); - } - return targets[dev_type]; - } -} - -/*! - * \brief Update the "main" control function's metadata - * - * \param mod The module - * \param targets Map of targets - * \return function_infos Function info for each function in the module - */ - -backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap targets, - Map storage_info_map) { - CHECK_EQ(mod->functions.size(), 1) - << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; - Function func = Downcast(mod->Lookup("main")); - - // This is a Map> - std::unordered_map, EnumClassHash> sid_workspace; - // This is a Map - std::unordered_map device_io; - // This is a Map - std::unordered_map device_consts; - - // Initialize the mapping from all storage identifiers to workspace sizes, - // the amount of device io, and the device constants. - for (const auto& kv : storage_info_map) { - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - CHECK_EQ(storage_ids.size(), devices.size()); - for (uint32_t i = 0; i < devices.size(); i++) { - sid_workspace[devices[i]][storage_ids[i]] = 0; - device_io[devices[i]] = 0; - device_consts[devices[i]] = 0; - } - } - - // Iterate the storage map to compute all the tensor sizes in the program. - // There are 3 cases in this code: - // - // First we need to compute the sizes of all - // inline constants. - // - // Second we compute the size of any bound variable as these are input and output - // sizes of the program. - // - // Finally for all other expressions we check which storage identifier they have - // been assigned and we compute the maximal size of the storage, as tensors can - // share storage with other tensors which are the same size or larger. - // - // In this final case there is only one allocation for all tensors which share storage - // which will be the maximal size of all tensors which were assigned to it. - for (const auto& kv : storage_info_map) { - Expr expr = kv.first; - int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - if (expr->IsInstance()) { - for (const auto& dev : devices) { - device_consts[dev] += size_bytes; - } - continue; - } else if (expr->IsInstance() || expr.same_as(func->body)) { - CHECK_GE(devices.size(), 1) << "must be at least one device"; - for (const auto& dev : devices) { - device_io[dev] += size_bytes; - } - continue; - } - - // TODO(@electriclilies): This code is never being called which means sid_workspace is not - // updated.. This means that storage info is probably not being created correctly. 
Or is not - // equivalent to what was here previously - for (uint32_t i = 0; i < storage_ids.size(); i++) { - // Here we record the largest size of the tensor - // that share the same storage id, because storage_id will - // be shared between multiple tensors that are not live simultaneously. - if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { - sid_workspace[devices[i]][storage_ids[i]] = size_bytes; - } - } - } - - // This is a Map - std::unordered_map device_workspace; - // Once we know the sizes of sids, we need to accumulate per device - for (const auto& dev_sid_size : sid_workspace) { - auto dev = dev_sid_size.first; - device_workspace[dev] = 0; - for (const auto& sid_size : dev_sid_size.second) { - device_workspace[dev] += sid_size.second; - } - } - - Map workspace_sizes; - Map io_sizes; - Map constant_sizes; - Map tir_primfuncs; - Map relay_primfuncs; - - // Initialize all target workspaces to zero - for (const auto& kv : targets) { - auto tgt = kv.second; - workspace_sizes.Set(tgt, 0); - } - - for (const auto& dev_and_size : device_workspace) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - workspace_sizes.Set(tgt, dev_and_size.second); - relay_primfuncs.Set(tgt, func); - } - for (const auto& dev_and_size : device_io) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - io_sizes.Set(tgt, dev_and_size.second); - } - - for (const auto& dev_and_size : device_consts) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - constant_sizes.Set(tgt, dev_and_size.second); - } - - return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, - relay_primfuncs); -} - /*! * \brief A function to create the function metadata for an input function (ie calculate buffer * input/output sizes) @@ -844,20 +687,13 @@ void UpdateFunctionMetadata(Function relay_func, } IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn) { + const String& module_name, std::function process_fn) { DLOG(INFO) << "lowering module:\n" << PrettyPrint(module); TECompiler compiler; - backend::FunctionInfo func_info; - if (memory_plan.defined()) { - // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize - func_info = UpdateMainWorkspaceSize(module, targets, memory_plan->expr_to_storage_info); - } - - auto updated_module = LowerTensorExpr(targets, device_context_map, memory_plan, module_name, - compiler, process_fn)(module); + auto updated_module = + LowerTensorExpr(targets, device_context_map, module_name, compiler, process_fn)(module); // A temporary solution until we can rewrite the auto-scheduler task extraction code to work // in a more reasonable way. 
@@ -876,6 +712,7 @@ IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_con (*te_compiler_update_weights)(weight_map); } + backend::FunctionInfo func_info; // Copy the lowered functions into the return module updated_module->Update(compiler->GetLoweredFunctions()); @@ -919,12 +756,11 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, +Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, const String& module_name, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](IRModule module, PassContext ctx) { - return LowerTE(module, targets, device_context_map, memory_plan, module_name, process_fn); + return LowerTE(module, targets, device_context_map, module_name, process_fn); }; return tvm::transform::Sequential( {tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), InferType()}); diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 9d0eb1078ee0..c337ec784a3d 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -192,15 +192,13 @@ IRModule LowerTE( * * \param targets The mapping for devices to targets. * \param device_context_map An analysis result mapping each sub-expression to a device. - * \param memory_plan The memory plan used during lowering * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower * \returns The pass which lowers primative functions to TIR */ transform::Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn); + const String& module_name, std::function process_fn); } // namespace tec } // namespace relay } // namespace tvm From 8a0a3c97098f82440eafaec051f1703b815e4cfa Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 9 Sep 2021 10:41:46 +0300 Subject: [PATCH 05/12] Fix linting errors --- cmake/modules/contrib/EthosU.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/contrib/EthosU.cmake b/cmake/modules/contrib/EthosU.cmake index 8f3e09b8179b..899f2da1c577 100644 --- a/cmake/modules/contrib/EthosU.cmake +++ b/cmake/modules/contrib/EthosU.cmake @@ -18,4 +18,4 @@ if(USE_ETHOSU) file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*) list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC}) -endif(USE_ETHOSU) \ No newline at end of file +endif(USE_ETHOSU) From 97332839b30332dcd9a406a9ad371f3877d013ad Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 11:49:07 +0300 Subject: [PATCH 06/12] Fix PR comments --- src/relay/backend/aot_executor_codegen.cc | 13 +++---------- src/relay/backend/graph_executor_codegen.cc | 2 +- src/relay/backend/te_compiler.h | 13 ++----------- src/relay/backend/utils.h | 11 +++++++++++ 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 8b708a83fa94..40ff9008d3b7 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -38,20 +38,13 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { namespace backend { -struct EnumClassHash { - template - 
std::size_t operator()(T t) const { - return static_cast(t); - } -}; - using IntegerArray = Array; using StorageMap = std::unordered_map; @@ -724,7 +717,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { } IRModule lowered_mod = - LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { + tec::LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 3e7c952589d5..4851bc8a8559 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -222,7 +222,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator - std::size_t operator()(T t) const { - return static_cast(t); - } -}; - // TODO(@jroesch, @chrisS) these should be a tvm::Map for uniformity sake // we should a version of context which works in Map -using TargetMap = std::unordered_map; +using TargetMap = std::unordered_map; using DeviceMap = std::unordered_map; using ProcessFn = std::function; diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index cf8a2dd4b8e0..ae8d7d2c2360 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -146,6 +146,17 @@ struct LoweredOutput { runtime::Metadata metadata; }; +/*! + * \brief This class is needed to avoid a GCC 5 bug that prevents maps containing enums from being + compiled. If i386 GCC version is increased, we can remove it. + */ +struct EnumClassHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + /*! * \brief A helper to expand the params by adding the ones used in a given expression. 
*/ From 0a1beb6cec057fb4be125858596f3770b950b51f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 13:29:14 +0300 Subject: [PATCH 07/12] Remove updated module with function info from LowerTe --- src/relay/backend/te_compiler.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 0d1b7cef0220..f5576e623df5 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -742,14 +742,12 @@ IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_con (*te_compiler_update_weights)(weight_map); } - backend::FunctionInfo func_info; // Copy the lowered functions into the return module updated_module->Update(compiler->GetLoweredFunctions()); // Annotate the module with the external modules and function info updated_module = WithAttr(updated_module, "external_mods", compiler->LowerExternalFunctions()); - updated_module = WithAttr(updated_module, "main_func_info", func_info); return updated_module; } From 03a208cf886b6ded83926016e96327dd8688218f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 14:08:04 +0300 Subject: [PATCH 08/12] Refactor UpdateMainWorkspaceSize to update func info independently from LowerTEPass --- src/relay/backend/aot_executor_codegen.cc | 128 +------------------- src/relay/backend/graph_executor_codegen.cc | 15 ++- src/relay/backend/te_compiler.cc | 125 ++++++++++++++++++- src/relay/backend/te_compiler.h | 10 ++ 4 files changed, 146 insertions(+), 132 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 40ff9008d3b7..898731e6a54d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -277,132 +277,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } - /*! - * \brief Update the "main" control function's metadata - * - * \param mod The module - * \param targets Map of targets - * \return function_infos Function info for each function in the module - */ - - backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, - Map storage_info_map) { - CHECK_EQ(mod->functions.size(), 1) - << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; - Function func = Downcast(mod->Lookup("main")); - - // This is a Map> - std::unordered_map, EnumClassHash> sid_workspace; - // This is a Map - std::unordered_map device_io; - // This is a Map - std::unordered_map device_consts; - - // Initialize the mapping from all storage identifiers to workspace sizes, - // the amount of device io, and the device constants. - for (const auto& kv : storage_info_map) { - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - CHECK_EQ(storage_ids.size(), devices.size()); - for (uint32_t i = 0; i < devices.size(); i++) { - sid_workspace[devices[i]][storage_ids[i]] = 0; - device_io[devices[i]] = 0; - device_consts[devices[i]] = 0; - } - } - - // Iterate the storage map to compute all the tensor sizes in the program. - // There are 3 cases in this code: - // - // First we need to compute the sizes of all - // inline constants. - // - // Second we compute the size of any bound variable as these are input and output - // sizes of the program. 
- // - // Finally for all other expressions we check which storage identifier they have - // been assigned and we compute the maximal size of the storage, as tensors can - // share storage with other tensors which are the same size or larger. - // - // In this final case there is only one allocation for all tensors which share storage - // which will be the maximal size of all tensors which were assigned to it. - for (const auto& kv : storage_info_map) { - Expr expr = kv.first; - int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - if (expr->IsInstance()) { - for (const auto& dev : devices) { - device_consts[dev] += size_bytes; - } - continue; - } else if (expr->IsInstance() || expr.same_as(func->body)) { - CHECK_GE(devices.size(), 1) << "must be at least one device"; - for (const auto& dev : devices) { - device_io[dev] += size_bytes; - } - continue; - } - - // TODO(@electriclilies): This code is never being called which means sid_workspace is not - // updated.. This means that storage info is probably not being created correctly. Or is not - // equivalent to what was here previously - for (uint32_t i = 0; i < storage_ids.size(); i++) { - // Here we record the largest size of the tensor - // that share the same storage id, because storage_id will - // be shared between multiple tensors that are not live simultaneously. - if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { - sid_workspace[devices[i]][storage_ids[i]] = size_bytes; - } - } - } - - // This is a Map - std::unordered_map device_workspace; - // Once we know the sizes of sids, we need to accumulate per device - for (const auto& dev_sid_size : sid_workspace) { - auto dev = dev_sid_size.first; - device_workspace[dev] = 0; - for (const auto& sid_size : dev_sid_size.second) { - device_workspace[dev] += sid_size.second; - } - } - - Map workspace_sizes; - Map io_sizes; - Map constant_sizes; - Map tir_primfuncs; - Map relay_primfuncs; - - // Initialize all target workspaces to zero - for (const auto& kv : targets) { - auto tgt = kv.second; - workspace_sizes.Set(tgt, 0); - } - - for (const auto& dev_and_size : device_workspace) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - workspace_sizes.Set(tgt, dev_and_size.second); - relay_primfuncs.Set(tgt, func); - } - for (const auto& dev_and_size : device_io) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - io_sizes.Set(tgt, dev_and_size.second); - } - - for (const auto& dev_and_size : device_consts) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - constant_sizes.Set(tgt, dev_and_size.second); - } - - return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, - relay_primfuncs); - } - /*! 
* brief Call a function with a given name */ @@ -713,7 +587,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (memory_plan.defined()) { // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize - func_info = UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + func_info = tec::UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); } IRModule lowered_mod = diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 4851bc8a8559..b3c9746ab75f 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -36,8 +36,8 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -221,6 +221,14 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorexpr_to_storage_info); + } + IRModule lowered_mod = tec::LowerTEPass(targets_, device_context_map, mod_name_, [this](Function func) { // We need to maintain the constant map for external @@ -236,8 +244,11 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorfunction_metadata_); })(mod); + lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); + Optional main_func_info = lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index f5576e623df5..387f27c6fc79 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -17,7 +17,7 @@ * under the License. */ -#include "te_compiler.h" +#include "./te_compiler.h" #include #include @@ -42,8 +42,8 @@ #include #include -#include "te_compiler_cache.h" -#include "utils.h" +#include "./te_compiler_cache.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -596,6 +596,125 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map) { + CHECK_EQ(mod->functions.size(), 1) + << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; + Function func = Downcast(mod->Lookup("main")); + + // This is a Map> + std::unordered_map, backend::EnumClassHash> + sid_workspace; + // This is a Map + std::unordered_map device_io; + // This is a Map + std::unordered_map device_consts; + + // Initialize the mapping from all storage identifiers to workspace sizes, + // the amount of device io, and the device constants. + for (const auto& kv : storage_info_map) { + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + CHECK_EQ(storage_ids.size(), devices.size()); + for (uint32_t i = 0; i < devices.size(); i++) { + sid_workspace[devices[i]][storage_ids[i]] = 0; + device_io[devices[i]] = 0; + device_consts[devices[i]] = 0; + } + } + + // Iterate the storage map to compute all the tensor sizes in the program. + // There are 3 cases in this code: + // + // First we need to compute the sizes of all + // inline constants. + // + // Second we compute the size of any bound variable as these are input and output + // sizes of the program. 
+ // + // Finally for all other expressions we check which storage identifier they have + // been assigned and we compute the maximal size of the storage, as tensors can + // share storage with other tensors which are the same size or larger. + // + // In this final case there is only one allocation for all tensors which share storage + // which will be the maximal size of all tensors which were assigned to it. + for (const auto& kv : storage_info_map) { + Expr expr = kv.first; + int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + if (expr->IsInstance()) { + for (const auto& dev : devices) { + device_consts[dev] += size_bytes; + } + continue; + } else if (expr->IsInstance() || expr.same_as(func->body)) { + CHECK_GE(devices.size(), 1) << "must be at least one device"; + for (const auto& dev : devices) { + device_io[dev] += size_bytes; + } + continue; + } + + // TODO(@electriclilies): This code is never being called which means sid_workspace is not + // updated.. This means that storage info is probably not being created correctly. Or is not + // equivalent to what was here previously + for (uint32_t i = 0; i < storage_ids.size(); i++) { + // Here we record the largest size of the tensor + // that share the same storage id, because storage_id will + // be shared between multiple tensors that are not live simultaneously. + if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { + sid_workspace[devices[i]][storage_ids[i]] = size_bytes; + } + } + } + + // This is a Map + std::unordered_map device_workspace; + // Once we know the sizes of sids, we need to accumulate per device + for (const auto& dev_sid_size : sid_workspace) { + auto dev = dev_sid_size.first; + device_workspace[dev] = 0; + for (const auto& sid_size : dev_sid_size.second) { + device_workspace[dev] += sid_size.second; + } + } + + Map workspace_sizes; + Map io_sizes; + Map constant_sizes; + Map tir_primfuncs; + Map relay_primfuncs; + + // Initialize all target workspaces to zero + for (const auto& kv : targets) { + auto tgt = kv.second; + workspace_sizes.Set(tgt, 0); + } + + for (const auto& dev_and_size : device_workspace) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + workspace_sizes.Set(tgt, dev_and_size.second); + relay_primfuncs.Set(tgt, func); + } + for (const auto& dev_and_size : device_io) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + io_sizes.Set(tgt, dev_and_size.second); + } + + for (const auto& dev_and_size : device_consts) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + constant_sizes.Set(tgt, dev_and_size.second); + } + + return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, + relay_primfuncs); +} + Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { if (targets.size() == 1) { // The homogeneous execution case, return the only target. diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 8b0f2a6e802e..d5135e6301c4 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -149,6 +149,16 @@ void UpdateFunctionMetadata(Function relay_func, */ Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets); +/*! 
+ * \brief Update the "main" control function's metadata + * + * \param mod The module + * \param targets Map of targets + * \return function_infos Function info for each function in the module + */ +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map); + /*! \brief Utility to separate the functions in an IRModule by Target. * * \param mod The IRModule to extract the per target module from From b9654430bed4466dda468cc52b81635694fe3002 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 15:20:45 +0300 Subject: [PATCH 09/12] Fix aot failed tests --- src/relay/backend/aot_executor_codegen.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 898731e6a54d..a7bbcb007348 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -666,8 +666,11 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), workspace_byte_alignment); + lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); + Optional main_func_info = lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); From c4cc22dc3c515028ebc7a3216f4e569db8d2d0fe Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 11:14:36 +0300 Subject: [PATCH 10/12] Revert whitespaces fixes --- .github/ISSUE_TEMPLATE/ci-problem.md | 2 +- .github/ISSUE_TEMPLATE/documentation.md | 1 + 3rdparty/vta-hw | 2 +- cmake/modules/contrib/EthosU.cmake | 2 +- docs/langref/relay_pattern.rst | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/ci-problem.md b/.github/ISSUE_TEMPLATE/ci-problem.md index 5a5977ab5000..f46a42f42cf5 100644 --- a/.github/ISSUE_TEMPLATE/ci-problem.md +++ b/.github/ISSUE_TEMPLATE/ci-problem.md @@ -19,4 +19,4 @@ Provide a link to the specific run that has failed. ### Flakiness -Have you seen this multiple times in this branch or in other branches? +Have you seen this multiple times in this branch or in other branches? \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 0ad9424c1b5d..a1c1facb7e1b 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -19,3 +19,4 @@ Include the title of the document (e.g. "Getting Started with TVM"), and the typ If an RFC/discuss post exists, link it here. Otherwise, specify what actions should be taken to provide additional clarity/readability/reproducibility to the document. Include code snippets from the previous documentation if applicable. 
+ diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index dfe9f572a43d..36a91576edf6 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit dfe9f572a43d41e0c1ecdf036cea97042a0febfe +Subproject commit 36a91576edf633479c78649e050f18dd2ddc8103 diff --git a/cmake/modules/contrib/EthosU.cmake b/cmake/modules/contrib/EthosU.cmake index 899f2da1c577..8f3e09b8179b 100644 --- a/cmake/modules/contrib/EthosU.cmake +++ b/cmake/modules/contrib/EthosU.cmake @@ -18,4 +18,4 @@ if(USE_ETHOSU) file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*) list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC}) -endif(USE_ETHOSU) +endif(USE_ETHOSU) \ No newline at end of file diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst index 16211b2cb125..4682e5aa5b33 100644 --- a/docs/langref/relay_pattern.rst +++ b/docs/langref/relay_pattern.rst @@ -89,7 +89,7 @@ Or a convolution with a specific kernel size: x = relay.var('x') y = relay.var('y') assert is_conv2d.match(relay.op.nn.conv2d(x, y, kernel_size=[3, 3])) - + Matching an Optional Op From ed2c1cc6df49fdcc18e48fed46f1a903a9585a7a Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 11:21:45 +0300 Subject: [PATCH 11/12] Remove obsolete function hoisting and minor cleanups --- src/relay/backend/interpreter.cc | 3 -- src/relay/backend/te_compiler.cc | 82 ++++++++++++++++---------------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index a757d2d29622..d87cf9811bc7 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -902,9 +902,6 @@ IRModule Prepare(IRModule mod, Device device, Target target) { // All calls to primitives will use the unique target. tec::DeviceMap device_map; - // No need for a memory plan. - backend::StaticMemoryPlan memory_plan; /*=nullptr*/ - // Run minimal transforms on module to establish invariants needed by interpreter. transform::Sequential seq({transform::SimplifyInference(), // FuseOps will mark wrapped calls to prim-ops with the 'Primitive' diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 387f27c6fc79..2e7eb6f9aa6b 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -596,6 +596,47 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; +Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { + if (targets.size() == 1) { + // The homogeneous execution case, return the only target. + const auto& it = targets.begin(); + return (*it).second; + } else { + // The heterogeneous execution case, return the target associated with the + // given device type. + // If "dev_type" equals to 0, the device name only can be got from + // "targets", and it may not be "llvm", so here just set it to "unknown". 
+ std::string dev_name = "unknown"; + if (dev_type != 0) { + dev_name = runtime::DeviceName(dev_type); + } + + if (targets.count(dev_type) == 0) { + std::stringstream msg; + msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" + << dev_name << " mapped to device type (" << dev_type + << ") which was not found in the target map.\n" + << "Availible targets: \n"; + for (auto target : targets) { + msg << " " << target.first << "-> " << target.second << "\n"; + } + LOG(FATAL) << msg.str(); + } + return targets[dev_type]; + } +} + +Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, + TECompiler compiler, std::function process_fn) { + runtime::TypedPackedFunc pass_func = + [=](Function func, IRModule module, PassContext ctx) { + LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn, + module_name, compiler); + return Downcast(lower_te.Mutate(func)); + }; + return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); +} + backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, Map storage_info_map) { CHECK_EQ(mod->functions.size(), 1) @@ -715,47 +756,6 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa relay_primfuncs); } -Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { - if (targets.size() == 1) { - // The homogeneous execution case, return the only target. - const auto& it = targets.begin(); - return (*it).second; - } else { - // The heterogeneous execution case, return the target associated with the - // given device type. - // If "dev_type" equals to 0, the device name only can be got from - // "targets", and it may not be "llvm", so here just set it to "unknown". - std::string dev_name = "unknown"; - if (dev_type != 0) { - dev_name = runtime::DeviceName(dev_type); - } - - if (targets.count(dev_type) == 0) { - std::stringstream msg; - msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" - << dev_name << " mapped to device type (" << dev_type - << ") which was not found in the target map.\n" - << "Availible targets: \n"; - for (auto target : targets) { - msg << " " << target.first << "-> " << target.second << "\n"; - } - LOG(FATAL) << msg.str(); - } - return targets[dev_type]; - } -} - -Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, - TECompiler compiler, std::function process_fn) { - runtime::TypedPackedFunc pass_func = - [=](Function func, IRModule module, PassContext ctx) { - LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn, - module_name, compiler); - return Downcast(lower_te.Mutate(func)); - }; - return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); -} - /*! 
* \brief A function to create the function metadata for an input function (ie calculate buffer * input/output sizes) From ce9fbdf806a17ae1bfd1c0a5181225ddbf061e0d Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 12:39:22 +0300 Subject: [PATCH 12/12] Address PR comments --- .../reference-vm/arduino/base-box/base_box_provision.sh | 2 +- docker/install/ubuntu_install_vitis_ai_packages_ci.sh | 4 ++-- src/relay/backend/aot_executor_codegen.cc | 4 +--- src/relay/backend/graph_executor_codegen.cc | 4 +--- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh index d55100693d5c..11d89f2cd44e 100644 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh +++ b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -# Using this script we can reuse docker/install scripts to configure the reference +# Using this script we can reuse docker/install scripts to configure the reference # virtual machine similar to CI QEMU setup. # diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh index f6f270a07a83..25c214068cd0 100644 --- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh +++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a7bbcb007348..ad9ba1b2069d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -588,6 +588,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (memory_plan.defined()) { // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize func_info = tec::UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); } IRModule lowered_mod = @@ -666,12 +667,9 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), workspace_byte_alignment); - lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); - Optional main_func_info = lowered_mod->GetAttr("main_func_info"); - ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index b3c9746ab75f..92e7568d9f38 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -227,6 +227,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorexpr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); } IRModule lowered_mod = @@ -244,12 +245,9 @@ class GraphExecutorCodegen : 
public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>>
                                            tec::UpdateFunctionMetadata(func, this->function_metadata_);
                                          })(mod);
 
-    lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info);
-
     Optional<backend::FunctionInfo> main_func_info =
         lowered_mod->GetAttr<backend::FunctionInfo>("main_func_info");
-    ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point.";
     function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value());
 
     Function lowered_main_func = Downcast<Function>(lowered_mod->Lookup("main"));
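
Note on the "main_func_info" handling in patches 09-12 above: the FunctionInfo for "main" is
attached to the IRModule as a module-level attribute before lowering and read back from the
lowered module by the AOT and graph executor codegens. The fragment below is an illustrative
sketch of that round trip, not part of the series; the function name AttachAndRecoverMainFuncInfo
and the comment placeholders are assumptions, while WithAttr, GetAttr<backend::FunctionInfo> and
backend::FunctionInfo (from src/relay/backend/utils.h) are used the same way as in the diffs above.

    // Illustrative sketch only; assumes it lives next to the code above in
    // src/relay/backend/ so that backend::FunctionInfo is available.
    #include <tvm/ir/attrs.h>
    #include <tvm/ir/module.h>
    #include <tvm/runtime/logging.h>

    #include "./utils.h"  // backend::FunctionInfo

    namespace tvm {
    namespace relay {

    IRModule AttachAndRecoverMainFuncInfo(IRModule mod, backend::FunctionInfo func_info) {
      // Attach the metadata before lowering so it travels with the module as an attribute.
      mod = WithAttr(mod, "main_func_info", func_info);

      // ... LowerTEPass and the other lowering passes would run on `mod` here ...

      // Read the attribute back the way the executor codegens do after lowering.
      Optional<backend::FunctionInfo> main_func_info =
          mod->GetAttr<backend::FunctionInfo>("main_func_info");
      ICHECK(main_func_info) << "\"main_func_info\" should have been attached before lowering";
      return mod;
    }

    }  // namespace relay
    }  // namespace tvm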