From f50495bf7219d681be7072138a1ad818681b2f5a Mon Sep 17 00:00:00 2001 From: electriclilies Date: Fri, 3 Sep 2021 11:27:44 -0700 Subject: [PATCH 01/12] Clean up LowerTEPass Add attrs to IRModule equal and hash Make LowerTEPass opt_level 0 Copy IRModule attrs to per_target_modules Add ShallowCopy to IRmodule --- include/tvm/ir/module.h | 20 ++++++- src/ir/module.cc | 11 +++- src/relay/backend/aot_executor_codegen.cc | 10 ++-- src/relay/backend/graph_executor_codegen.cc | 13 ++--- src/relay/backend/interpreter.cc | 52 +++++++------------ src/relay/backend/te_compiler.cc | 27 ++-------- src/relay/backend/te_compiler.h | 7 --- src/relay/ir/transform.cc | 4 +- src/relay/transforms/partition_graph.cc | 6 ++- .../transforms/to_basic_block_normal_form.cc | 2 +- src/relay/transforms/type_infer.cc | 17 +++--- .../relay/test_backend_compile_engine.py | 2 + 12 files changed, 84 insertions(+), 87 deletions(-) diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index fefb08f878ef..852c7d0d8a98 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -122,6 +122,7 @@ class IRModuleNode : public Object { v->Visit("global_var_map_", &global_var_map_); v->Visit("global_type_var_map_", &global_type_var_map_); v->Visit("source_map", &source_map); + v->Visit("attrs", &attrs); } TVM_DLL bool SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const; @@ -277,6 +278,12 @@ class IRModuleNode : public Object { */ TVM_DLL void Update(const IRModule& other); + /*! + * \brief Create a shallow copy of this IRModule. + * \returns The shallow copy of the IRModule. + */ + TVM_DLL IRModule ShallowCopy(); + /*! * \brief Import Relay code from the file at path. * \param path The path of the Relay code to import. @@ -348,12 +355,14 @@ class IRModule : public ObjectRef { * \brief constructor * \param functions Functions in the module. * \param type_definitions Type definitions in the module. - * \param import_set Set of imported files in the module + * \param import_set Set of imported files in the module. * \param map The module source map. + * \param attrs The module attributes. */ TVM_DLL explicit IRModule(Map functions, Map type_definitions = {}, - std::unordered_set import_set = {}, parser::SourceMap map = {}); + std::unordered_set import_set = {}, parser::SourceMap map = {}, + DictAttrs attrs = {}); /*! \brief default constructor */ IRModule() : IRModule(Map({})) {} @@ -415,6 +424,13 @@ class IRModule : public ObjectRef { */ TVM_DLL static IRModule FromText(const String& text, const String& source_path); + /*! + * \brief Create a shallow copy of an IRModule. + * \param mod The module to copy. + * \return The copied module. + */ + IRModule ShallowCopyIRModule(IRModule mod); + /*! \brief Declare the container type. 
*/ using ContainerType = IRModuleNode; diff --git a/src/ir/module.cc b/src/ir/module.cc index d4129c84ccf5..15c441d61a23 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -43,7 +43,8 @@ namespace tvm { IRModule::IRModule(tvm::Map functions, tvm::Map type_definitions, - std::unordered_set import_set, parser::SourceMap source_map) { + std::unordered_set import_set, parser::SourceMap source_map, + DictAttrs attrs) { auto n = make_object(); n->functions = std::move(functions); n->type_definitions = std::move(type_definitions); @@ -52,6 +53,7 @@ IRModule::IRModule(tvm::Map functions, n->constructor_tag_map_ = {}; n->import_set_ = std::move(import_set); n->source_map = source_map; + n->attrs = std::move(attrs); for (const auto& kv : n->functions) { // set global var map @@ -72,6 +74,7 @@ IRModule::IRModule(tvm::Map functions, bool IRModuleNode::SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const { if (functions.size() != other->functions.size()) return false; + if (!equal(this->attrs, other->attrs)) return false; for (const auto& kv : this->functions) { if (!other->ContainGlobalVar(kv.first->name_hint)) return false; if (!equal(kv.second, other->Lookup(kv.first->name_hint))) return false; @@ -112,6 +115,7 @@ void IRModuleNode::SHashReduce(SHashReducer hash_reduce) const { temp.emplace_back(kv.first->name_hint, kv.second); } reduce_temp(); + hash_reduce(this->attrs); } bool IRModuleNode::ContainGlobalVar(const String& name) const { @@ -361,6 +365,11 @@ void IRModuleNode::Update(const IRModule& mod) { } } +IRModule IRModuleNode::ShallowCopy() { + return IRModule(this->functions, this->type_definitions, this->Imports(), this->source_map, + this->attrs); +} + std::pair IRModule::FromExprInContext( const RelayExpr& expr, const tvm::Map& global_funcs, const tvm::Map& type_definitions, diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 0c094cb1fa2c..e2fd502fc632 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -598,8 +598,12 @@ class AOTExecutorCodegen : public MixedModeVisitor { tec::UpdateFunctionMetadata(func, this->function_metadata_); })(mod); - auto lowered_main = lowered_mod->Lookup("main"); - auto lowered_main_func = GetRef(lowered_main.as()); + Optional main_func_info = + lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; + function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); + + Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); // Post-lowering storage map for writing main func - this should be the same map as previously // created, just referencing the new expressions created from lowering @@ -676,7 +680,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { Optional> external_modules = lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point."; + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; // This is the point where we separate the functions in the module by target ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index b7b388431ca1..aca95db34c4e 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -241,26 +241,23 @@ class GraphExecutorCodegen : 
public backend::MemoizedExprTranslator(main_module->Lookup("main")); + Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); // Now that we have lowered all operators to TIR code, we can proceed with compilation. // // We need to unfortunately re-plan as the previous results have been invalidated by lowering // we will fix this in future refactors. - memory_plan_ = GraphPlanMemory(main_func); + memory_plan_ = GraphPlanMemory(lowered_main_func); // The graph planner also can not handle planning calls to global variables to we must remap // First we convert all the parameters into input nodes. - for (auto param : main_func->params) { + for (auto param : lowered_main_func->params) { auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs()); var_map_[param.get()] = AddNode(node_ptr, param); } - heads_ = VisitExpr(main_func->body); + heads_ = VisitExpr(lowered_main_func->body); std::ostringstream os; dmlc::JSONWriter writer(&os); @@ -277,7 +274,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator> external_modules = lowered_mod->GetAttr>("external_mods"); - ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point."; + ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point."; // This is the point where we separate the functions in the module by target ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod); diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 82455bdf925c..df14b9e078b6 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -292,14 +292,8 @@ InterpreterState::InterpreterState(Expr current_expr, InterpreterState::Stack st class Interpreter : public ExprFunctor, PatternFunctor { public: - // TODO(mbs, electriclilies): Collapse mod and per_target_module once IRModule subsumes - // LoweredModule. - Interpreter(IRModule mod, Map per_target_module, Device device, Target target) - : mod_(mod), - per_target_module_(per_target_module), - device_(device), - target_(target), - debug_op_(Op::Get("debug")) {} + Interpreter(IRModule unified_mod, Device device, Target target) + : unified_mod_(unified_mod), device_(device), target_(target), debug_op_(Op::Get("debug")) {} template T WithFrame(const Frame& fr, const std::function& f) { @@ -316,7 +310,7 @@ class Interpreter : public ExprFunctor, ObjectRef VisitExpr_(const VarNode* var_node) final { return Lookup(GetRef(var_node)); } ObjectRef VisitExpr_(const GlobalVarNode* op) final { - return Eval(mod_->Lookup(GetRef(op))); + return Eval(unified_mod_->Lookup(GetRef(op))); } ObjectRef VisitExpr_(const OpNode* id) override { @@ -387,9 +381,9 @@ class Interpreter : public ExprFunctor, // Project out just the function(s) we need. IRModule lowered_projected_mod; + Map per_target_module = tec::GetPerTargetModules(unified_mod_); std::unordered_map - per_target_module_std_map = - backend::TargetModuleMapToTargetStrModuleMap(per_target_module_); + per_target_module_std_map = backend::TargetModuleMapToTargetStrModuleMap(per_target_module); auto mod_itr = per_target_module_std_map.find(target); ICHECK(mod_itr != per_target_module_std_map.end()) << "No target module for target '" << target->str() << "'"; @@ -876,13 +870,11 @@ class Interpreter : public ExprFunctor, } private: - // Main module. All expressions are eval'ed w.r.t. the definitions in this module. This module - // may contain calls to TIR functions bound in a per_target_module_ below. 
- IRModule mod_; - // Map from target key to lowered TIR functions derived from mod_. - // Note that primitives are implicitly executed on target_, while shape functions are implicitly - // executed on the default 'cpu' host. Thus this map has at most two entries. - Map per_target_module_; + // Unified module. Functions are annotated with their target. + // All expressions are eval'ed w.r.t. the definitions in this module. + // This module contains functions that used to be in main_module and the per_target_module (TIR + // functions) in one module. + IRModule unified_mod_; // Cached packed functions for the primitives and shape functions, keyed by target and // global var name. std::unordered_map, PackedFunc, PairHash> compiled_packed_funcs_; @@ -902,7 +894,7 @@ class Interpreter : public ExprFunctor, * rewritten \p mod and target-specific modules containing bindings for all TIR primitive * functions needed by the rewritten module. */ -std::pair> Prepare(IRModule mod, Device device, Target target) { +IRModule Prepare(IRModule mod, Device device, Target target) { // Things to initialize to pass into tec::LowerTEPass // We only have one device-specific target. tec::TargetMap targets = {{device.device_type, target}}; @@ -930,8 +922,7 @@ std::pair> Prepare(IRModule mod, Device device, With ctx(pass_ctx); mod = seq(mod); - // Lower all primitive functions reachable from expr. - return {tec::GetMainModule(mod), tec::GetPerTargetModules(mod)}; + return mod; } /*! \brief Check if an expression could be changed by \p Prepare. @@ -1020,11 +1011,9 @@ TypedPackedFunc)> EvalFunction(IRModule mod, Expr expr, De // and can just eval it directly. expr_to_eval = expr; } - std::pair> main_and_lowered = - Prepare(mod_with_expr, device, target); - std::shared_ptr intrp = std::make_shared( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); + IRModule lowered_mod = Prepare(mod_with_expr, device, target); + + std::shared_ptr intrp = std::make_shared(lowered_mod, device, target); // // Step 2: Evaluate target function to a closure. @@ -1063,12 +1052,11 @@ ObjectRef Eval(Expr expr, Map type_definitions, std::unordered_set import_set, Device device, Target target) { std::pair mod_and_global = IRModule::FromExprInContext(expr, /*global_funcs=*/{}, type_definitions, import_set); - std::pair> main_and_lowered = - Prepare(mod_and_global.first, device, target); - Interpreter intrp( - /*mod=*/main_and_lowered.first, /*per_target_module=*/main_and_lowered.second, device, - target); - Expr expr_to_eval = main_and_lowered.first->GetGlobalVar(mod_and_global.second->name_hint); + + IRModule mod = Prepare(mod_and_global.first, device, target); + + Interpreter intrp(mod, device, target); + Expr expr_to_eval = mod->GetGlobalVar(mod_and_global.second->name_hint); if (expr.as() == nullptr) { // TODO(mbs): IRModule::FromExpr will implicitly close over the free vars of expr // unless it is a function, so we must reverse that in the expression to eval. 
diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 4d7f50b4f3a0..0393fdfec70d 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -900,8 +900,9 @@ Map GetPerTargetModules(IRModule mod) { // Put the function in per_target_modules if (!per_target_modules.count(target.value())) { - // Initialize the IRModule for this target and add the function - IRModule target_module; + // Initialize the IRModule for this target with the attributes from the input IRModule + IRModule target_module = IRModule({}, {}, {}, {}, mod->attrs); + // Add the function to the IRModule target_module->Add(var, func); per_target_modules[target.value()] = target_module; } else { @@ -918,23 +919,6 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -IRModule GetMainModule(IRModule mod) { - IRModule main_module; - // Copy the type defs - for (const auto& kv : mod->type_definitions) { - main_module->AddTypeDef(kv.first, kv.second); - } - // Copy all Relay functions (we don't include PrimFuncs) - for (auto kv : mod->functions) { - const GlobalVar& var = kv.first; - const BaseFunc& func = kv.second; - if (func->IsInstance()) { - main_module->Add(var, func); - } - } - return main_module; -} - Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, backend::StaticMemoryPlan memory_plan, const String& module_name, std::function process_fn) { @@ -942,9 +926,8 @@ Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, PassContext ctx) { return LowerTE(module, targets, device_context_map, memory_plan, module_name, process_fn); }; - // TODO(@electriclilies, mbs): Fold InferType() pass into LowerTEPass since it will always need to - // be called afterwards - return tvm::transform::CreateModulePass(pass_func, 1, "LowerTE", {}); + return tvm::transform::Sequential( + {tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), InferType()}); } } // namespace tec } // namespace relay diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 082cd8c4491a..9d0eb1078ee0 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -165,13 +165,6 @@ Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets); */ Map GetPerTargetModules(IRModule mod); -/*! - * \brief Utility to extract all the Relay functions from an IRModule, with no PrimFuncs. - * \param mod The IRModule to extract the Relay functions from - * \return An IRModule containing only the Relay functions that are in the input mod (no PrimFuncs) - */ -IRModule GetMainModule(IRModule mod); - /*! \brief Lower an IRModule's primitive functions to TIR. * * This is the "back half" of the Relay compiler which lowers "primitive functions" diff --git a/src/relay/ir/transform.cc b/src/relay/ir/transform.cc index 4a7974cae5ae..344d1cae7823 100644 --- a/src/relay/ir/transform.cc +++ b/src/relay/ir/transform.cc @@ -133,9 +133,7 @@ IRModule FunctionPassNode::operator()(IRModule mod, const PassContext& pass_ctx) DLOG(INFO) << "Executing function pass : " << pass_info->name << " with opt level: " << pass_info->opt_level; - // Execute the pass function and return a new module. 
- IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); std::vector > updates; for (const auto& it : updated_mod->functions) { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index b48fbe44bd11..019659b3166e 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -30,6 +30,7 @@ */ #include +#include #include #include #include @@ -509,7 +510,10 @@ class NameMangleExtFuncs : public MixedModeMutator { // Walk the tree and mangle the functions. Then replace compiler functions // with mangled functions in the module - IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports()); + IRModule new_module = module_->ShallowCopy(); + new_module->functions = {}; + // IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports(), + // module_->source_map, module_->attrs); for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index d03fc1488aea..04158aa02f64 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -52,7 +52,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { DLOG(INFO) << "ToBBlock:" << std::endl << mod; // Create a new module by shallow copy. - auto mod_ = IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + auto mod_ = mod->ShallowCopy(); tvm::Map updates; auto funcs = mod_->functions; diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index f29087dcc049..6c2371716b16 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -205,13 +205,17 @@ class TypeInferencer : private ExprFunctor, this->EmitFatal(Diagnostic::Error(op->span) << "Cannot do type inference on global variables " << "without a module"); } - if (mod_->ContainGlobalVar(var->name_hint)) { - relay::Function e = Downcast(mod_->Lookup(var)); - return e->checked_type(); - } else { - return op->checked_type_; + BaseFunc func = mod_->Lookup(var->name_hint); + + if (func->IsInstance()) { + relay::Function relay_func = Downcast(func); + return relay_func->checked_type(); + } } + // Return op->checked_type if the module doesn't contain the GlobalVar or the function is a + // PrimFunc (we don't typecheck PrimFuncs) + return op->checked_type_; } Type VisitExpr_(const ConstantNode* op) final { return op->tensor_type(); } @@ -822,8 +826,7 @@ Pass InferType() { [=](IRModule mod, const PassContext& pass_ctx) { DLOG(INFO) << "tvm::relay::transform::InferType"; // Execute the pass function and return a new module. - IRModule updated_mod = - IRModule(mod->functions, mod->type_definitions, mod->Imports(), mod->source_map); + IRModule updated_mod = mod->ShallowCopy(); pass_ctx->diag_ctx = DiagnosticContext::Default(updated_mod); diff --git a/tests/python/relay/test_backend_compile_engine.py b/tests/python/relay/test_backend_compile_engine.py index b90bce548a5e..092cae01f568 100644 --- a/tests/python/relay/test_backend_compile_engine.py +++ b/tests/python/relay/test_backend_compile_engine.py @@ -194,6 +194,8 @@ def get_func(shape): engine.dump() +# Note: Once compile engine is removed, we should keep this test so that +# we make sure that opt_level=0 passes are being called correctly. 
def test_compile_placeholder_bypass(): engine = relay.backend.compile_engine.get() x = relay.var("x", shape=(2, 3)) From 6d00a9c4729f945837ae521c087d4f4d3a8f4025 Mon Sep 17 00:00:00 2001 From: electriclilies Date: Tue, 7 Sep 2021 15:53:52 -0700 Subject: [PATCH 02/12] Fix rebase --- src/relay/backend/aot_executor_codegen.cc | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index e2fd502fc632..70779ac58abf 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -598,12 +598,8 @@ class AOTExecutorCodegen : public MixedModeVisitor { tec::UpdateFunctionMetadata(func, this->function_metadata_); })(mod); - Optional main_func_info = - lowered_mod->GetAttr("main_func_info"); - ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; - function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); - - Function lowered_main_func = Downcast(lowered_mod->Lookup("main")); + auto lowered_main = lowered_mod->Lookup("main"); + auto lowered_main_func = GetRef(lowered_main.as()); // Post-lowering storage map for writing main func - this should be the same map as previously // created, just referencing the new expressions created from lowering From 3d2ba1ba7602fb97995e7ae46b187edb0a3a10be Mon Sep 17 00:00:00 2001 From: electriclilies Date: Tue, 7 Sep 2021 22:57:13 -0700 Subject: [PATCH 03/12] Remove comment --- src/relay/transforms/partition_graph.cc | 3 +-- src/relay/transforms/to_basic_block_normal_form.cc | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 019659b3166e..f74cf983ccae 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -512,8 +512,7 @@ class NameMangleExtFuncs : public MixedModeMutator { // with mangled functions in the module IRModule new_module = module_->ShallowCopy(); new_module->functions = {}; - // IRModule new_module = IRModule({}, module_->type_definitions, module_->Imports(), - // module_->source_map, module_->attrs); + for (const auto& pair : glob_funcs) { if (auto* fn = pair.second.as()) { auto func = GetRef(fn); diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 04158aa02f64..8e952d60b8b7 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -52,7 +52,7 @@ IRModule ToBasicBlockNormalForm(const IRModule& mod) { DLOG(INFO) << "ToBBlock:" << std::endl << mod; // Create a new module by shallow copy. 
- auto mod_ = mod->ShallowCopy(); + IRModule mod_ = mod->ShallowCopy(); tvm::Map updates; auto funcs = mod_->functions; From ee52039cdc81a229477985ec2039f7a2b71d404f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 8 Sep 2021 16:29:54 +0300 Subject: [PATCH 04/12] [TEC] Remove memory plan from LowerTEPass --- src/relay/backend/aot_executor_codegen.cc | 172 ++++++++++++++++++- src/relay/backend/graph_executor_codegen.cc | 2 +- src/relay/backend/interpreter.cc | 21 ++- src/relay/backend/te_compiler.cc | 178 +------------------- src/relay/backend/te_compiler.h | 4 +- 5 files changed, 190 insertions(+), 187 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 70779ac58abf..a928a76c6df1 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -45,6 +45,13 @@ namespace tvm { namespace relay { namespace backend { +struct EnumClassHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + using IntegerArray = Array; using StorageMap = std::unordered_map; @@ -277,6 +284,132 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } + /*! + * \brief Update the "main" control function's metadata + * + * \param mod The module + * \param targets Map of targets + * \return function_infos Function info for each function in the module + */ + + backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map) { + CHECK_EQ(mod->functions.size(), 1) + << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; + Function func = Downcast(mod->Lookup("main")); + + // This is a Map> + std::unordered_map, EnumClassHash> sid_workspace; + // This is a Map + std::unordered_map device_io; + // This is a Map + std::unordered_map device_consts; + + // Initialize the mapping from all storage identifiers to workspace sizes, + // the amount of device io, and the device constants. + for (const auto& kv : storage_info_map) { + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + CHECK_EQ(storage_ids.size(), devices.size()); + for (uint32_t i = 0; i < devices.size(); i++) { + sid_workspace[devices[i]][storage_ids[i]] = 0; + device_io[devices[i]] = 0; + device_consts[devices[i]] = 0; + } + } + + // Iterate the storage map to compute all the tensor sizes in the program. + // There are 3 cases in this code: + // + // First we need to compute the sizes of all + // inline constants. + // + // Second we compute the size of any bound variable as these are input and output + // sizes of the program. + // + // Finally for all other expressions we check which storage identifier they have + // been assigned and we compute the maximal size of the storage, as tensors can + // share storage with other tensors which are the same size or larger. + // + // In this final case there is only one allocation for all tensors which share storage + // which will be the maximal size of all tensors which were assigned to it. 
+ for (const auto& kv : storage_info_map) { + Expr expr = kv.first; + int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + if (expr->IsInstance()) { + for (const auto& dev : devices) { + device_consts[dev] += size_bytes; + } + continue; + } else if (expr->IsInstance() || expr.same_as(func->body)) { + CHECK_GE(devices.size(), 1) << "must be at least one device"; + for (const auto& dev : devices) { + device_io[dev] += size_bytes; + } + continue; + } + + // TODO(@electriclilies): This code is never being called which means sid_workspace is not + // updated.. This means that storage info is probably not being created correctly. Or is not + // equivalent to what was here previously + for (uint32_t i = 0; i < storage_ids.size(); i++) { + // Here we record the largest size of the tensor + // that share the same storage id, because storage_id will + // be shared between multiple tensors that are not live simultaneously. + if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { + sid_workspace[devices[i]][storage_ids[i]] = size_bytes; + } + } + } + + // This is a Map + std::unordered_map device_workspace; + // Once we know the sizes of sids, we need to accumulate per device + for (const auto& dev_sid_size : sid_workspace) { + auto dev = dev_sid_size.first; + device_workspace[dev] = 0; + for (const auto& sid_size : dev_sid_size.second) { + device_workspace[dev] += sid_size.second; + } + } + + Map workspace_sizes; + Map io_sizes; + Map constant_sizes; + Map tir_primfuncs; + Map relay_primfuncs; + + // Initialize all target workspaces to zero + for (const auto& kv : targets) { + auto tgt = kv.second; + workspace_sizes.Set(tgt, 0); + } + + for (const auto& dev_and_size : device_workspace) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + workspace_sizes.Set(tgt, dev_and_size.second); + relay_primfuncs.Set(tgt, func); + } + for (const auto& dev_and_size : device_io) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + io_sizes.Set(tgt, dev_and_size.second); + } + + for (const auto& dev_and_size : device_consts) { + auto tgt = GetTargetFromInteger(dev_and_size.first, targets); + constant_sizes.Set(tgt, dev_and_size.second); + } + + return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, + relay_primfuncs); + } + /*! * brief Call a function with a given name */ @@ -314,6 +447,36 @@ class AOTExecutorCodegen : public MixedModeVisitor { stmts_.push_back(body); } + Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { + if (targets.size() == 1) { + // The homogeneous execution case, return the only target. + const auto& it = targets.begin(); + return (*it).second; + } else { + // The heterogeneous execution case, return the target associated with the + // given device type. + // If "dev_type" equals to 0, the device name only can be got from + // "targets", and it may not be "llvm", so here just set it to "unknown". 
+ std::string dev_name = "unknown"; + if (dev_type != 0) { + dev_name = runtime::DeviceName(dev_type); + } + + if (targets.count(dev_type) == 0) { + std::stringstream msg; + msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" + << dev_name << " mapped to device type (" << dev_type + << ") which was not found in the target map.\n" + << "Availible targets: \n"; + for (auto target : targets) { + msg << " " << target.first << "-> " << target.second << "\n"; + } + LOG(FATAL) << msg.str(); + } + return targets[dev_type]; + } + } + /*! * brief Copy a variable to the output. This function is mainly used in edge cases * when we want to return an input or a parameter. @@ -583,8 +746,15 @@ class AOTExecutorCodegen : public MixedModeVisitor { // performing the preexisting AOT executor code generation phase. IRModule mod = IRModule::FromExpr(func); + backend::FunctionInfo func_info; + + if (memory_plan.defined()) { + // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize + func_info = UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + } + IRModule lowered_mod = - LowerTEPass(targets_, device_context_map, memory_plan, mod_name, [this](Function func) { + LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index aca95db34c4e..3e7c952589d5 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -222,7 +222,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator ctx(pass_ctx); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 0393fdfec70d..5807ade08479 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -596,8 +596,7 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; -Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, +Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, TECompiler compiler, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](Function func, IRModule module, PassContext ctx) { @@ -608,162 +607,6 @@ Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); } -Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets) { - if (targets.size() == 1) { - // The homogeneous execution case, return the only target. - const auto& it = targets.begin(); - return (*it).second; - } else { - // The heterogeneous execution case, return the target associated with the - // given device type. - // If "dev_type" equals to 0, the device name only can be got from - // "targets", and it may not be "llvm", so here just set it to "unknown". 
- std::string dev_name = "unknown"; - if (dev_type != 0) { - dev_name = runtime::DeviceName(dev_type); - } - - if (targets.count(dev_type) == 0) { - std::stringstream msg; - msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" - << dev_name << " mapped to device type (" << dev_type - << ") which was not found in the target map.\n" - << "Availible targets: \n"; - for (auto target : targets) { - msg << " " << target.first << "-> " << target.second << "\n"; - } - LOG(FATAL) << msg.str(); - } - return targets[dev_type]; - } -} - -/*! - * \brief Update the "main" control function's metadata - * - * \param mod The module - * \param targets Map of targets - * \return function_infos Function info for each function in the module - */ - -backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, TargetMap targets, - Map storage_info_map) { - CHECK_EQ(mod->functions.size(), 1) - << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; - Function func = Downcast(mod->Lookup("main")); - - // This is a Map> - std::unordered_map, EnumClassHash> sid_workspace; - // This is a Map - std::unordered_map device_io; - // This is a Map - std::unordered_map device_consts; - - // Initialize the mapping from all storage identifiers to workspace sizes, - // the amount of device io, and the device constants. - for (const auto& kv : storage_info_map) { - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - CHECK_EQ(storage_ids.size(), devices.size()); - for (uint32_t i = 0; i < devices.size(); i++) { - sid_workspace[devices[i]][storage_ids[i]] = 0; - device_io[devices[i]] = 0; - device_consts[devices[i]] = 0; - } - } - - // Iterate the storage map to compute all the tensor sizes in the program. - // There are 3 cases in this code: - // - // First we need to compute the sizes of all - // inline constants. - // - // Second we compute the size of any bound variable as these are input and output - // sizes of the program. - // - // Finally for all other expressions we check which storage identifier they have - // been assigned and we compute the maximal size of the storage, as tensors can - // share storage with other tensors which are the same size or larger. - // - // In this final case there is only one allocation for all tensors which share storage - // which will be the maximal size of all tensors which were assigned to it. - for (const auto& kv : storage_info_map) { - Expr expr = kv.first; - int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - if (expr->IsInstance()) { - for (const auto& dev : devices) { - device_consts[dev] += size_bytes; - } - continue; - } else if (expr->IsInstance() || expr.same_as(func->body)) { - CHECK_GE(devices.size(), 1) << "must be at least one device"; - for (const auto& dev : devices) { - device_io[dev] += size_bytes; - } - continue; - } - - // TODO(@electriclilies): This code is never being called which means sid_workspace is not - // updated.. This means that storage info is probably not being created correctly. 
Or is not - // equivalent to what was here previously - for (uint32_t i = 0; i < storage_ids.size(); i++) { - // Here we record the largest size of the tensor - // that share the same storage id, because storage_id will - // be shared between multiple tensors that are not live simultaneously. - if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { - sid_workspace[devices[i]][storage_ids[i]] = size_bytes; - } - } - } - - // This is a Map - std::unordered_map device_workspace; - // Once we know the sizes of sids, we need to accumulate per device - for (const auto& dev_sid_size : sid_workspace) { - auto dev = dev_sid_size.first; - device_workspace[dev] = 0; - for (const auto& sid_size : dev_sid_size.second) { - device_workspace[dev] += sid_size.second; - } - } - - Map workspace_sizes; - Map io_sizes; - Map constant_sizes; - Map tir_primfuncs; - Map relay_primfuncs; - - // Initialize all target workspaces to zero - for (const auto& kv : targets) { - auto tgt = kv.second; - workspace_sizes.Set(tgt, 0); - } - - for (const auto& dev_and_size : device_workspace) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - workspace_sizes.Set(tgt, dev_and_size.second); - relay_primfuncs.Set(tgt, func); - } - for (const auto& dev_and_size : device_io) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - io_sizes.Set(tgt, dev_and_size.second); - } - - for (const auto& dev_and_size : device_consts) { - auto tgt = GetTargetFromInteger(dev_and_size.first, targets); - constant_sizes.Set(tgt, dev_and_size.second); - } - - return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, - relay_primfuncs); -} - /*! * \brief A function to create the function metadata for an input function (ie calculate buffer * input/output sizes) @@ -844,20 +687,13 @@ void UpdateFunctionMetadata(Function relay_func, } IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn) { + const String& module_name, std::function process_fn) { DLOG(INFO) << "lowering module:\n" << PrettyPrint(module); TECompiler compiler; - backend::FunctionInfo func_info; - if (memory_plan.defined()) { - // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize - func_info = UpdateMainWorkspaceSize(module, targets, memory_plan->expr_to_storage_info); - } - - auto updated_module = LowerTensorExpr(targets, device_context_map, memory_plan, module_name, - compiler, process_fn)(module); + auto updated_module = + LowerTensorExpr(targets, device_context_map, module_name, compiler, process_fn)(module); // A temporary solution until we can rewrite the auto-scheduler task extraction code to work // in a more reasonable way. 
@@ -876,6 +712,7 @@ IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_con (*te_compiler_update_weights)(weight_map); } + backend::FunctionInfo func_info; // Copy the lowered functions into the return module updated_module->Update(compiler->GetLoweredFunctions()); @@ -919,12 +756,11 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, +Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, const String& module_name, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](IRModule module, PassContext ctx) { - return LowerTE(module, targets, device_context_map, memory_plan, module_name, process_fn); + return LowerTE(module, targets, device_context_map, module_name, process_fn); }; return tvm::transform::Sequential( {tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), InferType()}); diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 9d0eb1078ee0..c337ec784a3d 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -192,15 +192,13 @@ IRModule LowerTE( * * \param targets The mapping for devices to targets. * \param device_context_map An analysis result mapping each sub-expression to a device. - * \param memory_plan The memory plan used during lowering * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower * \returns The pass which lowers primative functions to TIR */ transform::Pass LowerTEPass(TargetMap targets, DeviceMap device_context_map, - backend::StaticMemoryPlan memory_plan, const String& module_name, - std::function process_fn); + const String& module_name, std::function process_fn); } // namespace tec } // namespace relay } // namespace tvm From 8a0a3c97098f82440eafaec051f1703b815e4cfa Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Thu, 9 Sep 2021 10:41:46 +0300 Subject: [PATCH 05/12] Fix linting errors --- cmake/modules/contrib/EthosU.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/contrib/EthosU.cmake b/cmake/modules/contrib/EthosU.cmake index 8f3e09b8179b..899f2da1c577 100644 --- a/cmake/modules/contrib/EthosU.cmake +++ b/cmake/modules/contrib/EthosU.cmake @@ -18,4 +18,4 @@ if(USE_ETHOSU) file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*) list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC}) -endif(USE_ETHOSU) \ No newline at end of file +endif(USE_ETHOSU) From 97332839b30332dcd9a406a9ad371f3877d013ad Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 11:49:07 +0300 Subject: [PATCH 06/12] Fix PR comments --- src/relay/backend/aot_executor_codegen.cc | 13 +++---------- src/relay/backend/graph_executor_codegen.cc | 2 +- src/relay/backend/te_compiler.h | 13 ++----------- src/relay/backend/utils.h | 11 +++++++++++ 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 8b708a83fa94..40ff9008d3b7 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -38,20 +38,13 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { namespace backend { -struct EnumClassHash { - template - 
std::size_t operator()(T t) const { - return static_cast(t); - } -}; - using IntegerArray = Array; using StorageMap = std::unordered_map; @@ -724,7 +717,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { } IRModule lowered_mod = - LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { + tec::LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 3e7c952589d5..4851bc8a8559 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -222,7 +222,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator - std::size_t operator()(T t) const { - return static_cast(t); - } -}; - // TODO(@jroesch, @chrisS) these should be a tvm::Map for uniformity sake // we should a version of context which works in Map -using TargetMap = std::unordered_map; +using TargetMap = std::unordered_map; using DeviceMap = std::unordered_map; using ProcessFn = std::function; diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index cf8a2dd4b8e0..ae8d7d2c2360 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -146,6 +146,17 @@ struct LoweredOutput { runtime::Metadata metadata; }; +/*! + * \brief This class is needed to avoid a GCC 5 bug that prevents maps containing enums from being + compiled. If i386 GCC version is increased, we can remove it. + */ +struct EnumClassHash { + template + std::size_t operator()(T t) const { + return static_cast(t); + } +}; + /*! * \brief A helper to expand the params by adding the ones used in a given expression. 
*/ From 0a1beb6cec057fb4be125858596f3770b950b51f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 13:29:14 +0300 Subject: [PATCH 07/12] Remove updated module with function info from LowerTe --- src/relay/backend/te_compiler.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 0d1b7cef0220..f5576e623df5 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -742,14 +742,12 @@ IRModule LowerTE(const IRModule& module, TargetMap targets, DeviceMap device_con (*te_compiler_update_weights)(weight_map); } - backend::FunctionInfo func_info; // Copy the lowered functions into the return module updated_module->Update(compiler->GetLoweredFunctions()); // Annotate the module with the external modules and function info updated_module = WithAttr(updated_module, "external_mods", compiler->LowerExternalFunctions()); - updated_module = WithAttr(updated_module, "main_func_info", func_info); return updated_module; } From 03a208cf886b6ded83926016e96327dd8688218f Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 14:08:04 +0300 Subject: [PATCH 08/12] Refactor UpdateMainWorkspaceSize to update func info independently from LowerTEPass --- src/relay/backend/aot_executor_codegen.cc | 128 +------------------- src/relay/backend/graph_executor_codegen.cc | 15 ++- src/relay/backend/te_compiler.cc | 125 ++++++++++++++++++- src/relay/backend/te_compiler.h | 10 ++ 4 files changed, 146 insertions(+), 132 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 40ff9008d3b7..898731e6a54d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -277,132 +277,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { } } - /*! - * \brief Update the "main" control function's metadata - * - * \param mod The module - * \param targets Map of targets - * \return function_infos Function info for each function in the module - */ - - backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, - Map storage_info_map) { - CHECK_EQ(mod->functions.size(), 1) - << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; - Function func = Downcast(mod->Lookup("main")); - - // This is a Map> - std::unordered_map, EnumClassHash> sid_workspace; - // This is a Map - std::unordered_map device_io; - // This is a Map - std::unordered_map device_consts; - - // Initialize the mapping from all storage identifiers to workspace sizes, - // the amount of device io, and the device constants. - for (const auto& kv : storage_info_map) { - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - CHECK_EQ(storage_ids.size(), devices.size()); - for (uint32_t i = 0; i < devices.size(); i++) { - sid_workspace[devices[i]][storage_ids[i]] = 0; - device_io[devices[i]] = 0; - device_consts[devices[i]] = 0; - } - } - - // Iterate the storage map to compute all the tensor sizes in the program. - // There are 3 cases in this code: - // - // First we need to compute the sizes of all - // inline constants. - // - // Second we compute the size of any bound variable as these are input and output - // sizes of the program. 
- // - // Finally for all other expressions we check which storage identifier they have - // been assigned and we compute the maximal size of the storage, as tensors can - // share storage with other tensors which are the same size or larger. - // - // In this final case there is only one allocation for all tensors which share storage - // which will be the maximal size of all tensors which were assigned to it. - for (const auto& kv : storage_info_map) { - Expr expr = kv.first; - int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - if (expr->IsInstance()) { - for (const auto& dev : devices) { - device_consts[dev] += size_bytes; - } - continue; - } else if (expr->IsInstance() || expr.same_as(func->body)) { - CHECK_GE(devices.size(), 1) << "must be at least one device"; - for (const auto& dev : devices) { - device_io[dev] += size_bytes; - } - continue; - } - - // TODO(@electriclilies): This code is never being called which means sid_workspace is not - // updated.. This means that storage info is probably not being created correctly. Or is not - // equivalent to what was here previously - for (uint32_t i = 0; i < storage_ids.size(); i++) { - // Here we record the largest size of the tensor - // that share the same storage id, because storage_id will - // be shared between multiple tensors that are not live simultaneously. - if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { - sid_workspace[devices[i]][storage_ids[i]] = size_bytes; - } - } - } - - // This is a Map - std::unordered_map device_workspace; - // Once we know the sizes of sids, we need to accumulate per device - for (const auto& dev_sid_size : sid_workspace) { - auto dev = dev_sid_size.first; - device_workspace[dev] = 0; - for (const auto& sid_size : dev_sid_size.second) { - device_workspace[dev] += sid_size.second; - } - } - - Map workspace_sizes; - Map io_sizes; - Map constant_sizes; - Map tir_primfuncs; - Map relay_primfuncs; - - // Initialize all target workspaces to zero - for (const auto& kv : targets) { - auto tgt = kv.second; - workspace_sizes.Set(tgt, 0); - } - - for (const auto& dev_and_size : device_workspace) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - workspace_sizes.Set(tgt, dev_and_size.second); - relay_primfuncs.Set(tgt, func); - } - for (const auto& dev_and_size : device_io) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - io_sizes.Set(tgt, dev_and_size.second); - } - - for (const auto& dev_and_size : device_consts) { - auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); - constant_sizes.Set(tgt, dev_and_size.second); - } - - return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, - relay_primfuncs); - } - /*! 
* brief Call a function with a given name */ @@ -713,7 +587,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (memory_plan.defined()) { // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize - func_info = UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + func_info = tec::UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); } IRModule lowered_mod = diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index 4851bc8a8559..b3c9746ab75f 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -36,8 +36,8 @@ #include #include -#include "te_compiler.h" -#include "utils.h" +#include "./te_compiler.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -221,6 +221,14 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorexpr_to_storage_info); + } + IRModule lowered_mod = tec::LowerTEPass(targets_, device_context_map, mod_name_, [this](Function func) { // We need to maintain the constant map for external @@ -236,8 +244,11 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorfunction_metadata_); })(mod); + lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); + Optional main_func_info = lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index f5576e623df5..387f27c6fc79 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -17,7 +17,7 @@ * under the License. */ -#include "te_compiler.h" +#include "./te_compiler.h" #include #include @@ -42,8 +42,8 @@ #include #include -#include "te_compiler_cache.h" -#include "utils.h" +#include "./te_compiler_cache.h" +#include "./utils.h" namespace tvm { namespace relay { @@ -596,6 +596,125 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map) { + CHECK_EQ(mod->functions.size(), 1) + << "There should only be one function in the module passed to UpdateMainWorkspaceSize"; + Function func = Downcast(mod->Lookup("main")); + + // This is a Map> + std::unordered_map, backend::EnumClassHash> + sid_workspace; + // This is a Map + std::unordered_map device_io; + // This is a Map + std::unordered_map device_consts; + + // Initialize the mapping from all storage identifiers to workspace sizes, + // the amount of device io, and the device constants. + for (const auto& kv : storage_info_map) { + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + CHECK_EQ(storage_ids.size(), devices.size()); + for (uint32_t i = 0; i < devices.size(); i++) { + sid_workspace[devices[i]][storage_ids[i]] = 0; + device_io[devices[i]] = 0; + device_consts[devices[i]] = 0; + } + } + + // Iterate the storage map to compute all the tensor sizes in the program. + // There are 3 cases in this code: + // + // First we need to compute the sizes of all + // inline constants. + // + // Second we compute the size of any bound variable as these are input and output + // sizes of the program. 
+ // + // Finally for all other expressions we check which storage identifier they have + // been assigned and we compute the maximal size of the storage, as tensors can + // share storage with other tensors which are the same size or larger. + // + // In this final case there is only one allocation for all tensors which share storage + // which will be the maximal size of all tensors which were assigned to it. + for (const auto& kv : storage_info_map) { + Expr expr = kv.first; + int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type()); + backend::StorageInfo storage_info = kv.second; + std::vector storage_ids = storage_info->storage_ids; + std::vector devices = storage_info->device_types; + + if (expr->IsInstance()) { + for (const auto& dev : devices) { + device_consts[dev] += size_bytes; + } + continue; + } else if (expr->IsInstance() || expr.same_as(func->body)) { + CHECK_GE(devices.size(), 1) << "must be at least one device"; + for (const auto& dev : devices) { + device_io[dev] += size_bytes; + } + continue; + } + + // TODO(@electriclilies): This code is never being called which means sid_workspace is not + // updated.. This means that storage info is probably not being created correctly. Or is not + // equivalent to what was here previously + for (uint32_t i = 0; i < storage_ids.size(); i++) { + // Here we record the largest size of the tensor + // that share the same storage id, because storage_id will + // be shared between multiple tensors that are not live simultaneously. + if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { + sid_workspace[devices[i]][storage_ids[i]] = size_bytes; + } + } + } + + // This is a Map + std::unordered_map device_workspace; + // Once we know the sizes of sids, we need to accumulate per device + for (const auto& dev_sid_size : sid_workspace) { + auto dev = dev_sid_size.first; + device_workspace[dev] = 0; + for (const auto& sid_size : dev_sid_size.second) { + device_workspace[dev] += sid_size.second; + } + } + + Map workspace_sizes; + Map io_sizes; + Map constant_sizes; + Map tir_primfuncs; + Map relay_primfuncs; + + // Initialize all target workspaces to zero + for (const auto& kv : targets) { + auto tgt = kv.second; + workspace_sizes.Set(tgt, 0); + } + + for (const auto& dev_and_size : device_workspace) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + workspace_sizes.Set(tgt, dev_and_size.second); + relay_primfuncs.Set(tgt, func); + } + for (const auto& dev_and_size : device_io) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + io_sizes.Set(tgt, dev_and_size.second); + } + + for (const auto& dev_and_size : device_consts) { + auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets); + constant_sizes.Set(tgt, dev_and_size.second); + } + + return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, + relay_primfuncs); +} + Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { if (targets.size() == 1) { // The homogeneous execution case, return the only target. diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index 8b0f2a6e802e..d5135e6301c4 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -149,6 +149,16 @@ void UpdateFunctionMetadata(Function relay_func, */ Target GetTargetFromInteger(DLDeviceType dev_type, TargetMap targets); +/*! 
+ * \brief Update the "main" control function's metadata + * + * \param mod The module + * \param targets Map of targets + * \return function_infos Function info for each function in the module + */ +backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, + Map storage_info_map); + /*! \brief Utility to separate the functions in an IRModule by Target. * * \param mod The IRModule to extract the per target module from From b9654430bed4466dda468cc52b81635694fe3002 Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 10 Sep 2021 15:20:45 +0300 Subject: [PATCH 09/12] Fix aot failed tests --- src/relay/backend/aot_executor_codegen.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 898731e6a54d..a7bbcb007348 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -666,8 +666,11 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), workspace_byte_alignment); + lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); + Optional main_func_info = lowered_mod->GetAttr("main_func_info"); + ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); From c4cc22dc3c515028ebc7a3216f4e569db8d2d0fe Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 11:14:36 +0300 Subject: [PATCH 10/12] Revert whitespaces fixes --- .github/ISSUE_TEMPLATE/ci-problem.md | 2 +- .github/ISSUE_TEMPLATE/documentation.md | 1 + 3rdparty/vta-hw | 2 +- cmake/modules/contrib/EthosU.cmake | 2 +- docs/langref/relay_pattern.rst | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/ci-problem.md b/.github/ISSUE_TEMPLATE/ci-problem.md index 5a5977ab5000..f46a42f42cf5 100644 --- a/.github/ISSUE_TEMPLATE/ci-problem.md +++ b/.github/ISSUE_TEMPLATE/ci-problem.md @@ -19,4 +19,4 @@ Provide a link to the specific run that has failed. ### Flakiness -Have you seen this multiple times in this branch or in other branches? +Have you seen this multiple times in this branch or in other branches? \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 0ad9424c1b5d..a1c1facb7e1b 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -19,3 +19,4 @@ Include the title of the document (e.g. "Getting Started with TVM"), and the typ If an RFC/discuss post exists, link it here. Otherwise, specify what actions should be taken to provide additional clarity/readability/reproducibility to the document. Include code snippets from the previous documentation if applicable. 
+ diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw index dfe9f572a43d..36a91576edf6 160000 --- a/3rdparty/vta-hw +++ b/3rdparty/vta-hw @@ -1 +1 @@ -Subproject commit dfe9f572a43d41e0c1ecdf036cea97042a0febfe +Subproject commit 36a91576edf633479c78649e050f18dd2ddc8103 diff --git a/cmake/modules/contrib/EthosU.cmake b/cmake/modules/contrib/EthosU.cmake index 899f2da1c577..8f3e09b8179b 100644 --- a/cmake/modules/contrib/EthosU.cmake +++ b/cmake/modules/contrib/EthosU.cmake @@ -18,4 +18,4 @@ if(USE_ETHOSU) file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*) list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC}) -endif(USE_ETHOSU) +endif(USE_ETHOSU) \ No newline at end of file diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst index 16211b2cb125..4682e5aa5b33 100644 --- a/docs/langref/relay_pattern.rst +++ b/docs/langref/relay_pattern.rst @@ -89,7 +89,7 @@ Or a convolution with a specific kernel size: x = relay.var('x') y = relay.var('y') assert is_conv2d.match(relay.op.nn.conv2d(x, y, kernel_size=[3, 3])) - + Matching an Optional Op From ed2c1cc6df49fdcc18e48fed46f1a903a9585a7a Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 11:21:45 +0300 Subject: [PATCH 11/12] Remove obsolete function hoisting and minor cleanups --- src/relay/backend/interpreter.cc | 3 -- src/relay/backend/te_compiler.cc | 82 ++++++++++++++++---------------- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index a757d2d29622..d87cf9811bc7 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -902,9 +902,6 @@ IRModule Prepare(IRModule mod, Device device, Target target) { // All calls to primitives will use the unique target. tec::DeviceMap device_map; - // No need for a memory plan. - backend::StaticMemoryPlan memory_plan; /*=nullptr*/ - // Run minimal transforms on module to establish invariants needed by interpreter. transform::Sequential seq({transform::SimplifyInference(), // FuseOps will mark wrapped calls to prim-ops with the 'Primitive' diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 387f27c6fc79..2e7eb6f9aa6b 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -596,6 +596,47 @@ class LowerTensorExprMutator : public ExprMutator { const Op& debug_op_; }; +Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { + if (targets.size() == 1) { + // The homogeneous execution case, return the only target. + const auto& it = targets.begin(); + return (*it).second; + } else { + // The heterogeneous execution case, return the target associated with the + // given device type. + // If "dev_type" equals to 0, the device name only can be got from + // "targets", and it may not be "llvm", so here just set it to "unknown". 
+ std::string dev_name = "unknown"; + if (dev_type != 0) { + dev_name = runtime::DeviceName(dev_type); + } + + if (targets.count(dev_type) == 0) { + std::stringstream msg; + msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" + << dev_name << " mapped to device type (" << dev_type + << ") which was not found in the target map.\n" + << "Availible targets: \n"; + for (auto target : targets) { + msg << " " << target.first << "-> " << target.second << "\n"; + } + LOG(FATAL) << msg.str(); + } + return targets[dev_type]; + } +} + +Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, + TECompiler compiler, std::function process_fn) { + runtime::TypedPackedFunc pass_func = + [=](Function func, IRModule module, PassContext ctx) { + LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn, + module_name, compiler); + return Downcast(lower_te.Mutate(func)); + }; + return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); +} + backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets, Map storage_info_map) { CHECK_EQ(mod->functions.size(), 1) @@ -715,47 +756,6 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa relay_primfuncs); } -Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { - if (targets.size() == 1) { - // The homogeneous execution case, return the only target. - const auto& it = targets.begin(); - return (*it).second; - } else { - // The heterogeneous execution case, return the target associated with the - // given device type. - // If "dev_type" equals to 0, the device name only can be got from - // "targets", and it may not be "llvm", so here just set it to "unknown". - std::string dev_name = "unknown"; - if (dev_type != 0) { - dev_name = runtime::DeviceName(dev_type); - } - - if (targets.count(dev_type) == 0) { - std::stringstream msg; - msg << "No target is specified for provided device name: `" << dev_name << "`\n\n" - << dev_name << " mapped to device type (" << dev_type - << ") which was not found in the target map.\n" - << "Availible targets: \n"; - for (auto target : targets) { - msg << " " << target.first << "-> " << target.second << "\n"; - } - LOG(FATAL) << msg.str(); - } - return targets[dev_type]; - } -} - -Pass LowerTensorExpr(TargetMap targets, DeviceMap device_context_map, const String& module_name, - TECompiler compiler, std::function process_fn) { - runtime::TypedPackedFunc pass_func = - [=](Function func, IRModule module, PassContext ctx) { - LowerTensorExprMutator lower_te(module, targets, device_context_map, process_fn, - module_name, compiler); - return Downcast(lower_te.Mutate(func)); - }; - return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); -} - /*! 
* \brief A function to create the function metadata for an input function (ie calculate buffer * input/output sizes) From ce9fbdf806a17ae1bfd1c0a5181225ddbf061e0d Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Wed, 15 Sep 2021 12:39:22 +0300 Subject: [PATCH 12/12] Address PR comments --- .../reference-vm/arduino/base-box/base_box_provision.sh | 2 +- docker/install/ubuntu_install_vitis_ai_packages_ci.sh | 4 ++-- src/relay/backend/aot_executor_codegen.cc | 4 +--- src/relay/backend/graph_executor_codegen.cc | 4 +--- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh index d55100693d5c..11d89f2cd44e 100644 --- a/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh +++ b/apps/microtvm/reference-vm/arduino/base-box/base_box_provision.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. # -# Using this script we can reuse docker/install scripts to configure the reference +# Using this script we can reuse docker/install scripts to configure the reference # virtual machine similar to CI QEMU setup. # diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh index f6f270a07a83..25c214068cd0 100644 --- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh +++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index a7bbcb007348..ad9ba1b2069d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -588,6 +588,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { if (memory_plan.defined()) { // TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize func_info = tec::UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); } IRModule lowered_mod = @@ -666,12 +667,9 @@ class AOTExecutorCodegen : public MixedModeVisitor { Downcast(mod_run->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix)), workspace_byte_alignment); - lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info); - Optional main_func_info = lowered_mod->GetAttr("main_func_info"); - ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point."; main_func_info.value()->workspace_sizes.Set(target_host_, main_workspace_size); function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value()); diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index b3c9746ab75f..92e7568d9f38 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -227,6 +227,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorexpr_to_storage_info); + mod = WithAttr(mod, "main_func_info", func_info); } IRModule lowered_mod = @@ -244,12 +245,9 @@ class GraphExecutorCodegen : 
public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>>
                                            tec::UpdateFunctionMetadata(func, this->function_metadata_);
                                          })(mod);
 
-    lowered_mod = WithAttr(lowered_mod, "main_func_info", func_info);
-
     Optional<backend::FunctionInfo> main_func_info =
         lowered_mod->GetAttr<backend::FunctionInfo>("main_func_info");
-    ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point.";
     function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value());
 
     Function lowered_main_func = Downcast<Function>(lowered_mod->Lookup("main"));
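
Note on the "main_func_info" handling in patches 09-12 above: the FunctionInfo for "main" is
attached to the IRModule as a module-level attribute before lowering and read back from the
lowered module by the AOT and graph executor codegens. The fragment below is an illustrative
sketch of that round trip, not part of the series; the function name AttachAndRecoverMainFuncInfo
and the comment placeholders are assumptions, while WithAttr, GetAttr<backend::FunctionInfo> and
backend::FunctionInfo (from src/relay/backend/utils.h) are used the same way as in the diffs above.

    // Illustrative sketch only; assumes it lives next to the code above in
    // src/relay/backend/ so that backend::FunctionInfo is available.
    #include <tvm/ir/attrs.h>
    #include <tvm/ir/module.h>
    #include <tvm/runtime/logging.h>

    #include "./utils.h"  // backend::FunctionInfo

    namespace tvm {
    namespace relay {

    IRModule AttachAndRecoverMainFuncInfo(IRModule mod, backend::FunctionInfo func_info) {
      // Attach the metadata before lowering so it travels with the module as an attribute.
      mod = WithAttr(mod, "main_func_info", func_info);

      // ... LowerTEPass and the other lowering passes would run on `mod` here ...

      // Read the attribute back the way the executor codegens do after lowering.
      Optional<backend::FunctionInfo> main_func_info =
          mod->GetAttr<backend::FunctionInfo>("main_func_info");
      ICHECK(main_func_info) << "\"main_func_info\" should have been attached before lowering";
      return mod;
    }

    }  // namespace relay
    }  // namespace tvm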