[Relay] Remove memory planning from LowerTEPass #8974

Merged Sep 15, 2021 (18 commits)
Changes shown are from 8 of the 18 commits.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/ci-problem.md
@@ -19,4 +19,4 @@ Provide a link to the specific run that has failed.

### Flakiness

Have you seen this multiple times in this branch or in other branches?
Have you seen this multiple times in this branch or in other branches?
@@ -16,7 +16,7 @@
# specific language governing permissions and limitations
# under the License.
#
# Using this script we can reuse docker/install scripts to configure the reference
# Using this script we can reuse docker/install scripts to configure the reference
# virtual machine similar to CI QEMU setup.
#

2 changes: 1 addition & 1 deletion cmake/modules/contrib/EthosU.cmake
@@ -18,4 +18,4 @@
if(USE_ETHOSU)
file(GLOB ETHOSU_RELAY_CONTRIB_SRC src/relay/backend/contrib/ethosu/*)
list(APPEND COMPILER_SRCS ${ETHOSU_RELAY_CONTRIB_SRC})
endif(USE_ETHOSU)
endif(USE_ETHOSU)
2 changes: 1 addition & 1 deletion docs/langref/relay_pattern.rst
@@ -89,7 +89,7 @@ Or a convolution with a specific kernel size:
x = relay.var('x')
y = relay.var('y')
assert is_conv2d.match(relay.op.nn.conv2d(x, y, kernel_size=[3, 3]))



Matching an Optional Op
20 changes: 18 additions & 2 deletions include/tvm/ir/module.h
@@ -122,6 +122,7 @@ class IRModuleNode : public Object {
v->Visit("global_var_map_", &global_var_map_);
v->Visit("global_type_var_map_", &global_type_var_map_);
v->Visit("source_map", &source_map);
v->Visit("attrs", &attrs);
}

TVM_DLL bool SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const;
@@ -277,6 +278,12 @@
*/
TVM_DLL void Update(const IRModule& other);

/*!
* \brief Create a shallow copy of this IRModule.
* \returns The shallow copy of the IRModule.
*/
TVM_DLL IRModule ShallowCopy();

/*!
* \brief Import Relay code from the file at path.
* \param path The path of the Relay code to import.
@@ -348,12 +355,14 @@ class IRModule : public ObjectRef {
* \brief constructor
* \param functions Functions in the module.
* \param type_definitions Type definitions in the module.
* \param import_set Set of imported files in the module
* \param import_set Set of imported files in the module.
* \param map The module source map.
* \param attrs The module attributes.
*/
TVM_DLL explicit IRModule(Map<GlobalVar, BaseFunc> functions,
Map<GlobalTypeVar, TypeData> type_definitions = {},
std::unordered_set<String> import_set = {}, parser::SourceMap map = {});
std::unordered_set<String> import_set = {}, parser::SourceMap map = {},
DictAttrs attrs = {});

/*! \brief default constructor */
IRModule() : IRModule(Map<GlobalVar, BaseFunc>({})) {}
@@ -415,6 +424,13 @@
*/
TVM_DLL static IRModule FromText(const String& text, const String& source_path);

/*!
* \brief Create a shallow copy of an IRModule.
* \param mod The module to copy.
* \return The copied module.
*/
IRModule ShallowCopyIRModule(IRModule mod);

/*! \brief Declare the container type. */
using ContainerType = IRModuleNode;

11 changes: 10 additions & 1 deletion src/ir/module.cc
@@ -43,7 +43,8 @@ namespace tvm {

IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,
tvm::Map<GlobalTypeVar, TypeData> type_definitions,
std::unordered_set<String> import_set, parser::SourceMap source_map) {
std::unordered_set<String> import_set, parser::SourceMap source_map,
DictAttrs attrs) {
auto n = make_object<IRModuleNode>();
n->functions = std::move(functions);
n->type_definitions = std::move(type_definitions);
@@ -52,6 +53,7 @@ IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,
n->constructor_tag_map_ = {};
n->import_set_ = std::move(import_set);
n->source_map = source_map;
n->attrs = std::move(attrs);

for (const auto& kv : n->functions) {
// set global var map
@@ -72,6 +74,7 @@ IRModule::IRModule(tvm::Map<GlobalVar, BaseFunc> functions,

bool IRModuleNode::SEqualReduce(const IRModuleNode* other, SEqualReducer equal) const {
if (functions.size() != other->functions.size()) return false;
if (!equal(this->attrs, other->attrs)) return false;
for (const auto& kv : this->functions) {
if (!other->ContainGlobalVar(kv.first->name_hint)) return false;
if (!equal(kv.second, other->Lookup(kv.first->name_hint))) return false;
@@ -112,6 +115,7 @@ void IRModuleNode::SHashReduce(SHashReducer hash_reduce) const {
temp.emplace_back(kv.first->name_hint, kv.second);
}
reduce_temp();
hash_reduce(this->attrs);
}

bool IRModuleNode::ContainGlobalVar(const String& name) const {
@@ -361,6 +365,11 @@ void IRModuleNode::Update(const IRModule& mod) {
}
}

IRModule IRModuleNode::ShallowCopy() {
return IRModule(this->functions, this->type_definitions, this->Imports(), this->source_map,
this->attrs);
}

std::pair<IRModule, GlobalVar> IRModule::FromExprInContext(
const RelayExpr& expr, const tvm::Map<GlobalVar, BaseFunc>& global_funcs,
const tvm::Map<GlobalTypeVar, TypeData>& type_definitions,
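For orientation, here is a minimal sketch (not part of the diff) of how the new attrs constructor argument and IRModuleNode::ShallowCopy interact; the attribute key "example_key" and the wrapper function ShallowCopyExample are purely illustrative:

#include <tvm/ir/attrs.h>
#include <tvm/ir/module.h>

using namespace tvm;

void ShallowCopyExample() {
  // Module-level attributes now flow through the IRModule constructor.
  Map<String, ObjectRef> dict;
  dict.Set("example_key", String("example_value"));  // illustrative key
  IRModule mod(Map<GlobalVar, BaseFunc>({}), {}, {}, {}, DictAttrs(dict));

  // The shallow copy shares the function map, type definitions, imports,
  // source map, and attrs rather than deep-copying the functions.
  IRModule copy = mod->ShallowCopy();
  ICHECK(copy->attrs.defined());
  ICHECK_EQ(copy->attrs->dict.count("example_key"), 1);
}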
144 changes: 142 additions & 2 deletions src/relay/backend/aot_executor_codegen.cc
@@ -45,6 +45,13 @@ namespace tvm {
namespace relay {
namespace backend {

struct EnumClassHash {
Contributor: Could you move EnumClassHash into "utils.h" and remove this definition of it and the definition in the te compiler?

Contributor Author: done

template <typename T>
std::size_t operator()(T t) const {
return static_cast<std::size_t>(t);
}
};
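Following the review exchange above, a minimal sketch (assumed layout, not taken from this diff) of what the shared definition in src/relay/backend/utils.h could look like once the AOT codegen and the TE compiler both include it:

// src/relay/backend/utils.h (sketch)
#include <cstddef>

namespace tvm {
namespace relay {
namespace backend {

/*! \brief Hash functor so enum classes such as DLDeviceType can key unordered containers. */
struct EnumClassHash {
  template <typename T>
  std::size_t operator()(T t) const {
    return static_cast<std::size_t>(t);
  }
};

}  // namespace backend
}  // namespace relay
}  // namespace tvm

Call sites in both codegens would then spell the maps as, for example, std::unordered_map<DLDeviceType, int, backend::EnumClassHash>.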

using IntegerArray = Array<Integer>;
using StorageMap =
std::unordered_map<Expr, StorageInfo, runtime::ObjectPtrHash, runtime::ObjectPtrEqual>;
@@ -277,6 +284,132 @@ class AOTExecutorCodegen : public MixedModeVisitor {
}
}

/*!
 * \brief Update the "main" control function's metadata.
 *
 * \param mod The module, which must contain only the "main" function.
 * \param targets Map of targets.
 * \param storage_info_map Storage info for each expression in "main".
 * \return The FunctionInfo (workspace, io, and constant sizes) for "main".
 */
backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMap targets,
Map<Expr, backend::StorageInfo> storage_info_map) {
CHECK_EQ(mod->functions.size(), 1)
<< "There should only be one function in the module passed to UpdateMainWorkspaceSize";
Function func = Downcast<Function>(mod->Lookup("main"));

// This is a Map<device, Map<storage_id, size>>
std::unordered_map<DLDeviceType, std::unordered_map<int, int>, EnumClassHash> sid_workspace;
// This is a Map<device, size_of_inputs_and_outputs>
std::unordered_map<DLDeviceType, int, EnumClassHash> device_io;
// This is a Map<device, size_of_constants>
std::unordered_map<DLDeviceType, int, EnumClassHash> device_consts;

// Initialize the mapping from all storage identifiers to workspace sizes,
// the amount of device io, and the device constants.
for (const auto& kv : storage_info_map) {
backend::StorageInfo storage_info = kv.second;
std::vector<int64_t> storage_ids = storage_info->storage_ids;
std::vector<DLDeviceType> devices = storage_info->device_types;

CHECK_EQ(storage_ids.size(), devices.size());
for (uint32_t i = 0; i < devices.size(); i++) {
sid_workspace[devices[i]][storage_ids[i]] = 0;
device_io[devices[i]] = 0;
device_consts[devices[i]] = 0;
}
}

// Iterate the storage map to compute all the tensor sizes in the program.
// There are 3 cases in this code:
//
// First we need to compute the sizes of all
// inline constants.
//
// Second we compute the size of any bound variable as these are input and output
// sizes of the program.
//
// Finally for all other expressions we check which storage identifier they have
// been assigned and we compute the maximal size of the storage, as tensors can
// share storage with other tensors which are the same size or larger.
//
// In this final case there is only one allocation for all tensors which share storage,
// and its size is the maximal size of all tensors which were assigned to it.
for (const auto& kv : storage_info_map) {
Expr expr = kv.first;
int64_t size_bytes = backend::CalculateRelayExprSizeBytes(expr->checked_type());
backend::StorageInfo storage_info = kv.second;
std::vector<int64_t> storage_ids = storage_info->storage_ids;
std::vector<DLDeviceType> devices = storage_info->device_types;

if (expr->IsInstance<ConstantNode>()) {
for (const auto& dev : devices) {
device_consts[dev] += size_bytes;
}
continue;
} else if (expr->IsInstance<VarNode>() || expr.same_as(func->body)) {
CHECK_GE(devices.size(), 1) << "must be at least one device";
for (const auto& dev : devices) {
device_io[dev] += size_bytes;
}
continue;
}

// TODO(@electriclilies): This code is never being called which means sid_workspace is not
// updated.. This means that storage info is probably not being created correctly. Or is not
// equivalent to what was here previously
for (uint32_t i = 0; i < storage_ids.size(); i++) {
// Here we record the largest size of the tensors
// that share the same storage id, because a storage_id can
// be shared between multiple tensors that are not live simultaneously.
if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) {
sid_workspace[devices[i]][storage_ids[i]] = size_bytes;
}
}
}

// This is a Map<device, workspace_size>
std::unordered_map<DLDeviceType, int, EnumClassHash> device_workspace;
// Once we know the sizes of sids, we need to accumulate per device
for (const auto& dev_sid_size : sid_workspace) {
auto dev = dev_sid_size.first;
device_workspace[dev] = 0;
for (const auto& sid_size : dev_sid_size.second) {
device_workspace[dev] += sid_size.second;
}
}

Map<Target, Integer> workspace_sizes;
Map<Target, Integer> io_sizes;
Map<Target, Integer> constant_sizes;
Map<Target, tir::PrimFunc> tir_primfuncs;
Map<Target, Function> relay_primfuncs;

// Initialize all target workspaces to zero
for (const auto& kv : targets) {
auto tgt = kv.second;
workspace_sizes.Set(tgt, 0);
}

for (const auto& dev_and_size : device_workspace) {
auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets);
workspace_sizes.Set(tgt, dev_and_size.second);
relay_primfuncs.Set(tgt, func);
}
for (const auto& dev_and_size : device_io) {
auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets);
io_sizes.Set(tgt, dev_and_size.second);
}

for (const auto& dev_and_size : device_consts) {
auto tgt = tec::GetTargetFromInteger(dev_and_size.first, targets);
constant_sizes.Set(tgt, dev_and_size.second);
}

return backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs,
relay_primfuncs);
}
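To make the folding above concrete, here is a small self-contained sketch (toy sizes, not taken from the PR) showing that tensors sharing a storage id contribute only the largest of their sizes, while distinct ids on the same device are summed:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <unordered_map>

int main() {
  // device -> (storage_id -> largest tensor size seen for that id), in bytes
  std::unordered_map<int, std::unordered_map<int, int64_t>> sid_workspace;

  auto record = [&](int dev, int sid, int64_t size_bytes) {
    int64_t& slot = sid_workspace[dev][sid];
    slot = std::max(slot, size_bytes);
  };

  // Two tensors share storage id 0 on device 1: only the larger one counts.
  record(1, 0, 256);
  record(1, 0, 1024);
  // A third tensor has its own storage id 1.
  record(1, 1, 512);

  // Per-device workspace is the sum over its storage ids.
  std::unordered_map<int, int64_t> device_workspace;
  for (const auto& dev_sid_size : sid_workspace) {
    int64_t total = 0;
    for (const auto& sid_size : dev_sid_size.second) total += sid_size.second;
    device_workspace[dev_sid_size.first] = total;
  }

  std::cout << device_workspace[1] << std::endl;  // 1536 = max(256, 1024) + 512
  return 0;
}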

/*!
 * \brief Call a function with a given name
 */
@@ -583,8 +716,15 @@ class AOTExecutorCodegen : public MixedModeVisitor {
// performing the preexisting AOT executor code generation phase.
IRModule mod = IRModule::FromExpr(func);

backend::FunctionInfo func_info;

if (memory_plan.defined()) {
// TODO(@electriclilies, @jroesch): remove UpdateMainWorkspaceSize
func_info = UpdateMainWorkspaceSize(mod, targets_, memory_plan->expr_to_storage_info);
}
Contributor: Can you just put the func_info on the mod here before passing the module into LowerTE? Then you don't need to re-extract it later, and also the logic surrounding func_info is all in one place. (LowerTEPass should preserve all attributes on modules passed into it)
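A hedged sketch of the direction the reviewer suggests: stash the computed FunctionInfo on the module's new attrs field before lowering and read it back afterwards with GetAttr, as this codegen already does for "main_func_info" and "external_mods". The helper name WithModuleAttr is hypothetical and only illustrates writing into IRModuleNode::attrs:

#include <tvm/ir/attrs.h>
#include <tvm/ir/module.h>

namespace tvm {
namespace relay {
namespace backend {

// Hypothetical helper: attach one ObjectRef-valued attribute to a module by
// rebuilding the DictAttrs dictionary that IRModuleNode now carries.
inline IRModule WithModuleAttr(IRModule mod, String key, ObjectRef value) {
  Map<String, ObjectRef> dict;
  if (mod->attrs.defined()) {
    dict = mod->attrs->dict;
  }
  dict.Set(key, value);
  mod->attrs = DictAttrs(dict);
  return mod;
}

}  // namespace backend
}  // namespace relay
}  // namespace tvm

The codegen could then call mod = WithModuleAttr(mod, "main_func_info", func_info); before LowerTEPass and later recover it with lowered_mod->GetAttr<backend::FunctionInfo>("main_func_info"), keeping the func_info logic in one place.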


IRModule lowered_mod =
LowerTEPass(targets_, device_context_map, memory_plan, mod_name, [this](Function func) {
LowerTEPass(targets_, device_context_map, mod_name, [this](Function func) {
// We need to maintain the constant map for external
// functions so we pass this processing function which
// allows us to process each function as we lower it.
@@ -676,7 +816,7 @@ class AOTExecutorCodegen : public MixedModeVisitor {

Optional<Array<tvm::runtime::Module>> external_modules =
lowered_mod->GetAttr<Array<tvm::runtime::Module>>("external_mods");
ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point.";
ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point.";

// This is the point where we separate the functions in the module by target
ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod);
15 changes: 6 additions & 9 deletions src/relay/backend/graph_executor_codegen.cc
@@ -222,7 +222,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
}

IRModule lowered_mod =
LowerTEPass(targets_, device_context_map, memory_plan_, mod_name_, [this](Function func) {
LowerTEPass(targets_, device_context_map, mod_name_, [this](Function func) {
// We need to maintain the constant map for external
// functions so we pass this processing function which
// allows us to process each function as we lower it.
@@ -241,26 +241,23 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
ICHECK(main_func_info) << "The attribute \"main_func_info\" should be set at this point.";
function_metadata_.Set(runtime::symbol::tvm_module_main, main_func_info.value());

// Get only the Relay functions out of the lowered module so we can run type inference on them
IRModule main_module = tec::GetMainModule(lowered_mod);
main_module = relay::transform::InferType()(main_module);
relay::Function main_func = Downcast<relay::Function>(main_module->Lookup("main"));
Function lowered_main_func = Downcast<Function>(lowered_mod->Lookup("main"));

// Now that we have lowered all operators to TIR code, we can proceed with compilation.
//
// We need to unfortunately re-plan as the previous results have been invalidated by lowering
// we will fix this in future refactors.
memory_plan_ = GraphPlanMemory(main_func);
memory_plan_ = GraphPlanMemory(lowered_main_func);

// The graph planner also can not handle planning calls to global variables so we must remap

// First we convert all the parameters into input nodes.
for (auto param : main_func->params) {
for (auto param : lowered_main_func->params) {
auto node_ptr = GraphInputNode::make_node_ptr(param->name_hint(), GraphAttrs());
var_map_[param.get()] = AddNode(node_ptr, param);
}

heads_ = VisitExpr(main_func->body);
heads_ = VisitExpr(lowered_main_func->body);
std::ostringstream os;

dmlc::JSONWriter writer(&os);
@@ -277,7 +274,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<

Optional<Array<tvm::runtime::Module>> external_modules =
lowered_mod->GetAttr<Array<tvm::runtime::Module>>("external_mods");
ICHECK(external_modules) << "Attribute \"external_modules\" should be set at this point.";
ICHECK(external_modules) << "Attribute \"external_mods\" should be set at this point.";

// This is the point where we separate the functions in the module by target
ret.lowered_funcs = tec::GetPerTargetModules(lowered_mod);