Skip to content

Commit d709b14

Browse files
committed
[microNPU] enable USMP
* Fixing unit tests
* Added a guard for USMP bufferinfo extraction to ignore non-global allocates
* Fixed export_model_library_format to use target_kind type

Change-Id: I9c6c90d8787c39697fca24af299f8309f40d3743
1 parent c1edeb8 commit d709b14

File tree

6 files changed

+111
-73
lines changed

6 files changed

+111
-73
lines changed

python/tvm/micro/model_library_format.py

Lines changed: 51 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -177,37 +177,26 @@ def _build_function_memory_map(function_metadata):
177177
"""
178178
device_max_workspace = dict()
179179
main_func_metadata = function_metadata[MAIN_FUNC_NAME_STR]
180-
main_targets = dict(main_func_metadata.workspace_sizes).keys()
181-
from tvm.driver import tvmc # pylint: disable=import-outside-toplevel
182-
183-
external_codegens = tvmc.composite_target.get_codegen_names()
184180
func_entries = []
185181
target_local_entries = dict()
186-
for main_target in main_targets:
187-
device_max_workspace[main_target] = 0
188-
for func_name, finfo in function_metadata.items():
189-
if func_name == MAIN_FUNC_NAME_STR:
190-
continue
191-
target_local_entries[func_name] = list()
192182

193-
for func_name, finfo in function_metadata.items():
194-
# Skip a few unsupported cases:
195-
# 1. The main function metadata is exported elsewhere.
196-
# 2. BYOC operator implementations do not currently export useful FunctionInfo.
197-
if func_name == MAIN_FUNC_NAME_STR or not finfo.tir_primfuncs:
198-
continue
199-
if main_target in finfo.workspace_sizes.keys():
200-
workspace_size = finfo.workspace_sizes[main_target]
201-
target_entry = {
202-
"device": int(main_target.kind.device_type),
203-
"workspace_size_bytes": int(workspace_size),
204-
}
205-
target_local_entries[func_name].append(target_entry)
206-
if workspace_size > device_max_workspace.get(main_target, 0):
207-
device_max_workspace[main_target] = workspace_size
208-
# TODO(Mousius) - Remove this massive hack when Targets are unified
209-
if main_target.kind.name in external_codegens:
210-
device_max_workspace[main_target] += int(workspace_size)
183+
for func_name, finfo in function_metadata.items():
184+
# Skip a few unsupported cases:
185+
# 1. The main function metadata is exported elsewhere.
186+
# 2. BYOC operator implementations do not currently export useful FunctionInfo.
187+
if func_name == MAIN_FUNC_NAME_STR or not finfo.tir_primfuncs:
188+
continue
189+
if func_name not in target_local_entries.keys():
190+
target_local_entries[func_name] = list()
191+
for target in dict(finfo.workspace_sizes).keys():
192+
workspace_size = finfo.workspace_sizes[target]
193+
target_entry = {
194+
"device": int(target.kind.device_type),
195+
"workspace_size_bytes": int(workspace_size),
196+
}
197+
target_local_entries[func_name].append(target_entry)
198+
if workspace_size >= device_max_workspace.get(int(target.kind.device_type), 0):
199+
device_max_workspace[int(target.kind.device_type)] = workspace_size
211200

212201
for func_name, target_entries_ in target_local_entries.items():
213202
func_entry = {
@@ -216,32 +205,46 @@ def _build_function_memory_map(function_metadata):
216205
}
217206
func_entries.append(func_entry)
218207

219-
target_main_entries = list()
220-
for main_target in main_targets:
221-
main_func_local_workspace = main_func_metadata.workspace_sizes[main_target]
222-
main_func_constants = (
223-
main_func_metadata.constant_sizes[main_target]
224-
if main_target in main_func_metadata.constant_sizes.keys()
225-
else 0
208+
target_main_entries = dict()
209+
210+
def _create_empty_entry(target_device_type):
211+
return {
212+
"device": int(target_device_type),
213+
"workspace_size_bytes": 0,
214+
"constants_size_bytes": 0,
215+
"io_size_bytes": 0,
216+
}
217+
218+
for target in dict(main_func_metadata.workspace_sizes).keys():
219+
main_func_local_workspace = main_func_metadata.workspace_sizes[target]
220+
target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
221+
int(target.kind.device_type)
226222
)
227-
main_func_io = (
228-
main_func_metadata.io_sizes[main_target]
229-
if main_target in main_func_metadata.io_sizes.keys()
230-
else 0
223+
target_main_entries[int(target.kind.device_type)]["workspace_size_bytes"] = int(
224+
device_max_workspace.get(int(target.kind.device_type), 0)
225+
) + int(main_func_local_workspace)
226+
227+
for target in dict(main_func_metadata.constant_sizes).keys():
228+
if int(target.kind.device_type) not in target_main_entries.keys():
229+
target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
230+
int(target.kind.device_type)
231+
)
232+
target_main_entries[int(target.kind.device_type)]["constants_size_bytes"] = int(
233+
main_func_metadata.constant_sizes[target]
231234
)
232-
target_main_entries.append(
233-
{
234-
"device": int(main_target.kind.device_type),
235-
"workspace_size_bytes": int(device_max_workspace[main_target])
236-
+ int(main_func_local_workspace),
237-
"constants_size_bytes": int(main_func_constants),
238-
"io_size_bytes": int(main_func_io),
239-
}
235+
236+
for target in dict(main_func_metadata.io_sizes).keys():
237+
if int(target.kind.device_type) not in target_main_entries.keys():
238+
target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
239+
int(target.kind.device_type)
240+
)
241+
target_main_entries[int(target.kind.device_type)]["io_size_bytes"] = int(
242+
main_func_metadata.io_sizes[target]
240243
)
241244

242245
ret = {
243246
"operator_functions": func_entries,
244-
"main": target_main_entries,
247+
"main": list(target_main_entries.values()),
245248
}
246249
return ret
247250

src/tir/usmp/analysis/extract_buffer_info.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@
3636

3737
#include <stack>
3838

39+
#include "../../../runtime/thread_storage_scope.h"
40+
3941
namespace tvm {
4042
namespace tir {
4143
namespace usmp {
@@ -257,14 +259,14 @@ void BufferInfoExtractor::RecordAllocateNodeInfo(const AllocateNode* op) {
257259
void BufferInfoExtractor::VisitStmt_(const AllocateNode* op) {
258260
ScopeInfo& current_scope_info = scope_stack_.top();
259261
const auto& type = Downcast<PointerType>(op->buffer_var->type_annotation);
260-
const auto& storage_scope = type->storage_scope;
262+
const auto& storage_scope = runtime::StorageScope::Create(type->storage_scope);
261263

262264
// If the allocate is in a for loop, USMP currently only looks at serial for loops.
263265
// If its not a serial for loop, then memory planner will omit them in the current memory planning
264266
// process leaving them to as tir.allocate nodes for codegen. Additionally, the USMP can only work
265267
// with buffers that have global storage_scope
266268

267-
if (storage_scope == "global") {
269+
if (storage_scope.rank == runtime::StorageRank::kGlobal) {
268270
if (!current_scope_info.for_loop.defined()) {
269271
RecordAllocateNodeInfo(op);
270272
} else if (current_scope_info.for_loop.defined() &&

src/tir/usmp/transform/assign_pool_info.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,13 @@ class PoolInfoAssigner : public StmtExprMutator {
7676

7777
usmp::PoolInfo PoolInfoAssigner::CreateDefaultMemoryPool(const tvm::IRModule& module) {
7878
Map<Target, String> target_access;
79+
tir::PrimFunc tir_main_func =
80+
Downcast<tir::PrimFunc>(module->Lookup(::tvm::runtime::symbol::tvm_run_func_suffix));
81+
Target target_host = tir_main_func->GetAttr<Target>(tvm::attr::kTarget).value();
7982
for (const auto& kv : module->functions) {
8083
BaseFunc func = kv.second;
8184
Optional<Target> target = func->GetAttr<Target>(tvm::attr::kTarget);
82-
ICHECK(target) << "main function does not have a target attr";
83-
target_access.Set(target.value(), usmp::kTargetPoolReadWriteAccess);
85+
target_access.Set(target.value_or(target_host), usmp::kTargetPoolReadWriteAccess);
8486
}
8587
return usmp::PoolInfo("global_workspace", target_access, usmp::kUnrestrictedPoolSizeHint,
8688
Bool(true));

tests/python/contrib/test_ethosu/infra.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ def deserialize_command_stream(blob):
189189
return cmms
190190

191191

192-
def create_test_runner(accel="ethos-u55-256"):
192+
def create_test_runner(accel="ethos-u55-256", enable_usmp=True):
193193
file_dir = os.path.dirname(os.path.abspath(__file__))
194194
test_root = os.path.join(file_dir, "reference_system")
195195
_, ethosu_variant, ethosu_macs = accel.split("-")
@@ -215,13 +215,15 @@ def create_test_runner(accel="ethos-u55-256"):
215215
"relay.ext.ethos-u.options": {
216216
"accelerator_config": accel,
217217
},
218-
"tir.usmp.enable": True,
218+
"tir.usmp.enable": enable_usmp,
219219
},
220220
)
221221

222222

223-
def build_source(module, inputs, outputs, accel="ethos-u55-256", output_tolerance=0):
224-
test_runner = create_test_runner(accel)
223+
def build_source(
224+
module, inputs, outputs, accel="ethos-u55-256", output_tolerance=0, enable_usmp=True
225+
):
226+
test_runner = create_test_runner(accel, enable_usmp)
225227
return compile_models(
226228
models=AOTTestModel(
227229
module=module,

tests/python/contrib/test_ethosu/test_networks.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@
3737
ACCEL_TYPES = ["ethos-u55-256", "ethos-u55-128", "ethos-u55-64", "ethos-u55-32"]
3838

3939

40-
def test_forward_mobilenet_v1(accel_type="ethos-u55-256"):
40+
@pytest.mark.parametrize("enable_usmp", [True, False])
41+
def test_forward_mobilenet_v1(enable_usmp, accel_type="ethos-u55-256"):
4142
"""Test the Mobilenet V1 TF Lite model."""
4243
np.random.seed(23)
4344
tflite_model_file = tf_testing.get_workload_official(
@@ -59,7 +60,7 @@ def test_forward_mobilenet_v1(accel_type="ethos-u55-256"):
5960

6061
mod = partition_for_ethosu(relay_mod, params)
6162
compiled_models = infra.build_source(
62-
mod, input_data, output_data, accel_type, output_tolerance=10
63+
mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp
6364
)
6465
infra.verify_source(compiled_models, accel_type)
6566

tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ def populate_ethosu_copy_calls(stmt):
645645
{
646646
"src": "placeholder_5",
647647
"dest": "placeholder_d_global",
648-
"length": 8,
648+
"length": 32,
649649
},
650650
],
651651
},
@@ -851,24 +851,45 @@ def _check_buffer(address, region, length, buffer_var):
851851
length, dtype=buffer_dtype
852852
)
853853
elif buffer_type == tir_to_cs_translator.BufferType.scratch:
854-
shape = list(buffer_info[buffer_var].shape)
855-
assert length == np.prod(shape)
856-
assert address < scratch_size
854+
assert address < tvmbaw_workspace_size
857855

858-
size_in_bytes = int(np.prod(shape)) * dtype_bytes
856+
size_in_bytes = allocate_node_sizes[buffer_var]
859857
# Every buffer is adjusted to align to 16 bytes
860858
size_in_bytes = util.round_up(size_in_bytes, 16)
861-
assert address + size_in_bytes <= scratch_size
859+
assert address + size_in_bytes <= tvmbaw_workspace_size
862860
# The scratch area should not be used by any other buffer
863-
assert not scratch_mask[address : address + size_in_bytes].any()
861+
assert not tvmbaw_workspace_mask[address : address + size_in_bytes].any()
864862
# The scratch area is marked as used
865-
scratch_mask[address : address + size_in_bytes] = np.ones(size_in_bytes, dtype="uint8")
863+
tvmbaw_workspace_mask[address : address + size_in_bytes] = np.ones(
864+
size_in_bytes, dtype="uint8"
865+
)
866866
elif buffer_type == tir_to_cs_translator.BufferType.input:
867867
assert address == 0
868868
else:
869869
assert buffer_type == tir_to_cs_translator.BufferType.output
870870
assert address == 0
871871

872+
def _get_allocate_node_sizes(mod):
873+
# There should only be a single function
874+
assert len(mod.functions.items()) == 1
875+
primfunc = mod.functions.items()[0][1]
876+
_allocate_node_sizes = dict()
877+
878+
def analyze_remaining_allocates(stmt):
879+
if isinstance(stmt, tvm.tir.stmt.Allocate):
880+
allocate = stmt
881+
pointer_type = allocate.buffer_var.type_annotation
882+
storage_scope = pointer_type.storage_scope
883+
if storage_scope == "global":
884+
dtype_bytes = np.iinfo(np.dtype(allocate.dtype)).bits // 8
885+
size_in_bytes = int(dtype_bytes * np.prod(list(allocate.extents)))
886+
# Every memory address the NPU access have to be 16 byte aligned
887+
size_in_bytes = util.round_up(size_in_bytes, 16)
888+
_allocate_node_sizes[allocate.buffer_var] = size_in_bytes
889+
890+
tvm.tir.stmt_functor.post_order_visit(primfunc.body, analyze_remaining_allocates)
891+
return _allocate_node_sizes
892+
872893
def verify(npu_ops):
873894
"""This wrapper verifies the allocated addresses matches with original tir buffers"""
874895
checked_buffers = set()
@@ -933,22 +954,29 @@ def check_buffer(address, region, length, buffer_var):
933954
tir_mod = test_case["tir_module"]
934955
tir_mod["main"] = tir_mod["main"].with_attr("target", tvm.target.Target("ethos-u"))
935956
tir_mod = tvm.tir.transform.MakeUnpackedAPI()(tir_mod)
957+
candidate_regions_for_scratch = [5, 2, 1]
958+
(
959+
scratch_region_map,
960+
tvmbaw_workspace_size,
961+
_,
962+
) = tir_to_cs_translator.analyze_scratch_memory_acesses(
963+
tir_mod, candidate_regions_for_scratch
964+
)
965+
allocate_node_sizes = _get_allocate_node_sizes(tir_mod)
936966
buffer_info = tir_to_cs_translator.extract_buffer_info(tir_mod, test_case["param_dict"])
937967
extern_calls = extract_call_extern_list(tir_mod)
938968
_npu_ops = list()
939969
for extern_call in extern_calls:
940970
_npu_ops.append(tir_to_cs_translator.translate_ethosu_tir_call_extern(extern_call))
941971
npu_op_tir_buffers = collect_tir_buffer_info(_npu_ops)
942-
(
943-
_npu_ops,
944-
constant_hex_string,
945-
scratch_size,
946-
) = tir_to_cs_translator.assign_addresses(buffer_info, _npu_ops)
947-
scratch_mask = np.zeros(scratch_size, dtype="uint8")
972+
(_npu_ops, constant_hex_string) = tir_to_cs_translator.assign_addresses(
973+
buffer_info, _npu_ops, scratch_region_map
974+
)
975+
tvmbaw_workspace_mask = np.zeros(tvmbaw_workspace_size, dtype="uint8")
948976
constant_tensor_read_mask = np.zeros(len(constant_hex_string) // 2, dtype="uint8")
949977
verify(_npu_ops)
950978
# This will be only 1 if all allocated scratch is used.
951-
assert np.prod(scratch_mask) == 1
979+
assert np.prod(tvmbaw_workspace_mask) == 1
952980
# This will be only 1 if all constant tensors is read at least once.
953981
assert np.prod(constant_tensor_read_mask) == 1
954982

0 commit comments

Comments (0)