Skip to content

Commit 4e60749

Browse files
committed
[microNPU] Refactor base address determination to codegen
* Renaming runtime_allocate to be scratch again. * Docstring adjustments. Change-Id: Ife8baf97f3dc9348718bd03e62549169a466fc34
1 parent 26ea7dc commit 4e60749

File tree

3 files changed

+37
-36
lines changed

3 files changed

+37
-36
lines changed

python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,15 @@ class BufferType(Enum):
3636

3737
constant = auto()
3838
input_or_output = auto()
39-
runtime_allocate = auto()
39+
scratch = auto()
4040
input = auto()
4141
output = auto()
4242
shram = auto()
4343

4444

4545
_REGION_MAP = {
4646
BufferType.constant: 0,
47-
BufferType.runtime_allocate: 1,
47+
BufferType.scratch: 1,
4848
BufferType.input: 3,
4949
BufferType.output: 4,
5050
BufferType.shram: int((1 << 8) | (3 << 0)),
@@ -103,23 +103,23 @@ def translate(tir_module, params):
103103
A hex string of the bytes that includes concat'd
104104
encoded weights, encoded biases and scales.
105105
base_addresses : List[util.BaseAddress]
106-
base addresses
106+
base addresses to be used by the driver
107107
"""
108108

109109
buffer_info = extract_buffer_info(tir_module, params)
110110
call_extern_list = extract_call_extern_list(tir_module)
111111
_npu_ops = list()
112112
for call_extern in call_extern_list:
113113
_npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
114-
_npu_ops, constant_data, runtime_allocation_size = assign_addresses(buffer_info, _npu_ops)
114+
_npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, _npu_ops)
115115
base_addresses = extract_param_base_addresses(tir_module, buffer_info)
116-
if runtime_allocation_size > 0:
116+
if scratch_size > 0:
117117
base_addresses.append(
118118
util.BaseAddress(
119-
"runtime_allocation",
119+
"scratch",
120120
None,
121-
_REGION_MAP[BufferType.runtime_allocate],
122-
runtime_allocation_size,
121+
_REGION_MAP[BufferType.scratch],
122+
scratch_size,
123123
True,
124124
)
125125
)
@@ -248,7 +248,7 @@ def populate_allocate_buffer_info(stmt):
248248
if storage_scope == "local":
249249
buffer_type = BufferType.shram
250250
else:
251-
buffer_type = BufferType.runtime_allocate
251+
buffer_type = BufferType.scratch
252252
buffer_info[allocate.buffer_var] = BufferInfo(
253253
None,
254254
allocate.extents,
@@ -280,7 +280,7 @@ def assign_addresses(buffer_info, npu_ops):
280280
A list of Vela NpuOps with addresses within scratch and constant buffers
281281
constant_tensor : NDArray
282282
A unified constant data array of uint8 as the constant buffer
283-
runtime_allocation_size : int
283+
scratch_size : int
284284
The size of the scratch tensor.
285285
"""
286286

@@ -327,7 +327,7 @@ def classify_io(buffer):
327327

328328
raise ValueError(f"Unused IO : {buffer} in tir module.")
329329

330-
runtime_allocation_size = 0
330+
scratch_size = 0
331331
constant_hex_data = []
332332
total_constant_len = 0
333333
buffer_addresses = dict()
@@ -352,7 +352,9 @@ def classify_io(buffer):
352352
assert buffer_type in (BufferType.input, BufferType.output)
353353
address = 0
354354
buffer_addresses[_buffer] = (address, buffer_type)
355-
buffer_info[_buffer] = BufferInfo(None, info.dtype, info.dtype, buffer_type)
355+
buffer_info[_buffer] = BufferInfo(
356+
values=None, shape=info.dtype, dtype=info.dtype, btype=buffer_type
357+
)
356358
elif info.btype == BufferType.shram:
357359
accl_config = util.get_accelerator_config()
358360
arch_config = get_accelerator_arch_config(accl_config)
@@ -363,9 +365,9 @@ def classify_io(buffer):
363365
size_in_bytes = int(dtype_bytes * np.prod(list(info.shape)))
364366
# Every memory address the NPU accesses has to be 16 byte aligned
365367
size_in_bytes = util.round_up(size_in_bytes, 16)
366-
assert info.btype == BufferType.runtime_allocate
367-
address = runtime_allocation_size
368-
runtime_allocation_size += size_in_bytes
368+
assert info.btype == BufferType.scratch
369+
address = scratch_size
370+
scratch_size += size_in_bytes
369371
buffer_addresses[_buffer] = (address, info.btype)
370372

371373
for npu_op in npu_ops:
@@ -382,7 +384,7 @@ def classify_io(buffer):
382384
return (
383385
npu_ops,
384386
constant_data,
385-
runtime_allocation_size,
387+
scratch_size,
386388
)
387389

388390

src/relay/backend/contrib/ethosu/utils.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ namespace ethosu {
3636

3737
/*!
3838
* \brief Base addresses are input pointers to
39-
* the driver that get accessed by produced
39+
* the driver that get accessed by the command stream
40+
* using offsets to read/write data.
4041
*/
4142
struct BaseAddressNode : public Object {
4243
/*! \brief The identifier, usually it is the param name of the PrimFunc that gets lowered */

tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -230,12 +230,12 @@ def test_buffer_info_extraction():
230230
"ethosu_conv2d_2": (
231231
[1024],
232232
"uint8",
233-
tir_to_cs_translator.BufferType.runtime_allocate,
233+
tir_to_cs_translator.BufferType.scratch,
234234
),
235235
"ethosu_conv2d_3": (
236236
[2048],
237237
"uint8",
238-
tir_to_cs_translator.BufferType.runtime_allocate,
238+
tir_to_cs_translator.BufferType.scratch,
239239
),
240240
},
241241
},
@@ -776,15 +776,15 @@ def _check_buffer(address, region, length, buffer_var):
776776
original tir buffers.
777777
- If its constant, this will check
778778
the slice in the constant tensor has the values.
779-
- If its runtime_allocation, this will check
780-
the slice is within runtime_allocation and does not have conflicts
781-
with other runtime_allocation tensors.
779+
- If its scratch, this will check
780+
the slice is within scratch and does not have conflicts
781+
with other scratch tensors.
782782
- If its input/output, this will check the
783783
address is zero
784784
"""
785785
inverse_region_map = {
786786
0: tir_to_cs_translator.BufferType.constant,
787-
1: tir_to_cs_translator.BufferType.runtime_allocate,
787+
1: tir_to_cs_translator.BufferType.scratch,
788788
3: tir_to_cs_translator.BufferType.input,
789789
4: tir_to_cs_translator.BufferType.output,
790790
}
@@ -804,21 +804,19 @@ def _check_buffer(address, region, length, buffer_var):
804804
constant_tensor_read_mask[address : address + length] = np.ones(
805805
length, dtype=buffer_dtype
806806
)
807-
elif buffer_type == tir_to_cs_translator.BufferType.runtime_allocate:
807+
elif buffer_type == tir_to_cs_translator.BufferType.scratch:
808808
shape = list(buffer_info[buffer_var].shape)
809809
assert length == np.prod(shape)
810-
assert address < runtime_allocation_size
810+
assert address < scratch_size
811811

812812
size_in_bytes = int(np.prod(shape)) * dtype_bytes
813813
# Every buffer is adjusted to align to 16 bytes
814814
size_in_bytes = util.round_up(size_in_bytes, 16)
815-
assert address + size_in_bytes <= runtime_allocation_size
816-
# The runtime_allocation area should not be used by anyother buffer
817-
assert not runtime_allocation_mask[address : address + size_in_bytes].any()
818-
# The runtime_allocation area is marked as used
819-
runtime_allocation_mask[address : address + size_in_bytes] = np.ones(
820-
size_in_bytes, dtype="uint8"
821-
)
815+
assert address + size_in_bytes <= scratch_size
816+
# The scratch area should not be used by any other buffer
817+
assert not scratch_mask[address : address + size_in_bytes].any()
818+
# The scratch area is marked as used
819+
scratch_mask[address : address + size_in_bytes] = np.ones(size_in_bytes, dtype="uint8")
822820
elif buffer_type == tir_to_cs_translator.BufferType.input:
823821
assert address == 0
824822
else:
@@ -898,13 +896,13 @@ def check_buffer(address, region, length, buffer_var):
898896
(
899897
_npu_ops,
900898
constant_hex_string,
901-
runtime_allocation_size,
899+
scratch_size,
902900
) = tir_to_cs_translator.assign_addresses(buffer_info, _npu_ops)
903-
runtime_allocation_mask = np.zeros(runtime_allocation_size, dtype="uint8")
901+
scratch_mask = np.zeros(scratch_size, dtype="uint8")
904902
constant_tensor_read_mask = np.zeros(len(constant_hex_string) // 2, dtype="uint8")
905903
verify(_npu_ops)
906-
# This will be only 1 if all allocated runtime_allocation is used.
907-
assert np.prod(runtime_allocation_mask) == 1
904+
# This will be only 1 if all allocated scratch is used.
905+
assert np.prod(scratch_mask) == 1
908906
# This will be only 1 if all constant tensors is read at least once.
909907
assert np.prod(constant_tensor_read_mask) == 1
910908

0 commit comments

Comments
 (0)