feat: enhance profiling and benchmarking (#3012)

rkuester · web-flow · commit b2f27186d243 · 2024-12-14T19:00:55.000Z
Modify `PrintMemoryPlan` in `greedy_memory_planner.cc` for better
handling of tensor indices and scratch buffers.

Fix `total_ticks_per_tag_` usage in `micro_profiler.cc` and add
`ClearEvents` method.

Update `Makefile.inc` and `generic_model_benchmark.cc` to support
alternate memory regions and CRC32 checks for data integrity.

Include compression data in `metrics.cc` allocation records and
handle architecture-specific directives in
`show_meta_data.cc.template`.

BUG=see description
diff --git a/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc b/tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc
@@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {
   } else if (i < 62) {
     return 'A' + (i - 36);
   }
-  return '*';
+  return GetOrdinalCharacter(i % 62);
 }
 
 }  // namespace
@@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
   CalculateOffsetsIfNeeded();
 
   for (int i = 0; i < buffer_count_; ++i) {
-    MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",
-                GetOrdinalCharacter(i), i, requirements_[i].size,
-                buffer_offsets_[i], requirements_[i].first_time_used,
+    char c = '*';
+    if (requirements_[i].first_time_used != requirements_[i].last_time_used) {
+      // not a scratch buffer nor subgraph output tensor
+      c = GetOrdinalCharacter(i);
+    }
+    MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,
+                i, requirements_[i].size, buffer_offsets_[i],
+                requirements_[i].first_time_used,
                 requirements_[i].last_time_used);
   }
 
@@ -379,15 +384,20 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
       const int line_end = ((offset + size) * kLineWidth) / max_size;
       for (int n = line_start; n < line_end; ++n) {
         if (line[n] == '.') {
-          line[n] = GetOrdinalCharacter(i);
+          if (requirements->first_time_used == requirements->last_time_used) {
+            // scratch buffer or subgraph output tensor
+            line[n] = '*';
+          } else {
+            line[n] = GetOrdinalCharacter(i);
+          }
         } else {
           line[n] = '!';
         }
       }
     }
     line[kLineWidth] = 0;
 
-    MicroPrintf("%s%d: %s (%dk)", t < 10 ? " " : "", t, (const char*)line,
+    MicroPrintf("%4d: %s (%dk)", t, (const char*)line,
                 (memory_use + 1023) / 1024);
   }
 }
diff --git a/tensorflow/lite/micro/micro_profiler.cc b/tensorflow/lite/micro/micro_profiler.cc
@@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() {
     TFLITE_DCHECK(tags_[i] != nullptr);
     int position = FindExistingOrNextPosition(tags_[i]);
     TFLITE_DCHECK(position >= 0);
-    total_ticks_per_tag[position].tag = tags_[i];
-    total_ticks_per_tag[position].ticks =
-        total_ticks_per_tag[position].ticks + ticks;
+    total_ticks_per_tag_[position].tag = tags_[i];
+    total_ticks_per_tag_[position].ticks =
+        total_ticks_per_tag_[position].ticks + ticks;
     total_ticks += ticks;
   }
 
   for (int i = 0; i < num_events_; ++i) {
-    TicksPerTag each_tag_entry = total_ticks_per_tag[i];
+    TicksPerTag each_tag_entry = total_ticks_per_tag_[i];
     if (each_tag_entry.tag == nullptr) {
       break;
     }
@@ -112,12 +112,21 @@ void MicroProfiler::LogTicksPerTagCsv() {
 int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) {
   int pos = 0;
   for (; pos < num_events_; pos++) {
-    TicksPerTag each_tag_entry = total_ticks_per_tag[pos];
+    TicksPerTag each_tag_entry = total_ticks_per_tag_[pos];
     if (each_tag_entry.tag == nullptr ||
         strcmp(each_tag_entry.tag, tag_name) == 0) {
       return pos;
     }
   }
   return pos < num_events_ ? pos : -1;
 }
+
+void MicroProfiler::ClearEvents() {
+  for (int i = 0; i < num_events_; i++) {
+    // Zero ticks too, or the next tag using this slot inherits a stale sum.
+    total_ticks_per_tag_[i] = {};
+  }
+  num_events_ = 0;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/micro/micro_profiler.h b/tensorflow/lite/micro/micro_profiler.h
@@ -1,4 +1,4 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface {
   virtual void EndEvent(uint32_t event_handle) override;
 
   // Clears all the events that have been currently profiled.
-  void ClearEvents() { num_events_ = 0; }
+  void ClearEvents();
 
   // Returns the sum of the ticks taken across all the events. This number
   // is only meaningful if all of the events are disjoint (the end time of
@@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface {
   // In practice, the number of tags will be much lower than the number of
   // events. But it is theoretically possible that each event to be unique and
   // hence we allow total_ticks_per_tag to have kMaxEvents entries.
-  TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
+  TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};
 
   int FindExistingOrNextPosition(const char* tag_name);
 
diff --git a/tensorflow/lite/micro/tools/benchmarking/Makefile.inc b/tensorflow/lite/micro/tools/benchmarking/Makefile.inc
@@ -20,6 +20,15 @@ endif
     $(GENERATED_SRCS_DIR)$(GENERIC_BENCHMARK_MODEL_DIR)$(GENERIC_BENCHMARK_MODEL_NAME)_model_data.h
 endif
 
+ifeq ($(ENABLE_COMPRESSION), yes)
+ifneq ($(GENERIC_BENCHMARK_ALT_MEM_ATTR),)
+    CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_ATTR=$(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+endif
+ifneq ($(GENERIC_BENCHMARK_ALT_MEM_SIZE),)
+    CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_SIZE=$(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+endif
+endif
+
 GENERIC_BENCHMARK_SRCS := \
 $(MICROLITE_BENCHMARK_ROOT_DIR)/generic_model_benchmark.cc \
 $(MICROLITE_BENCHMARK_ROOT_DIR)/metrics.cc \
diff --git a/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh b/tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh
@@ -52,7 +52,7 @@ function substitute_strings() {
   IFS=${SAVED_IFS}
   replacement=()
   for line in "${lines_array[@]}"; do
-    line=$(sed -e 's/"/\\"/g' <<< "${line}")
+    line=$(sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' <<< "${line}")
     line=$(printf '"%s",\n    ' "${line}")
     replacement+=( "${line}" )
   done
diff --git a/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc b/tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <sys/types.h>
 
 #include <cstring>
+#include <initializer_list>
 #include <memory>
 #include <random>
 #include <type_traits>
@@ -56,19 +57,37 @@ limitations under the License.
 
 #endif  // defind(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+#if defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && \
+    !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+#error "GENERIC_BENCHMARK_ALT_MEM_SIZE missing from CXXFLAGS"
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
+        // !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
+
+#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
+    !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+#error "GENERIC_BENCHMARK_ALT_MEM_ATTR missing from CXXFLAGS"
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
+        // !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
+
+#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
+    defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && defined(USE_TFLM_COMPRESSION)
+#define USE_ALT_DECOMPRESSION_MEM
+#endif  // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
+        // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
+        // defined(USE_TFLM_COMPRESSION)
+
 /*
- * Generic model benchmark.  Evaluates runtime performance of a provided model
- * with random inputs.
+ * Generic model benchmark.  Evaluates runtime performance of a provided
+ * model with random inputs.
  */
 
 namespace tflite {
-
 namespace {
 
 using Profiler = ::tflite::MicroProfiler;
 
-// Seed used for the random input. Input data shouldn't affect invocation timing
-// so randomness isn't really needed.
+// Seed used for the random input. Input data shouldn't affect invocation
+// timing so randomness isn't really needed.
 constexpr uint32_t kRandomSeed = 0xFB;
 
 #if !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
@@ -80,6 +99,11 @@ constexpr size_t kTensorArenaSize = GENERIC_BENCHMARK_TENSOR_ARENA_SIZE;
 constexpr size_t kTensorArenaSize = 5e6 - MODEL_SIZE;
 #endif  // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+#if defined(USE_ALT_DECOMPRESSION_MEM)
+constexpr size_t kAltMemorySize = GENERIC_BENCHMARK_ALT_MEM_SIZE;
+alignas(16) GENERIC_BENCHMARK_ALT_MEM_ATTR uint8_t g_alt_memory[kAltMemorySize];
+#endif  // defined(USE_ALT_DECOMPRESSION_MEM)
+
 constexpr int kNumResourceVariable = 100;
 
 void SetRandomInput(const uint32_t random_seed,
@@ -130,39 +154,146 @@ bool ReadFile(const char* file_name, void* buffer, size_t buffer_size) {
 }
 #endif  // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
 
+constexpr uint32_t kCrctabLen = 256;  // one table entry per byte value
+uint32_t crctab[kCrctabLen];          // filled in by GenCRC32Table()
+
+// Populates crctab: entry b is byte value b run through 8 shift/xor steps.
+void GenCRC32Table() {
+  constexpr uint32_t kPolyN = 0xEDB88320;
+  for (size_t byte_value = 0; byte_value < kCrctabLen; byte_value++) {
+    uint32_t entry = byte_value;
+    for (int step = 0; step < 8; step++) {
+      // Shift right by one; xor in kPolyN when the bit shifted out was set.
+      const uint32_t shifted = entry >> 1;
+      entry = (entry & 1) ? (shifted ^ kPolyN) : shifted;
+    }
+    crctab[byte_value] = entry;
+  }
+}
+
+// Returns the CRC-32 of the data_length bytes at data, using the crctab
+// lookup table; crctab must already have been filled by GenCRC32Table().
+uint32_t ComputeCRC32(const uint8_t* data, const size_t data_length) {
+  uint32_t running_crc = ~0U;
+  const uint8_t* const end = data + data_length;
+
+  for (const uint8_t* cursor = data; cursor != end; ++cursor) {
+    // The low byte of (crc ^ input byte) selects one of the table entries.
+    const uint32_t slot = (running_crc ^ *cursor) & (kCrctabLen - 1);
+    running_crc = (running_crc >> 8) ^ crctab[slot];
+  }
+  return running_crc ^ ~0U;  // invert all bits of the result
+}
+
+// Prints the CRC32 of each output tensor's raw bytes, one line per tensor.
+void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
+  GenCRC32Table();  // (re)build the lookup table ComputeCRC32 reads
+  for (size_t index = 0; index < interpreter->outputs_size(); ++index) {
+    TfLiteTensor* tensor = interpreter->output_tensor(index);
+    const uint8_t* bytes = tflite::GetTensorData<uint8_t>(tensor);
+    MicroPrintf("Output CRC32: 0x%X", ComputeCRC32(bytes, tensor->bytes));
+  }
+}
+
+// Prints the CRC32 of each input tensor's raw bytes, one line per tensor.
+void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
+  GenCRC32Table();  // (re)build the lookup table ComputeCRC32 reads
+  for (size_t index = 0; index < interpreter->inputs_size(); ++index) {
+    TfLiteTensor* tensor = interpreter->input_tensor(index);
+    const uint8_t* bytes = tflite::GetTensorData<uint8_t>(tensor);
+    MicroPrintf("Input CRC32: 0x%X", ComputeCRC32(bytes, tensor->bytes));
+  }
+}
+
 int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
-  Profiler profiler;
+  static Profiler profiler;
+  static Profiler profiler2;
+  TfLiteStatus status;
+
+// use this to keep the application size stable regardless of whether
+// compression is being used
+#ifdef USE_TFLM_COMPRESSION
+  constexpr bool using_compression = true;
+#else   // USE_TFLM_COMPRESSION
+  constexpr bool using_compression = false;
+#endif  // USE_TFLM_COMPRESSION
+
   alignas(16) static uint8_t tensor_arena[kTensorArenaSize];
 
-  uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
+#ifdef USE_ALT_DECOMPRESSION_MEM
+  std::initializer_list<tflite::MicroContext::AlternateMemoryRegion>
+      alt_memory_region = {{g_alt_memory, kAltMemorySize}};
+#endif  // USE_ALT_DECOMPRESSION_MEM
+
+  uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
   const tflite::Model* model = tflite::GetModel(model_data);
   profiler.EndEvent(event_handle);
 
+  event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
   TflmOpResolver op_resolver;
-  TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
+  status = CreateOpResolver(op_resolver);
+  if (status != kTfLiteOk) {
+    MicroPrintf("tflite::CreateOpResolver failed");
+    return -1;
+  }
+  profiler.EndEvent(event_handle);
 
+  event_handle = profiler.BeginEvent("tflite::RecordingMicroAllocator::Create");
   tflite::RecordingMicroAllocator* allocator(
       tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize));
+  profiler.EndEvent(event_handle);
+  event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation");
   tflite::RecordingMicroInterpreter interpreter(
       model, op_resolver, allocator,
       tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable),
       &profiler);
-  TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors());
+  profiler.EndEvent(event_handle);
+
+#ifdef USE_ALT_DECOMPRESSION_MEM
+  event_handle =
+      profiler.BeginEvent("tflite::MicroInterpreter::SetDecompressionMemory");
+  status = interpreter.SetDecompressionMemory(alt_memory_region);
+  if (status != kTfLiteOk) {
+    MicroPrintf("tflite::MicroInterpreter::SetDecompressionMemory failed");
+    return -1;
+  }
+  profiler.EndEvent(event_handle);
+#endif  // USE_ALT_DECOMPRESSION_MEM
+
+  event_handle =
+      profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors");
+  status = interpreter.AllocateTensors();
+  if (status != kTfLiteOk) {
+    MicroPrintf("tflite::MicroInterpreter::AllocateTensors failed");
+    return -1;
+  }
+  profiler.EndEvent(event_handle);
 
-  profiler.Log();
+  profiler.LogTicksPerTagCsv();
   profiler.ClearEvents();
 
+  if (using_compression) {
+    status = interpreter.SetAlternateProfiler(&profiler2);
+    if (status != kTfLiteOk) {
+      MicroPrintf("tflite::MicroInterpreter::SetAlternateProfiler failed");
+      return -1;
+    }
+  }
+
   MicroPrintf("");  // null MicroPrintf serves as a newline.
 
-  // For streaming models, the interpreter will return kTfLiteAbort if the model
-  // does not yet have enough data to make an inference. As such, we need to
-  // invoke the interpreter multiple times until we either receive an error or
-  // kTfLiteOk. This loop also works for non-streaming models, as they'll just
-  // return kTfLiteOk after the first invocation.
+  // For streaming models, the interpreter will return kTfLiteAbort if the
+  // model does not yet have enough data to make an inference. As such, we
+  // need to invoke the interpreter multiple times until we either receive an
+  // error or kTfLiteOk. This loop also works for non-streaming models, as
+  // they'll just return kTfLiteOk after the first invocation.
   uint32_t seed = kRandomSeed;
   while (true) {
     SetRandomInput(seed++, interpreter);
-    TfLiteStatus status = interpreter.Invoke();
+    ShowInputCRC32(&interpreter);
+    MicroPrintf("");  // null MicroPrintf serves as a newline.
+
+    status = interpreter.Invoke();
     if ((status != kTfLiteOk) && (static_cast<int>(status) != kTfLiteAbort)) {
       MicroPrintf("Model interpreter invocation failed: %d\n", status);
       return -1;
@@ -174,6 +305,17 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
     MicroPrintf("");  // null MicroPrintf serves as a newline.
     profiler.ClearEvents();
 
+    if (using_compression) {
+      profiler2.Log();
+      MicroPrintf("");  // null MicroPrintf serves as a newline.
+      profiler2.LogTicksPerTagCsv();
+      MicroPrintf("");  // null MicroPrintf serves as a newline.
+      profiler2.ClearEvents();
+    }
+
+    ShowOutputCRC32(&interpreter);
+    MicroPrintf("");  // null MicroPrintf serves as a newline.
+
     if (status == kTfLiteOk) {
       break;
     }
diff --git a/tensorflow/lite/micro/tools/benchmarking/metrics.cc b/tensorflow/lite/micro/tools/benchmarking/metrics.cc
diff --git a/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template b/tensorflow/lite/micro/tools/benchmarking/show_meta_data.cc.template

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {`
`31`	`31`	`} else if (i < 62) {`
`32`	`32`	`return 'A' + (i - 36);`
`33`	`33`	`}`
`34`		`- return '*';`
	`34`	`+ return GetOrdinalCharacter(i % 62);`
`35`	`35`	`}`
`36`	`36`
`37`	`37`	`} // namespace`
`@@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {`
`335`	`335`	`CalculateOffsetsIfNeeded();`
`336`	`336`
`337`	`337`	`for (int i = 0; i < buffer_count_; ++i) {`
`338`		`- MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",`
`339`		`- GetOrdinalCharacter(i), i, requirements_[i].size,`
`340`		`- buffer_offsets_[i], requirements_[i].first_time_used,`
	`338`	`+ char c = '*';`
	`339`	`+ if (requirements_[i].first_time_used != requirements_[i].last_time_used) {`
	`340`	`+ // not a scratch buffer nor subgraph output tensor`
	`341`	`+ c = GetOrdinalCharacter(i);`
	`342`	`+ }`
	`343`	`+ MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,`
	`344`	`+ i, requirements_[i].size, buffer_offsets_[i],`
	`345`	`+ requirements_[i].first_time_used,`
`341`	`346`	`requirements_[i].last_time_used);`
`342`	`347`	`}`
`343`	`348`
`@@ -379,15 +384,20 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {`
`379`	`384`	`const int line_end = ((offset + size) * kLineWidth) / max_size;`
`380`	`385`	`for (int n = line_start; n < line_end; ++n) {`
`381`	`386`	`if (line[n] == '.') {`
`382`		`- line[n] = GetOrdinalCharacter(i);`
	`387`	`+ if (requirements->first_time_used == requirements->last_time_used) {`
	`388`	`+ // scratch buffer or subgraph output tensor`
	`389`	`+ line[n] = '*';`
	`390`	`+ } else {`
	`391`	`+ line[n] = GetOrdinalCharacter(i);`
	`392`	`+ }`
`383`	`393`	`} else {`
`384`	`394`	`line[n] = '!';`
`385`	`395`	`}`
`386`	`396`	`}`
`387`	`397`	`}`
`388`	`398`	`line[kLineWidth] = 0;`
`389`	`399`
`390`		`- MicroPrintf("%s%d: %s (%dk)", t < 10 ? " " : "", t, (const char*)line,`
	`400`	`+ MicroPrintf("%4d: %s (%dk)", t, (const char*)line,`
`391`	`401`	`(memory_use + 1023) / 1024);`
`392`	`402`	`}`
`393`	`403`	`}`