Skip to content

Commit b2f2718

Browse files
authored
feat: enhance profiling and benchmarking (#3012)
Modify `PrintMemoryPlan` in `greedy_memory_planner.cc` for better handling of tensor indices and scratch buffers. Fix `total_ticks_per_tag_` usage in `micro_profiler.cc` and add `ClearEvents` method. Update `Makefile.inc` and `generic_model_benchmark.cc` to support alternate memory regions and CRC32 checks for data integrity. Include compression data in `metrics.cc` allocation records and handle architecture-specific directives in `show_meta_data.cc.template`. BUG=see description
1 parent a535080 commit b2f2718

File tree

8 files changed

+228
-41
lines changed

8 files changed

+228
-41
lines changed

tensorflow/lite/micro/memory_planner/greedy_memory_planner.cc

+16-6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ char GetOrdinalCharacter(int i) {
3131
} else if (i < 62) {
3232
return 'A' + (i - 36);
3333
}
34-
return '*';
34+
return GetOrdinalCharacter(i % 62);
3535
}
3636

3737
} // namespace
@@ -335,9 +335,14 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
335335
CalculateOffsetsIfNeeded();
336336

337337
for (int i = 0; i < buffer_count_; ++i) {
338-
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d",
339-
GetOrdinalCharacter(i), i, requirements_[i].size,
340-
buffer_offsets_[i], requirements_[i].first_time_used,
338+
char c = '*';
339+
if (requirements_[i].first_time_used != requirements_[i].last_time_used) {
340+
// not a scratch buffer nor subgraph output tensor
341+
c = GetOrdinalCharacter(i);
342+
}
343+
MicroPrintf("%c (id=%d): size=%d, offset=%d, first_used=%d last_used=%d", c,
344+
i, requirements_[i].size, buffer_offsets_[i],
345+
requirements_[i].first_time_used,
341346
requirements_[i].last_time_used);
342347
}
343348

@@ -379,15 +384,20 @@ void GreedyMemoryPlanner::PrintMemoryPlan() {
379384
const int line_end = ((offset + size) * kLineWidth) / max_size;
380385
for (int n = line_start; n < line_end; ++n) {
381386
if (line[n] == '.') {
382-
line[n] = GetOrdinalCharacter(i);
387+
if (requirements->first_time_used == requirements->last_time_used) {
388+
// scratch buffer or subgraph output tensor
389+
line[n] = '*';
390+
} else {
391+
line[n] = GetOrdinalCharacter(i);
392+
}
383393
} else {
384394
line[n] = '!';
385395
}
386396
}
387397
}
388398
line[kLineWidth] = 0;
389399

390-
MicroPrintf("%s%d: %s (%dk)", t < 10 ? " " : "", t, (const char*)line,
400+
MicroPrintf("%4d: %s (%dk)", t, (const char*)line,
391401
(memory_use + 1023) / 1024);
392402
}
393403
}

tensorflow/lite/micro/micro_profiler.cc

+14-5
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,14 @@ void MicroProfiler::LogTicksPerTagCsv() {
8686
TFLITE_DCHECK(tags_[i] != nullptr);
8787
int position = FindExistingOrNextPosition(tags_[i]);
8888
TFLITE_DCHECK(position >= 0);
89-
total_ticks_per_tag[position].tag = tags_[i];
90-
total_ticks_per_tag[position].ticks =
91-
total_ticks_per_tag[position].ticks + ticks;
89+
total_ticks_per_tag_[position].tag = tags_[i];
90+
total_ticks_per_tag_[position].ticks =
91+
total_ticks_per_tag_[position].ticks + ticks;
9292
total_ticks += ticks;
9393
}
9494

9595
for (int i = 0; i < num_events_; ++i) {
96-
TicksPerTag each_tag_entry = total_ticks_per_tag[i];
96+
TicksPerTag each_tag_entry = total_ticks_per_tag_[i];
9797
if (each_tag_entry.tag == nullptr) {
9898
break;
9999
}
@@ -112,12 +112,21 @@ void MicroProfiler::LogTicksPerTagCsv() {
112112
int MicroProfiler::FindExistingOrNextPosition(const char* tag_name) {
113113
int pos = 0;
114114
for (; pos < num_events_; pos++) {
115-
TicksPerTag each_tag_entry = total_ticks_per_tag[pos];
115+
TicksPerTag each_tag_entry = total_ticks_per_tag_[pos];
116116
if (each_tag_entry.tag == nullptr ||
117117
strcmp(each_tag_entry.tag, tag_name) == 0) {
118118
return pos;
119119
}
120120
}
121121
return pos < num_events_ ? pos : -1;
122122
}
123+
124+
void MicroProfiler::ClearEvents() {
125+
for (int i = 0; i < num_events_; i++) {
126+
total_ticks_per_tag_[i].tag = nullptr;
127+
}
128+
129+
num_events_ = 0;
130+
}
131+
123132
} // namespace tflite

tensorflow/lite/micro/micro_profiler.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
1+
/* Copyright 2024 The TensorFlow Authors. All Rights Reserved.
22
33
Licensed under the Apache License, Version 2.0 (the "License");
44
you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ class MicroProfiler : public MicroProfilerInterface {
4545
virtual void EndEvent(uint32_t event_handle) override;
4646

4747
// Clears all the events that have been currently profiled.
48-
void ClearEvents() { num_events_ = 0; }
48+
void ClearEvents();
4949

5050
// Returns the sum of the ticks taken across all the events. This number
5151
// is only meaningful if all of the events are disjoint (the end time of
@@ -83,7 +83,7 @@ class MicroProfiler : public MicroProfilerInterface {
8383
// In practice, the number of tags will be much lower than the number of
8484
// events. But it is theoretically possible for each event to be unique and
8585
// hence we allow total_ticks_per_tag_ to have kMaxEvents entries.
86-
TicksPerTag total_ticks_per_tag[kMaxEvents] = {};
86+
TicksPerTag total_ticks_per_tag_[kMaxEvents] = {};
8787

8888
int FindExistingOrNextPosition(const char* tag_name);
8989

tensorflow/lite/micro/tools/benchmarking/Makefile.inc

+9
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,15 @@ endif
2020
$(GENERATED_SRCS_DIR)$(GENERIC_BENCHMARK_MODEL_DIR)$(GENERIC_BENCHMARK_MODEL_NAME)_model_data.h
2121
endif
2222

23+
ifeq ($(ENABLE_COMPRESSION), yes)
24+
ifneq ($(GENERIC_BENCHMARK_ALT_MEM_ATTR),)
25+
CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_ATTR=$(GENERIC_BENCHMARK_ALT_MEM_ATTR)
26+
endif
27+
ifneq ($(GENERIC_BENCHMARK_ALT_MEM_SIZE),)
28+
CXXFLAGS += -DGENERIC_BENCHMARK_ALT_MEM_SIZE=$(GENERIC_BENCHMARK_ALT_MEM_SIZE)
29+
endif
30+
endif
31+
2332
GENERIC_BENCHMARK_SRCS := \
2433
$(MICROLITE_BENCHMARK_ROOT_DIR)/generic_model_benchmark.cc \
2534
$(MICROLITE_BENCHMARK_ROOT_DIR)/metrics.cc \

tensorflow/lite/micro/tools/benchmarking/collect_meta_data.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ function substitute_strings() {
5252
IFS=${SAVED_IFS}
5353
replacement=()
5454
for line in "${lines_array[@]}"; do
55-
line=$(sed -e 's/"/\\"/g' <<< "${line}")
55+
line=$(sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' <<< "${line}")
5656
line=$(printf '"%s",\n ' "${line}")
5757
replacement+=( "${line}" )
5858
done

tensorflow/lite/micro/tools/benchmarking/generic_model_benchmark.cc

+158-16
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License.
1818
#include <sys/types.h>
1919

2020
#include <cstring>
21+
#include <initializer_list>
2122
#include <memory>
2223
#include <random>
2324
#include <type_traits>
@@ -56,19 +57,37 @@ limitations under the License.
5657

5758
#endif // defind(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
5859

60+
#if defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && \
61+
!defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
62+
#error "GENERIC_BENCHMARK_ALT_MEM_SIZE missing from CXXFLAGS"
63+
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
64+
// !defined(GENERIC_BENCHMARK_ALT_MEM_SIZE)
65+
66+
#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
67+
!defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
68+
#error "GENERIC_BENCHMARK_ALT_MEM_ATTR missing from CXXFLAGS"
69+
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
70+
// !defined(GENERIC_BENCHMARK_ALT_MEM_ATTR)
71+
72+
#if defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) && \
73+
defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) && defined(USE_TFLM_COMPRESSION)
74+
#define USE_ALT_DECOMPRESSION_MEM
75+
#endif // defined(GENERIC_BENCHMARK_ALT_MEM_SIZE) &&
76+
// defined(GENERIC_BENCHMARK_ALT_MEM_ATTR) &&
77+
// defined(USE_TFLM_COMPRESSION)
78+
5979
/*
60-
* Generic model benchmark. Evaluates runtime performance of a provided model
61-
* with random inputs.
80+
* Generic model benchmark. Evaluates runtime performance of a provided
81+
* model with random inputs.
6282
*/
6383

6484
namespace tflite {
65-
6685
namespace {
6786

6887
using Profiler = ::tflite::MicroProfiler;
6988

70-
// Seed used for the random input. Input data shouldn't affect invocation timing
71-
// so randomness isn't really needed.
89+
// Seed used for the random input. Input data shouldn't affect invocation
90+
// timing so randomness isn't really needed.
7291
constexpr uint32_t kRandomSeed = 0xFB;
7392

7493
#if !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
@@ -80,6 +99,11 @@ constexpr size_t kTensorArenaSize = GENERIC_BENCHMARK_TENSOR_ARENA_SIZE;
8099
constexpr size_t kTensorArenaSize = 5e6 - MODEL_SIZE;
81100
#endif // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
82101

102+
#if defined(USE_ALT_DECOMPRESSION_MEM)
103+
constexpr size_t kAltMemorySize = GENERIC_BENCHMARK_ALT_MEM_SIZE;
104+
alignas(16) GENERIC_BENCHMARK_ALT_MEM_ATTR uint8_t g_alt_memory[kAltMemorySize];
105+
#endif // defined(USE_ALT_DECOMPRESSION_MEM)
106+
83107
constexpr int kNumResourceVariable = 100;
84108

85109
void SetRandomInput(const uint32_t random_seed,
@@ -130,39 +154,146 @@ bool ReadFile(const char* file_name, void* buffer, size_t buffer_size) {
130154
}
131155
#endif // !defined(GENERIC_BENCHMARK_USING_BUILTIN_MODEL)
132156

157+
// Number of entries in the byte-indexed CRC32 lookup table.
constexpr uint32_t kCrctabLen = 256;
// Lookup table for the table-driven CRC32 below; filled by GenCRC32Table().
uint32_t crctab[kCrctabLen];

// Fills crctab for the bit-reflected CRC32 polynomial 0xEDB88320.
//
// The table never changes once built, so only the first call does any work;
// later calls return immediately. Like the rest of this file, this is not
// synchronized and is meant to be called from a single thread.
void GenCRC32Table() {
  static bool table_ready = false;
  if (table_ready) {
    return;
  }

  constexpr uint32_t kPolyN = 0xEDB88320;
  for (uint32_t index = 0; index < kCrctabLen; index++) {
    // Run the byte value through eight shift/xor steps, one per bit.
    uint32_t entry = index;
    for (int bit = 0; bit < 8; bit++) {
      entry = (entry & 1) ? ((entry >> 1) ^ kPolyN) : (entry >> 1);
    }
    crctab[index] = entry;
  }
  table_ready = true;
}

// Returns the CRC32 of the `data_length` bytes starting at `data`.
//
// The initial value and the final xor are both all-ones, so an empty input
// yields 0. `data` is not dereferenced when `data_length` is 0.
uint32_t ComputeCRC32(const uint8_t* data, const size_t data_length) {
  // Explicit 32-bit constant: `~0U` is only as wide as `unsigned int`.
  constexpr uint32_t kAllOnes = 0xFFFFFFFF;

  // Make sure the lookup table exists even if the caller never built it.
  GenCRC32Table();

  uint32_t crc32 = kAllOnes;
  for (size_t i = 0; i < data_length; i++) {
    // crctab is an array of 256 32-bit constants
    const uint32_t index = (crc32 ^ data[i]) & (kCrctabLen - 1);
    crc32 = (crc32 >> 8) ^ crctab[index];
  }

  // invert all bits of result
  return crc32 ^ kAllOnes;
}
187+
188+
// Prints one "Output CRC32" line per output tensor of `interpreter`, computed
// over the tensor's raw bytes.
void ShowOutputCRC32(tflite::MicroInterpreter* interpreter) {
  GenCRC32Table();
  size_t tensor_index = 0;
  while (tensor_index < interpreter->outputs_size()) {
    TfLiteTensor* tensor = interpreter->output_tensor(tensor_index);
    // View the tensor payload as plain bytes, whatever its element type is.
    uint8_t* raw_bytes = tflite::GetTensorData<uint8_t>(tensor);
    const uint32_t checksum = ComputeCRC32(raw_bytes, tensor->bytes);
    MicroPrintf("Output CRC32: 0x%X", checksum);
    ++tensor_index;
  }
}
197+
198+
// Prints one "Input CRC32" line per input tensor of `interpreter`, computed
// over the tensor's raw bytes.
void ShowInputCRC32(tflite::MicroInterpreter* interpreter) {
  GenCRC32Table();
  size_t tensor_index = 0;
  while (tensor_index < interpreter->inputs_size()) {
    TfLiteTensor* tensor = interpreter->input_tensor(tensor_index);
    // View the tensor payload as plain bytes, whatever its element type is.
    uint8_t* raw_bytes = tflite::GetTensorData<uint8_t>(tensor);
    const uint32_t checksum = ComputeCRC32(raw_bytes, tensor->bytes);
    MicroPrintf("Input CRC32: 0x%X", checksum);
    ++tensor_index;
  }
}
207+
133208
int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
134-
Profiler profiler;
209+
static Profiler profiler;
210+
static Profiler profiler2;
211+
TfLiteStatus status;
212+
213+
// use this to keep the application size stable regardless of whether
214+
// compression is being used
215+
#ifdef USE_TFLM_COMPRESSION
216+
constexpr bool using_compression = true;
217+
#else // USE_TFLM_COMPRESSION
218+
constexpr bool using_compression = false;
219+
#endif // USE_TFLM_COMPRESSION
220+
135221
alignas(16) static uint8_t tensor_arena[kTensorArenaSize];
136222

137-
uint32_t event_handle = profiler.BeginEvent("TfliteGetModel");
223+
#ifdef USE_ALT_DECOMPRESSION_MEM
224+
std::initializer_list<tflite::MicroContext::AlternateMemoryRegion>
225+
alt_memory_region = {{g_alt_memory, kAltMemorySize}};
226+
#endif // USE_ALT_DECOMPRESSION_MEM
227+
228+
uint32_t event_handle = profiler.BeginEvent("tflite::GetModel");
138229
const tflite::Model* model = tflite::GetModel(model_data);
139230
profiler.EndEvent(event_handle);
140231

232+
event_handle = profiler.BeginEvent("tflite::CreateOpResolver");
141233
TflmOpResolver op_resolver;
142-
TF_LITE_ENSURE_STATUS(CreateOpResolver(op_resolver));
234+
status = CreateOpResolver(op_resolver);
235+
if (status != kTfLiteOk) {
236+
MicroPrintf("tflite::CreateOpResolver failed");
237+
return -1;
238+
}
239+
profiler.EndEvent(event_handle);
143240

241+
event_handle = profiler.BeginEvent("tflite::RecordingMicroAllocator::Create");
144242
tflite::RecordingMicroAllocator* allocator(
145243
tflite::RecordingMicroAllocator::Create(tensor_arena, kTensorArenaSize));
244+
profiler.EndEvent(event_handle);
245+
event_handle = profiler.BeginEvent("tflite::MicroInterpreter instantiation");
146246
tflite::RecordingMicroInterpreter interpreter(
147247
model, op_resolver, allocator,
148248
tflite::MicroResourceVariables::Create(allocator, kNumResourceVariable),
149249
&profiler);
150-
TF_LITE_ENSURE_STATUS(interpreter.AllocateTensors());
250+
profiler.EndEvent(event_handle);
251+
252+
#ifdef USE_ALT_DECOMPRESSION_MEM
253+
event_handle =
254+
profiler.BeginEvent("tflite::MicroInterpreter::SetDecompressionMemory");
255+
status = interpreter.SetDecompressionMemory(alt_memory_region);
256+
if (status != kTfLiteOk) {
257+
MicroPrintf("tflite::MicroInterpreter::SetDecompressionMemory failed");
258+
return -1;
259+
}
260+
profiler.EndEvent(event_handle);
261+
#endif // USE_ALT_DECOMPRESSION_MEM
262+
263+
event_handle =
264+
profiler.BeginEvent("tflite::MicroInterpreter::AllocateTensors");
265+
status = interpreter.AllocateTensors();
266+
if (status != kTfLiteOk) {
267+
MicroPrintf("tflite::MicroInterpreter::AllocateTensors failed");
268+
return -1;
269+
}
270+
profiler.EndEvent(event_handle);
151271

152-
profiler.Log();
272+
profiler.LogTicksPerTagCsv();
153273
profiler.ClearEvents();
154274

275+
if (using_compression) {
276+
status = interpreter.SetAlternateProfiler(&profiler2);
277+
if (status != kTfLiteOk) {
278+
MicroPrintf("tflite::MicroInterpreter::SetAlternateProfiler failed");
279+
return -1;
280+
}
281+
}
282+
155283
MicroPrintf(""); // null MicroPrintf serves as a newline.
156284

157-
// For streaming models, the interpreter will return kTfLiteAbort if the model
158-
// does not yet have enough data to make an inference. As such, we need to
159-
// invoke the interpreter multiple times until we either receive an error or
160-
// kTfLiteOk. This loop also works for non-streaming models, as they'll just
161-
// return kTfLiteOk after the first invocation.
285+
// For streaming models, the interpreter will return kTfLiteAbort if the
286+
// model does not yet have enough data to make an inference. As such, we
287+
// need to invoke the interpreter multiple times until we either receive an
288+
// error or kTfLiteOk. This loop also works for non-streaming models, as
289+
// they'll just return kTfLiteOk after the first invocation.
162290
uint32_t seed = kRandomSeed;
163291
while (true) {
164292
SetRandomInput(seed++, interpreter);
165-
TfLiteStatus status = interpreter.Invoke();
293+
ShowInputCRC32(&interpreter);
294+
MicroPrintf(""); // null MicroPrintf serves as a newline.
295+
296+
status = interpreter.Invoke();
166297
if ((status != kTfLiteOk) && (static_cast<int>(status) != kTfLiteAbort)) {
167298
MicroPrintf("Model interpreter invocation failed: %d\n", status);
168299
return -1;
@@ -174,6 +305,17 @@ int Benchmark(const uint8_t* model_data, tflite::PrettyPrintType print_type) {
174305
MicroPrintf(""); // null MicroPrintf serves as a newline.
175306
profiler.ClearEvents();
176307

308+
if (using_compression) {
309+
profiler2.Log();
310+
MicroPrintf(""); // null MicroPrintf serves as a newline.
311+
profiler2.LogTicksPerTagCsv();
312+
MicroPrintf(""); // null MicroPrintf serves as a newline.
313+
profiler2.ClearEvents();
314+
}
315+
316+
ShowOutputCRC32(&interpreter);
317+
MicroPrintf(""); // null MicroPrintf serves as a newline.
318+
177319
if (status == kTfLiteOk) {
178320
break;
179321
}

0 commit comments

Comments
 (0)