Skip to content

Commit c0946e1

Browse files
[Runtime] Flush L2 cache in time eval (#15305)
This PR introduces an optional cache flush functionality to `time_evaluator`. It is implemented by allocating two large empty NDArrays on the device and copying one into the other before each repeat, so that the L2 cache is flushed. This gives us a more accurate evaluation of the performance of a runtime function.
1 parent e2d6511 commit c0946e1

File tree

7 files changed

+40
-22
lines changed

7 files changed

+40
-22
lines changed

include/tvm/runtime/profiling.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -579,13 +579,15 @@ PackedFunc ProfileFunction(Module mod, std::string func_name, int device_type, i
579579
* defined by `repeats_to_cooldown`.
580580
* \param repeats_to_cooldown The number of repeats before the
581581
* cooldown is activated.
582+
* \param cache_flush_bytes The number of bytes to flush from cache before each repeat.
582583
* \param f_preproc The function to be executed before we execute time
583584
* evaluator.
584585
* \return f_timer A timer function.
585586
*/
586587
PackedFunc WrapTimeEvaluator(PackedFunc f, Device dev, int number, int repeat, int min_repeat_ms,
587588
int limit_zero_time_iterations, int cooldown_interval_ms,
588-
int repeats_to_cooldown, PackedFunc f_preproc = nullptr);
589+
int repeats_to_cooldown, int cache_flush_bytes = 0,
590+
PackedFunc f_preproc = nullptr);
589591

590592
} // namespace profiling
591593
} // namespace runtime

python/tvm/runtime/module.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ def time_evaluator(
316316
limit_zero_time_iterations=100,
317317
cooldown_interval_ms=0,
318318
repeats_to_cooldown=1,
319+
cache_flush_bytes=0,
319320
f_preproc="",
320321
):
321322
"""Get an evaluator that measures time cost of running function.
@@ -358,6 +359,9 @@ def time_evaluator(
358359
repeats_to_cooldown: int, optional
359360
The number of repeats before the cooldown is activated.
360361
362+
cache_flush_bytes: int, optional
363+
The number of bytes to flush from the cache before each repeat.
364+
361365
f_preproc: str, optional
362366
The preprocess function name we want to execute before executing the time evaluator.
363367
@@ -384,6 +388,7 @@ def time_evaluator(
384388
limit_zero_time_iterations,
385389
cooldown_interval_ms,
386390
repeats_to_cooldown,
391+
cache_flush_bytes,
387392
f_preproc,
388393
)
389394

src/runtime/crt/common/crt_runtime_api.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -489,14 +489,15 @@ int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* re
489489
int* ret_type_code) {
490490
ret_val[0].v_handle = NULL;
491491
ret_type_code[0] = kTVMNullptr;
492-
if (num_args < 11) {
492+
if (num_args < 12) {
493493
TVMAPIErrorf("not enough args");
494494
return kTvmErrorFunctionCallNumArguments;
495495
}
496496
if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
497497
type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
498498
type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMArgInt ||
499-
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMStr) {
499+
type_codes[8] != kTVMArgInt || type_codes[9] != kTVMArgInt || type_codes[10] != kTVMArgInt ||
500+
type_codes[11] != kTVMStr) {
500501
TVMAPIErrorf("one or more invalid arg types");
501502
return kTvmErrorFunctionCallWrongArgType;
502503
}

src/runtime/graph_executor/debug/graph_executor_debug.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ std::vector<double> GraphExecutorDebug::RunOpRPC(int index, int number, int repe
143143
->
144144
operator()(module_, name, static_cast<int>(dev.device_type), dev.device_id, number,
145145
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
146-
repeats_to_cooldown, "");
146+
repeats_to_cooldown, /*cache_flush_bytes=*/0, "");
147147

148148
int num_flat_args = num_inputs + num_outputs;
149149
auto values = std::make_unique<TVMValue[]>(num_flat_args);

src/runtime/profiling.cc

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -861,7 +861,7 @@ TVM_REGISTER_GLOBAL("runtime.profiling.ProfileFunction")
861861

862862
PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat, int min_repeat_ms,
863863
int limit_zero_time_iterations, int cooldown_interval_ms,
864-
int repeats_to_cooldown, PackedFunc f_preproc) {
864+
int repeats_to_cooldown, int cache_flush_bytes, PackedFunc f_preproc) {
865865
ICHECK(pf != nullptr);
866866

867867
if (static_cast<int>(dev.device_type) == static_cast<int>(kDLMicroDev)) {
@@ -871,13 +871,20 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
871871
}
872872

873873
auto ftimer = [pf, dev, number, repeat, min_repeat_ms, limit_zero_time_iterations,
874-
cooldown_interval_ms, repeats_to_cooldown,
874+
cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
875875
f_preproc](TVMArgs args, TVMRetValue* rv) mutable {
876876
TVMRetValue temp;
877877
std::ostringstream os;
878878
// skip first time call, to activate lazy compilation components.
879879
pf.CallPacked(args, &temp);
880880

881+
// allocate two large arrays to flush L2 cache
882+
NDArray arr1, arr2;
883+
if (cache_flush_bytes > 0) {
884+
arr1 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
885+
arr2 = NDArray::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
886+
}
887+
881888
DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
882889

883890
for (int i = 0; i < repeat; ++i) {
@@ -892,7 +899,10 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
892899
number = static_cast<int>(
893900
std::max((min_repeat_ms / (duration_ms / number) + 1), number * golden_ratio));
894901
}
895-
902+
if (cache_flush_bytes > 0) {
903+
arr1.CopyFrom(arr2);
904+
}
905+
DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
896906
// start timing
897907
Timer t = Timer::Start(dev);
898908
for (int j = 0; j < number; ++j) {

src/runtime/rpc/rpc_module.cc

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -198,23 +198,23 @@ class RPCModuleNode final : public ModuleNode {
198198
PackedFunc GetTimeEvaluator(const std::string& name, Device dev, int number, int repeat,
199199
int min_repeat_ms, int limit_zero_time_iterations,
200200
int cooldown_interval_ms, int repeats_to_cooldown,
201-
const std::string& f_preproc_name) {
201+
int cache_flush_bytes, const std::string& f_preproc_name) {
202202
InitRemoteFunc(&remote_get_time_evaluator_, "runtime.RPCTimeEvaluator");
203203
// Remove session mask because we pass dev by parts.
204204
ICHECK_EQ(GetRPCSessionIndex(dev), sess_->table_index())
205205
<< "ValueError: Need to pass the matched remote device to RPCModule.GetTimeEvaluator";
206206
dev = RemoveRPCSessionMask(dev);
207207

208208
if (module_handle_ != nullptr) {
209-
return remote_get_time_evaluator_(GetRef<Module>(this), name,
210-
static_cast<int>(dev.device_type), dev.device_id, number,
211-
repeat, min_repeat_ms, limit_zero_time_iterations,
212-
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
209+
return remote_get_time_evaluator_(
210+
GetRef<Module>(this), name, static_cast<int>(dev.device_type), dev.device_id, number,
211+
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
212+
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
213213
} else {
214-
return remote_get_time_evaluator_(Optional<Module>(nullptr), name,
215-
static_cast<int>(dev.device_type), dev.device_id, number,
216-
repeat, min_repeat_ms, limit_zero_time_iterations,
217-
cooldown_interval_ms, repeats_to_cooldown, f_preproc_name);
214+
return remote_get_time_evaluator_(
215+
Optional<Module>(nullptr), name, static_cast<int>(dev.device_type), dev.device_id, number,
216+
repeat, min_repeat_ms, limit_zero_time_iterations, cooldown_interval_ms,
217+
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
218218
}
219219
}
220220

@@ -253,7 +253,7 @@ class RPCModuleNode final : public ModuleNode {
253253
std::shared_ptr<RPCSession> sess_;
254254
// remote function to get time evaluator
255255
TypedPackedFunc<PackedFunc(Optional<Module>, std::string, int, int, int, int, int, int, int, int,
256-
std::string)>
256+
int, std::string)>
257257
remote_get_time_evaluator_;
258258
// remote function getter for modules.
259259
TypedPackedFunc<PackedFunc(Module, std::string, bool)> remote_mod_get_function_;
@@ -372,7 +372,7 @@ inline void CPUCacheFlush(int begin_index, const TVMArgs& args) {
372372
TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
373373
.set_body_typed([](Optional<Module> opt_mod, std::string name, int device_type, int device_id,
374374
int number, int repeat, int min_repeat_ms, int limit_zero_time_iterations,
375-
int cooldown_interval_ms, int repeats_to_cooldown,
375+
int cooldown_interval_ms, int repeats_to_cooldown, int cache_flush_bytes,
376376
std::string f_preproc_name) {
377377
Device dev;
378378
dev.device_type = static_cast<DLDeviceType>(device_type);
@@ -384,7 +384,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
384384
return static_cast<RPCModuleNode*>(m.operator->())
385385
->GetTimeEvaluator(name, dev, number, repeat, min_repeat_ms,
386386
limit_zero_time_iterations, cooldown_interval_ms,
387-
repeats_to_cooldown, f_preproc_name);
387+
repeats_to_cooldown, cache_flush_bytes, f_preproc_name);
388388
} else {
389389
PackedFunc f_preproc;
390390
if (!f_preproc_name.empty()) {
@@ -397,7 +397,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
397397
CHECK(pf != nullptr) << "Cannot find " << name << " in the global registry";
398398
return profiling::WrapTimeEvaluator(pf, dev, number, repeat, min_repeat_ms,
399399
limit_zero_time_iterations, cooldown_interval_ms,
400-
repeats_to_cooldown, f_preproc);
400+
repeats_to_cooldown, cache_flush_bytes, f_preproc);
401401
}
402402
} else {
403403
auto* pf = runtime::Registry::Get(name);
@@ -411,7 +411,7 @@ TVM_REGISTER_GLOBAL("runtime.RPCTimeEvaluator")
411411
}
412412
return profiling::WrapTimeEvaluator(*pf, dev, number, repeat, min_repeat_ms,
413413
limit_zero_time_iterations, cooldown_interval_ms,
414-
repeats_to_cooldown, f_preproc);
414+
repeats_to_cooldown, cache_flush_bytes, f_preproc);
415415
}
416416
});
417417

web/emcc/tvmjs_support.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,7 @@ class AsyncLocalSession : public LocalSession {
297297
CHECK(time_exec != nullptr) << "Cannot find wasm.GetTimer in the global function";
298298
(*time_exec)(TypedPackedFunc<void(int)>(finvoke), dev, number, repeat, min_repeat_ms,
299299
limit_zero_time_iterations, cooldown_interval_ms, repeats_to_cooldown,
300-
on_complete);
300+
/*cache_flush_bytes=*/0, on_complete);
301301
};
302302
return PackedFunc(ftimer);
303303
}

0 commit comments

Comments
 (0)