Commit def551d

[CLI TOOLS][RTVM] Improve rtvm tool with new options to measure native performance (#15818)
* [RTVM] Improve rtvm tool with new options to measure native performance. A few fixes and enhancements that affect model loading times, plus new options to measure performance.
* review comments
* review comments
1 parent 8b40f5d commit def551d

4 files changed: +316 -58 lines changed

apps/cpp_rtvm/README.md

Lines changed: 22 additions & 0 deletions
@@ -122,6 +122,11 @@ Command line usage
 --input - Numpy file for the model input (optional and we use random of not given)
 --output - Numpy file name to dump the model output as numpy
 --dump-meta - Dump model meta information
+--pre-compiled - The file name of a file where pre-compiled programs should be stored
+--profile - Profile over all execution
+--dry-run - Profile after given dry runs, default 10
+--run-count - Profile for given runs, default 50
+--zero-copy - Profile with zero copy api
 
 Example
 ./rtvm --model=keras-resnet50 --device="opencl" --dump-meta
@@ -366,3 +371,20 @@ stored. If the pre-compiled file name was passed to the `rtvm` then After method
 `Load`, method `UsePreCompiledProgram` is called. This method loads pre-compiled
 programs if the file exists. In opposite case the file will be created and
 pre-compiled programs will be saved to this file.
+
+# Performance Profiling Options
+The tool adds a few options to measure the wall clock performance of the given model natively on the target.
+--profile : Turns on profiling
+--dry-run : The number of times to dry run the model before measuring the performance. Default value is 10.
+--run-count : The number of times to run the model and take an average. Default value is 50.
+--zero-copy : This option enables graph runtime zero copy to be used for input and output rather than byte copy to DLTensor.
+
+The performance profiling options dump an information summary as given below.
+Module Load :27 ms
+Graph Runtime Create :11 ms
+Params Read :15 ms
+Params Set :41 ms
+Pre Compiled Progs Load :24 ms
+Total Load Time :118 ms
+Average ExecTime :27 ms
+Unload Time :35.9236 ms
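
As a quick usage reference (not part of the diff itself), a profiling run that exercises the new options can follow the README's earlier example:

./rtvm --model=keras-resnet50 --device="opencl" --profile --dry-run=10 --run-count=50 --zero-copy

The model and device here mirror the keras-resnet50/opencl example above; the dry-run and run-count values are simply the documented defaults made explicit, and --zero-copy is a bare flag with no value.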

apps/cpp_rtvm/main.cc

Lines changed: 174 additions & 25 deletions
@@ -29,6 +29,7 @@
 #endif
 #include <dmlc/logging.h>
 
+#include <chrono>
 #include <cstring>
 #include <iostream>
 #include <sstream>
@@ -54,7 +55,11 @@ static const string kUsage =
     "--input - Numpy file for the model input (optional and we use random of not given)\n"
     "--output - Numpy file name to dump the model output as numpy\n"
     "--dump-meta - Dump model meta information\n"
-    "--pre-compiled - The file name of a file where pre-compiled programs should be stored"
+    "--pre-compiled - The file name of a file where pre-compiled programs should be stored\n"
+    "--profile - Profile over all execution\n"
+    "--dry-run - Profile after given dry runs, default 10\n"
+    "--run-count - Profile for given runs, default 50\n"
+    "--zero-copy - Profile with zero copy api\n"
     "\n"
     " Example\n"
     " ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
@@ -68,14 +73,19 @@ static const string kUsage =
  * \arg input Numpy file for the model input
  * \arg output Numpy file name to dump the model output as numpy
  * \arg pre_compiled File name where pre-compiled programs should be stored
+ * \arg profile Do we profile overall execution
  */
 struct ToolArgs {
   string model;
   string device;
   string input;
   string output;
   string pre_compiled;
-  bool dump_meta = false;
+  bool dump_meta{false};
+  bool profile{false};
+  int dry_run{10};
+  int run_count{50};
+  bool zero_copy{false};
 };
 
 /*!
@@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
   LOG(INFO) << "Output = " << args.output;
   LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
   LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
+  LOG(INFO) << "Profile = " << ((args.profile) ? ("True") : ("False"));
+  LOG(INFO) << "Dry Run = " << args.dry_run;
+  LOG(INFO) << "Run Count = " << args.run_count;
+  LOG(INFO) << "Zero Copy = " << ((args.zero_copy) ? ("True") : ("False"));
 }
 
 #if defined(__linux__) || defined(__ANDROID__)
@@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
   }
 
   args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
+
+  const string pprofile = GetCmdOption(argc, argv, "--profile", true);
+  if (!pprofile.empty()) {
+    args.profile = true;
+  }
+
+  const string pdry_run = GetCmdOption(argc, argv, "--dry-run=");
+  if (!pdry_run.empty()) {
+    args.dry_run = stoi(pdry_run);
+  }
+
+  const string prun = GetCmdOption(argc, argv, "--run-count=");
+  if (!prun.empty()) {
+    args.run_count = stoi(prun);
+  }
+
+  const string pzcopy = GetCmdOption(argc, argv, "--zero-copy", true);
+  if (!pzcopy.empty()) {
+    args.zero_copy = true;
+  }
 }
 
 /*!
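
The hunk above relies on a GetCmdOption helper that is defined elsewhere in main.cc and is not part of this diff. As a hedged illustration of the calling convention the new code assumes (key=value options such as "--dry-run=" return the value text, bare flags such as "--profile" are looked up with a trailing true argument and only tested for non-emptiness), here is a hypothetical stand-in; the real implementation may differ:

// Hypothetical GetCmdOption-style helper, for illustration only.
#include <iostream>
#include <string>

static std::string GetCmdOptionSketch(int argc, char* argv[], const std::string& option,
                                      bool key_only = false) {
  for (int i = 1; i < argc; ++i) {
    std::string arg = argv[i];
    if (arg.rfind(option, 0) == 0) {  // argument starts with the option text
      // For key=value options return the value; for bare flags return the token itself.
      return key_only ? arg : arg.substr(option.size());
    }
  }
  return "";  // empty result means the option was not passed
}

int main(int argc, char* argv[]) {
  std::string dry_run = GetCmdOptionSketch(argc, argv, "--dry-run=");
  std::string profile = GetCmdOptionSketch(argc, argv, "--profile", true);
  std::cout << "dry-run value: '" << dry_run << "', profile flag present: " << !profile.empty()
            << std::endl;
  return 0;
}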
@@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
 #endif
 
   // Initialize TVM Runner
-  TVMRunner runner = TVMRunner(args.model, args.device);
+  auto runner = new TVMRunner(args.model, args.device);
 
   // Load the model
-  runner.Load();
+  runner->Load();
   if (!args.pre_compiled.empty()) {
-    runner.UsePreCompiledPrograms(args.pre_compiled);
+    runner->UsePreCompiledPrograms(args.pre_compiled);
   }
 
   // Query Model meta Information
-  TVMMetaInfo mInfo = runner.GetMetaInfo();
+  TVMMetaInfo mInfo = runner->GetMetaInfo();
 
   // Print Meta Information
-  if (args.dump_meta) runner.PrintMetaInfo();
+  if (args.dump_meta) runner->PrintMetaInfo();
+
+  int total_exec_time = 0;
+
+  if (args.profile) {
+    if (args.dry_run) {
+      for (int ii = 0; ii < args.dry_run; ++ii) {
+        runner->Run();
+      }
+      TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
+    }
+    int total_time = 0;
+    std::map<std::string, NDArray> input_data_even, input_data_odd;
+    std::map<std::string, NDArray> output_data_even, output_data_odd;
+
+    std::map<std::string, char*> input_data;
+    std::map<std::string, char*> output_data;
+
+    // Alloc / populate and keep input data ready
+    for (auto& elem : mInfo.input_info) {
+      if (args.zero_copy) {
+        auto ndarr =
+            NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
+                           DLDevice{GetTVMDevice(args.device), 0});
+        input_data_even.insert({elem.first, ndarr});
+
+        ndarr =
+            NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
+                           DLDevice{GetTVMDevice(args.device), 0});
+        input_data_odd.insert({elem.first, ndarr});
+      } else {
+        char* data = (char*)malloc(runner->GetInputMemSize(elem.first));
+        input_data.insert({elem.first, data});
+      }
+    }
+
+    // Alloc and keep output buffers ready
+    for (auto& elem : mInfo.output_info) {
+      if (args.zero_copy) {
+        auto ndarr =
+            NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
+                           DLDevice{GetTVMDevice(args.device), 0});
+        output_data_even.insert({elem.first, ndarr});
+
+        ndarr =
+            NDArray::Empty(elem.second.first, tvm::runtime::String2DLDataType(elem.second.second),
+                           DLDevice{GetTVMDevice(args.device), 0});
+        output_data_odd.insert({elem.first, ndarr});
+      } else {
+        char* data = (char*)malloc(runner->GetOutputMemSize(elem.first));
+        output_data.insert({elem.first, data});
+      }
+    }
+
+    for (int ii = 0; ii < args.run_count; ++ii) {
+      // Timer start
+      auto tstart = std::chrono::high_resolution_clock::now();
+      // Set random input for all input
+      for (auto& elem : mInfo.input_info) {
+        if (args.zero_copy) {
+          if (ii % 2) {
+            runner->SetInput(elem.first, input_data_even[elem.first]);
+          } else {
+            runner->SetInput(elem.first, input_data_odd[elem.first]);
+          }
+        } else {
+          runner->SetInput(elem.first, input_data[elem.first]);
+        }
+      }
+
+      if (args.zero_copy) {
+        // With zero copy set the result NDArray up front
+        for (auto& elem : mInfo.output_info) {
+          if (ii % 2) {
+            runner->SetOutput(elem.first, output_data_even[elem.first]);
+          } else {
+            runner->SetOutput(elem.first, output_data_odd[elem.first]);
+          }
+        }
+      }
 
-  if (args.input.empty() || args.output.empty()) {
+      // Run the model
+      runner->Run();
+
+      if (!args.zero_copy) {
+        // W/o zero copy we need to invoke explicit data copy
+        for (auto& elem : mInfo.output_info) {
+          runner->GetOutput(elem.first, output_data[elem.first]);
+        }
+      } else {
+        // Just wait for the run to complete.
+        TVMSynchronize(GetTVMDevice(args.device), 0, nullptr);
+      }
+
+      // Timer end
+      auto tend = std::chrono::high_resolution_clock::now();
+      LOG(INFO) << "Exec Time:" << static_cast<double>((tend - tstart).count()) / 1e6;
+      total_exec_time += static_cast<double>((tend - tstart).count()) / 1e6;
+    }
+
+    // Free input buffers
+    for (auto& elem : mInfo.input_info) {
+      free(input_data[elem.first]);
+    }
+
+    // Free output buffers
+    for (auto& elem : mInfo.output_info) {
+      free(output_data[elem.first]);
+    }
+  } else if (!args.input.empty() && !args.output.empty()) {
+    LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;
+    // Set Input from Numpy Input
+    runner->SetInput(args.input);
+    // Run the model
+    runner->Run();
+    // Get Output as Numpy dump
+    runner->GetOutput(args.output);
+  } else {
     LOG(INFO) << "Executing dry run ... ";
     // Set random input for all inputs
     for (auto& elem : mInfo.input_info) {
       LOG(INFO) << "Set Random Input for :" << elem.first;
       auto shape = elem.second.first;
-      size_t ssize = runner.GetInputMemSize(elem.first);
+      size_t ssize = runner->GetInputMemSize(elem.first);
       char* data = (char*)malloc(ssize);
       LOG(INFO) << "Random Input Size:" << ssize << " bytes";
-      runner.SetInput(elem.first, data);
+      runner->SetInput(elem.first, data);
       free(data);
     }
-
     // Run the model
-    runner.Run();
-
+    runner->Run();
     // Get Output and dump few values
     for (auto& elem : mInfo.output_info) {
       LOG(INFO) << "Get Output for :" << elem.first;
       auto shape = elem.second.first;
-      size_t ssize = runner.GetOutputMemSize(elem.first);
+      size_t ssize = runner->GetOutputMemSize(elem.first);
       char* data = (char*)malloc(ssize);
-      runner.GetOutput(elem.first, data);
+      runner->GetOutput(elem.first, data);
       LOG(INFO) << "Output Size:" << ssize << " bytes";
       free(data);
     }
-  } else {
-    LOG(INFO) << "Executing with Input:" << args.input << " Output:" << args.output;
-
-    // Set Input from Numpy Input
-    runner.SetInput(args.input);
-
-    // Run the model
-    runner.Run();
+  }
 
-    // Get Output as Numpy dump
-    runner.GetOutput(args.output);
+  if (args.profile) {
+    // Print Stats
+    runner->PrintStats();
   }
+  auto tstart = std::chrono::high_resolution_clock::now();
+  delete runner;
+  auto tend = std::chrono::high_resolution_clock::now();
 
+  if (args.profile) {
+    LOG(INFO) << "Average ExecTime :" << total_exec_time / args.run_count << " ms";
+    LOG(INFO) << "Unload Time :" << static_cast<double>((tend - tstart).count()) / 1e6
+              << " ms";
+  }
   return 0;
 }
 
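
The profiling path above boils down to: optional untimed dry runs to warm up, then run_count timed iterations whose wall-clock time is accumulated and averaged, with either explicit output copies or a TVMSynchronize call inside the timed region. Below is a minimal, self-contained sketch of that measurement pattern under a toy workload; the names here are illustrative and not taken from main.cc, and RunOnce stands in for runner->Run() plus the output copy or device synchronization.

// Minimal sketch of the dry-run + averaged wall-clock timing pattern (illustrative only).
#include <chrono>
#include <functional>
#include <iostream>

static double AverageExecMs(const std::function<void()>& RunOnce, int dry_run, int run_count) {
  for (int i = 0; i < dry_run; ++i) RunOnce();  // warm-up iterations, not measured
  double total_ms = 0.0;
  for (int i = 0; i < run_count; ++i) {
    auto tstart = std::chrono::high_resolution_clock::now();
    RunOnce();
    auto tend = std::chrono::high_resolution_clock::now();
    total_ms += std::chrono::duration<double, std::milli>(tend - tstart).count();
  }
  return total_ms / run_count;
}

int main() {
  // Toy workload so the sketch runs on its own.
  auto workload = [] {
    volatile double x = 0;
    for (int i = 0; i < 1000000; ++i) x = x + i;
  };
  std::cout << "Average ExecTime :" << AverageExecMs(workload, 10, 50) << " ms" << std::endl;
  return 0;
}

Note also that with --zero-copy the real loop alternates between two pre-allocated NDArray sets on even and odd iterations, so every timed iteration still performs a fresh SetInput/SetOutput before Run rather than reusing a single bound buffer.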
