2929#endif
3030#include < dmlc/logging.h>
3131
32+ #include < chrono>
3233#include < cstring>
3334#include < iostream>
3435#include < sstream>
@@ -54,7 +55,11 @@ static const string kUsage =
5455 " --input - Numpy file for the model input (optional and we use random of not given)\n "
5556 " --output - Numpy file name to dump the model output as numpy\n "
5657 " --dump-meta - Dump model meta information\n "
57- " --pre-compiled - The file name of a file where pre-compiled programs should be stored"
58+ " --pre-compiled - The file name of a file where pre-compiled programs should be stored\n "
59+ " --profile - Profile over all execution\n "
60+ " --dry-run - Profile after given dry runs, default 10\n "
61+ " --run-count - Profile for given runs, default 50\n "
62+ " --zero-copy - Profile with zero copy api\n "
5863 " \n "
5964 " Example\n "
6065 " ./rtvm --model=keras-resnet50 --device=\" opencl\" --dump-meta\n "
@@ -68,14 +73,19 @@ static const string kUsage =
6873 * \arg input Numpy file for the model input
6974 * \arg output Numpy file name to dump the model output as numpy
7075 * \arg pre_compiled File name where pre-compiled programs should be stored
76+ * \arg profile Do we profile overall execution
7177 */
7278struct ToolArgs {
7379 string model;
7480 string device;
7581 string input;
7682 string output;
7783 string pre_compiled;
78- bool dump_meta = false ;
84+ bool dump_meta{false };
85+ bool profile{false };
86+ int dry_run{10 };
87+ int run_count{50 };
88+ bool zero_copy{false };
7989};
8090
8191/* !
@@ -89,6 +99,10 @@ void PrintArgs(const ToolArgs& args) {
8999 LOG (INFO) << " Output = " << args.output ;
90100 LOG (INFO) << " Pre-compiled = " << args.pre_compiled ;
91101 LOG (INFO) << " Dump Metadata = " << ((args.dump_meta ) ? (" True" ) : (" False" ));
102+ LOG (INFO) << " Profile = " << ((args.profile ) ? (" True" ) : (" False" ));
103+ LOG (INFO) << " Dry Run = " << args.dry_run ;
104+ LOG (INFO) << " Run Count = " << args.run_count ;
105+ LOG (INFO) << " Zero Copy = " << ((args.zero_copy ) ? (" True" ) : (" False" ));
92106}
93107
94108#if defined(__linux__) || defined(__ANDROID__)
@@ -178,6 +192,26 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& args) {
178192 }
179193
180194 args.pre_compiled = GetCmdOption (argc, argv, " --pre-compiled=" );
195+
196+ const string pprofile = GetCmdOption (argc, argv, " --profile" , true );
197+ if (!pprofile.empty ()) {
198+ args.profile = true ;
199+ }
200+
201+ const string pdry_run = GetCmdOption (argc, argv, " --dry-run=" );
202+ if (!pdry_run.empty ()) {
203+ args.dry_run = stoi (pdry_run);
204+ }
205+
206+ const string prun = GetCmdOption (argc, argv, " --run-count=" );
207+ if (!prun.empty ()) {
208+ args.run_count = stoi (prun);
209+ }
210+
211+ const string pzcopy = GetCmdOption (argc, argv, " --zero-copy" , true );
212+ if (!pzcopy.empty ()) {
213+ args.zero_copy = true ;
214+ }
181215}
182216
183217/* !
@@ -192,59 +226,174 @@ int ExecuteModel(ToolArgs& args) {
192226#endif
193227
194228 // Initialize TVM Runner
195- TVMRunner runner = TVMRunner (args.model , args.device );
229+ auto runner = new TVMRunner (args.model , args.device );
196230
197231 // Load the model
198- runner. Load ();
232+ runner-> Load ();
199233 if (!args.pre_compiled .empty ()) {
200- runner. UsePreCompiledPrograms (args.pre_compiled );
234+ runner-> UsePreCompiledPrograms (args.pre_compiled );
201235 }
202236
203237 // Query Model meta Information
204- TVMMetaInfo mInfo = runner. GetMetaInfo ();
238+ TVMMetaInfo mInfo = runner-> GetMetaInfo ();
205239
206240 // Print Meta Information
207- if (args.dump_meta ) runner.PrintMetaInfo ();
241+ if (args.dump_meta ) runner->PrintMetaInfo ();
242+
243+ int total_exec_time = 0 ;
244+
245+ if (args.profile ) {
246+ if (args.dry_run ) {
247+ for (int ii = 0 ; ii < args.dry_run ; ++ii) {
248+ runner->Run ();
249+ }
250+ TVMSynchronize (GetTVMDevice (args.device ), 0 , nullptr );
251+ }
252+ int total_time = 0 ;
253+ std::map<std::string, NDArray> input_data_even, input_data_odd;
254+ std::map<std::string, NDArray> output_data_even, output_data_odd;
255+
256+ std::map<std::string, char *> input_data;
257+ std::map<std::string, char *> output_data;
258+
259+ // Alloc / populate and keep input data ready
260+ for (auto & elem : mInfo .input_info ) {
261+ if (args.zero_copy ) {
262+ auto ndarr =
263+ NDArray::Empty (elem.second .first , tvm::runtime::String2DLDataType (elem.second .second ),
264+ DLDevice{GetTVMDevice (args.device ), 0 });
265+ input_data_even.insert ({elem.first , ndarr});
266+
267+ ndarr =
268+ NDArray::Empty (elem.second .first , tvm::runtime::String2DLDataType (elem.second .second ),
269+ DLDevice{GetTVMDevice (args.device ), 0 });
270+ input_data_odd.insert ({elem.first , ndarr});
271+ } else {
272+ char * data = (char *)malloc (runner->GetInputMemSize (elem.first ));
273+ input_data.insert ({elem.first , data});
274+ }
275+ }
276+
277+ // Alloc and keep output bufers ready
278+ for (auto & elem : mInfo .output_info ) {
279+ if (args.zero_copy ) {
280+ auto ndarr =
281+ NDArray::Empty (elem.second .first , tvm::runtime::String2DLDataType (elem.second .second ),
282+ DLDevice{GetTVMDevice (args.device ), 0 });
283+ output_data_even.insert ({elem.first , ndarr});
284+
285+ ndarr =
286+ NDArray::Empty (elem.second .first , tvm::runtime::String2DLDataType (elem.second .second ),
287+ DLDevice{GetTVMDevice (args.device ), 0 });
288+ output_data_odd.insert ({elem.first , ndarr});
289+ } else {
290+ char * data = (char *)malloc (runner->GetOutputMemSize (elem.first ));
291+ output_data.insert ({elem.first , data});
292+ }
293+ }
294+
295+ for (int ii = 0 ; ii < args.run_count ; ++ii) {
296+ // Timer start
297+ auto tstart = std::chrono::high_resolution_clock::now ();
298+ // Set random input for all input
299+ for (auto & elem : mInfo .input_info ) {
300+ if (args.zero_copy ) {
301+ if (ii % 2 ) {
302+ runner->SetInput (elem.first , input_data_even[elem.first ]);
303+ } else {
304+ runner->SetInput (elem.first , input_data_odd[elem.first ]);
305+ }
306+ } else {
307+ runner->SetInput (elem.first , input_data[elem.first ]);
308+ }
309+ }
310+
311+ if (args.zero_copy ) {
312+ // With zero copy set the result NDArray up front
313+ for (auto & elem : mInfo .output_info ) {
314+ if (ii % 2 ) {
315+ runner->SetOutput (elem.first , output_data_even[elem.first ]);
316+ } else {
317+ runner->SetOutput (elem.first , output_data_odd[elem.first ]);
318+ }
319+ }
320+ }
208321
209- if (args.input .empty () || args.output .empty ()) {
322+ // Run the model
323+ runner->Run ();
324+
325+ if (!args.zero_copy ) {
326+ // W/o zero copy we need to invoke explicite data copy
327+ for (auto & elem : mInfo .output_info ) {
328+ runner->GetOutput (elem.first , output_data[elem.first ]);
329+ }
330+ } else {
331+ // Just wait for the run to complete.
332+ TVMSynchronize (GetTVMDevice (args.device ), 0 , nullptr );
333+ }
334+
335+ // Timer end
336+ auto tend = std::chrono::high_resolution_clock::now ();
337+ LOG (INFO) << " Exec Time:" << static_cast <double >((tend - tstart).count ()) / 1e6 ;
338+ total_exec_time += static_cast <double >((tend - tstart).count ()) / 1e6 ;
339+ }
340+
341+ // Free input bufers
342+ for (auto & elem : mInfo .input_info ) {
343+ free (input_data[elem.first ]);
344+ }
345+
346+ // Free output bufers
347+ for (auto & elem : mInfo .output_info ) {
348+ free (output_data[elem.first ]);
349+ }
350+ } else if (!args.input .empty () && !args.output .empty ()) {
351+ LOG (INFO) << " Executing with Input:" << args.input << " Output:" << args.output ;
352+ // Set Input from Numpy Input
353+ runner->SetInput (args.input );
354+ // Run the model
355+ runner->Run ();
356+ // Get Output as Numpy dump
357+ runner->GetOutput (args.output );
358+ } else {
210359 LOG (INFO) << " Executing dry run ... " ;
211360 // Set random input for all inputs
212361 for (auto & elem : mInfo .input_info ) {
213362 LOG (INFO) << " Set Random Input for :" << elem.first ;
214363 auto shape = elem.second .first ;
215- size_t ssize = runner. GetInputMemSize (elem.first );
364+ size_t ssize = runner-> GetInputMemSize (elem.first );
216365 char * data = (char *)malloc (ssize);
217366 LOG (INFO) << " Random Input Size:" << ssize << " bytes" ;
218- runner. SetInput (elem.first , data);
367+ runner-> SetInput (elem.first , data);
219368 free (data);
220369 }
221-
222370 // Run the model
223- runner.Run ();
224-
371+ runner->Run ();
225372 // Get Output and dump few values
226373 for (auto & elem : mInfo .output_info ) {
227374 LOG (INFO) << " Get Output for :" << elem.first ;
228375 auto shape = elem.second .first ;
229- size_t ssize = runner. GetOutputMemSize (elem.first );
376+ size_t ssize = runner-> GetOutputMemSize (elem.first );
230377 char * data = (char *)malloc (ssize);
231- runner. GetOutput (elem.first , data);
378+ runner-> GetOutput (elem.first , data);
232379 LOG (INFO) << " Output Size:" << ssize << " bytes" ;
233380 free (data);
234381 }
235- } else {
236- LOG (INFO) << " Executing with Input:" << args.input << " Output:" << args.output ;
237-
238- // Set Input from Numpy Input
239- runner.SetInput (args.input );
240-
241- // Run the model
242- runner.Run ();
382+ }
243383
244- // Get Output as Numpy dump
245- runner.GetOutput (args.output );
384+ if (args.profile ) {
385+ // Print Stats
386+ runner->PrintStats ();
246387 }
388+ auto tstart = std::chrono::high_resolution_clock::now ();
389+ delete runner;
390+ auto tend = std::chrono::high_resolution_clock::now ();
247391
392+ if (args.profile ) {
393+ LOG (INFO) << " Average ExecTime :" << total_exec_time / args.run_count << " ms" ;
394+ LOG (INFO) << " Unload Time :" << static_cast <double >((tend - tstart).count ()) / 1e6
395+ << " ms" ;
396+ }
248397 return 0 ;
249398}
250399
0 commit comments