@@ -297,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
297297struct  ProfilingInfo  {
298298    std::string op_name;
299299    std::string kernel_name;
300-     //  Kernel execution time in nanoseconds.
301-     cl_ulong duration_ns;
300+ 
301+     cl_kernel kernel;
302+     cl_event evt;
303+ 
304+     cl_ulong cmd_queued;
305+     cl_ulong cmd_submit;
306+     cl_ulong cmd_start;
307+     cl_ulong cmd_end;
308+     cl_ulong overhead_start;
309+     cl_ulong overhead_end;
310+     //  For the times below, see spec for clGetEventProfilingInfo
311+     //  The time kernel spent in cmd queue - SUBMIT - QUEUED
312+     cl_ulong cmd_queued_duration_ns;
313+     //  The time kernel spent for submission - START - SUBMIT
314+     cl_ulong cmd_submit_duration_ns;
315+     //  Kernel execution time in nanoseconds - END - START
316+     cl_ulong cmd_duration_ns;
317+     //  The time for the kernel to complete - COMPLETE - END
318+     cl_ulong cmd_complete_duration_ns;
319+     //  Total time to finish the kernel - COMPELTE - QUEUED
320+     cl_ulong cmd_total_duration_ns;
302321    //  Global and local work sizes.
303322    size_t  global_size[3 ];
304323    size_t  local_size[3 ];
@@ -903,19 +922,84 @@ static void ggml_cl2_free(void) {
903922        return ;
904923    }
905924
925+     //  Populate profiling info
926+     for  (ProfilingInfo & info : g_profiling_info) {
927+         cl_ulong cmd_queued;
928+         cl_ulong cmd_submit;
929+         cl_ulong cmd_start;
930+         cl_ulong cmd_end;
931+         cl_ulong cmd_complete;
932+ 
933+         CL_CHECK (clWaitForEvents (1 , &info.evt ));
934+         CL_CHECK (clGetEventProfilingInfo (
935+             info.evt , CL_PROFILING_COMMAND_QUEUED, sizeof (cl_ulong), &cmd_queued, NULL ));
936+         CL_CHECK (clGetEventProfilingInfo (
937+             info.evt , CL_PROFILING_COMMAND_SUBMIT, sizeof (cl_ulong), &cmd_submit, NULL ));
938+         CL_CHECK (clGetEventProfilingInfo (
939+             info.evt , CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &cmd_start, NULL ));
940+         CL_CHECK (clGetEventProfilingInfo (
941+             info.evt , CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &cmd_end, NULL ));
942+         CL_CHECK (clGetEventProfilingInfo (
943+             info.evt , CL_PROFILING_COMMAND_COMPLETE, sizeof (cl_ulong), &cmd_complete, NULL ));
944+         CL_CHECK (clReleaseEvent (info.evt ));
945+ 
946+         char  kernel_name[512 ];
947+         CL_CHECK (clGetKernelInfo (info.kernel , CL_KERNEL_FUNCTION_NAME,
948+             sizeof (kernel_name), kernel_name, NULL ));
949+         info.kernel_name  = kernel_name;
950+ 
951+         info.cmd_queued  = cmd_queued;
952+         info.cmd_submit  = cmd_submit;
953+         info.cmd_start   = cmd_start;
954+         info.cmd_end     = cmd_end;
955+ 
956+         info.cmd_queued_duration_ns      = cmd_submit    - cmd_queued;
957+         info.cmd_submit_duration_ns      = cmd_start     - cmd_submit;
958+         info.cmd_duration_ns             = cmd_end       - cmd_start;
959+         info.cmd_complete_duration_ns    = cmd_complete  - cmd_end;
960+         info.cmd_total_duration_ns       = cmd_complete  - cmd_queued;
961+     }
962+ 
963+     //  Dump a csv
906964    float  total_kernel_time = 0 ;
907-     fprintf (fperf, " op name, kernel name, duration (ms), global size, local size, output size\n " 
965+     fprintf (fperf, " op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total  duration (ms), global size, local size, output size\n " 
908966    for  (const  ProfilingInfo & info : g_profiling_info) {
909-         total_kernel_time += info.duration_ns /1 .e6f ;
910-         fprintf (fperf, " %s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " 
911-             info.op_name .c_str (), info.kernel_name .c_str (), info.duration_ns /1 .e6f ,
967+         total_kernel_time += info.cmd_duration_ns /1 .e6f ;
968+         fprintf (fperf, " %s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n " 
969+             info.op_name .c_str (), info.kernel_name .c_str (),
970+             info.cmd_queued_duration_ns /1 .e6f ,
971+             info.cmd_submit_duration_ns /1 .e6f ,
972+             info.cmd_duration_ns /1 .e6f ,
973+             info.cmd_complete_duration_ns /1 .e6f ,
974+             info.cmd_total_duration_ns /1 .e6f ,
912975            info.global_size [0 ], info.global_size [1 ], info.global_size [2 ],
913976            info.local_size [0 ], info.local_size [2 ], info.local_size [2 ],
914977            info.output_size [0 ], info.output_size [1 ], info.output_size [2 ], info.output_size [3 ]);
915978    }
916979    fclose (fperf);
917980
918981    GGML_LOG_INFO (" ggml_opencl: total kernel time: %f\n " 
982+ 
983+     //  Dump a simple chrome trace
984+     FILE* ftrace = fopen (" cl_trace.json" " w" 
985+     if  (!ftrace) {
986+         GGML_LOG_ERROR (" Failed to open cl_trace.json\n " 
987+         return ;
988+     }
989+ 
990+     fprintf (ftrace, " [\n " 
991+     for  (const  ProfilingInfo & info : g_profiling_info) {
992+         fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " 
993+             info.kernel_name .c_str (), info.cmd_queued /1000 );
994+         fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Host\" },\n " 
995+             info.kernel_name .c_str (), info.cmd_submit /1000 );
996+ 
997+         fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" B\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " 
998+             info.kernel_name .c_str (), info.cmd_start /1000 );
999+         fprintf (ftrace, " {\" name\" : \" %s\" , \" cat\" : \" OpenCL\" , \" ph\" : \" E\" , \" ts\" : %lu, \" pid\" : \"\" , \" tid\" : \" Device\" },\n " 
1000+             info.kernel_name .c_str (), info.cmd_end /1000 );
1001+     }
1002+     fclose (ftrace);
9191003#endif 
9201004}
9211005
@@ -2062,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
20622146//  Profiling utility
20632147// ------------------------------------------------------------------------------
20642148#ifdef  GGML_OPENCL_PROFILING
2065- void  populateProfilingInfo (
2149+ static   void  populateProfilingInfo (
20662150        ProfilingInfo& info, cl_event evt, cl_kernel kernel,
20672151        size_t  global_size[3 ], size_t  local_size[3 ],
20682152        const  ggml_tensor * tensor) {
2069-     cl_ulong start;
2070-     cl_ulong end;
2071-     CL_CHECK (clWaitForEvents (1 , &evt));
2072-     CL_CHECK (clGetEventProfilingInfo (
2073-         evt, CL_PROFILING_COMMAND_START, sizeof (cl_ulong), &start, NULL ));
2074-     CL_CHECK (clGetEventProfilingInfo (
2075-         evt, CL_PROFILING_COMMAND_END, sizeof (cl_ulong), &end, NULL ));
2076- 
2077-     char  kernel_name[512 ];
2078-     CL_CHECK (clGetKernelInfo (kernel, CL_KERNEL_FUNCTION_NAME,
2079-         sizeof (kernel_name), kernel_name, NULL ));
2080- 
2081-     info.duration_ns  = end - start;
2082-     info.op_name  = tensor->name ;
2083-     info.kernel_name  = kernel_name;
2153+     info.op_name      = tensor->name ;
2154+     info.kernel       = kernel;
2155+     info.evt          = evt;
2156+ 
20842157    info.local_size [0 ]  = local_size[0 ];
20852158    info.local_size [1 ]  = local_size[1 ];
20862159    info.local_size [2 ]  = local_size[2 ];
0 commit comments