 #include <iterator>
 #include <algorithm>
 
-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -27,21 +27,15 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
     return sum;
 }
 
+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}
 
-/*
-    These are mapping to unknown
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-*/
-
-#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
-
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
-        TENSOR->type, TENSOR_TYPE_AS_STR(TENSOR->type), \
-        (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
-    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n", #TENSOR, sum); }
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
 
 struct benchmark_params_struct {
     int32_t n_threads = 1;
@@ -59,8 +53,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 }
 
 int main(int argc, char ** argv) {
-
-
     struct benchmark_params_struct benchmark_params;
 
     bool invalid_param = false;
@@ -84,11 +76,11 @@ int main(int argc, char ** argv) {
             print_usage(argc, argv, benchmark_params);
             exit(0);
         }
-        if (invalid_param) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, benchmark_params);
-            exit(1);
-        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
     }
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -216,9 +208,8 @@ int main(int argc, char ** argv) {
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
-    printf("==============================================================================================\n");
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");
 
     for (int i = 0; i < benchmark_params.n_iterations; i++) {
 
@@ -227,12 +218,12 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop - start;
-        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
             gf31.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
-            usec, flops_per_usec);
+            usec, gflops);
 
 #ifdef VERBOSE_DEBUGGING
         TENSOR_DUMP("res", gf31.nodes[0])
@@ -256,7 +247,5 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
         ggml_graph_compute(ctx, &gf32);
-
     }
-
 }
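
A note on the main refactor above: the old TENSOR_DUMP was a multi-line macro that expanded the whole printf body (plus the hand-rolled TENSOR_TYPE_AS_STR ternary chain) at every call site; the new version moves the body into tensor_dump() and keeps only a one-line macro whose sole job is stringizing its argument, so TENSOR_DUMP(m11) still prints the variable's own name. A minimal standalone sketch of the same pattern, with hypothetical dump_named/DUMP names that are not part of this diff:

#include <cstdio>

// A plain function does the real work; unlike a multi-line macro body,
// it is type-checked once and can be stepped through in a debugger.
static void dump_named(const float * data, int n, const char * name) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        sum += data[i];
    }
    printf("%15s: n = %5d, sum = %6.2f\n", name, n, sum);
}

// The macro survives only to capture the variable's name via the
// stringizing operator (#), just as TENSOR_DUMP(tensor) does above.
#define DUMP(v, n) dump_named(v, n, #v)

int main() {
    float m11[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    DUMP(m11, 4);   // prints "m11: n =     4, sum =  10.00"
    return 0;
}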
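The other substantive change is the throughput unit: flops_per_matrix/usec is FLOPs per microsecond, and since one second is 10^6 microseconds, dividing by a further 1000 converts it to gigaFLOPS (FLOPs/µs × 10^6 / 10^9 = FLOPs/µs / 1000), which is what the new column header reports. A small self-contained check of that conversion, using made-up numbers rather than anything measured by this benchmark:

#include <cstdio>

int main() {
    // Hypothetical figures for illustration only: a matmul costing
    // 2 * 1024^3 FLOPs that took 50,000 microseconds.
    long long int flops_per_matrix = 2LL * 1024 * 1024 * 1024;
    long long int usec = 50000;

    // Same expression as the diff: FLOPs per microsecond, then /1000 -> GFLOPS.
    double gflops = (double)(flops_per_matrix)/usec/1000.0;

    // Cross-check via seconds: FLOPs / seconds / 1e9 must agree.
    double gflops_check = (double)(flops_per_matrix) / (usec / 1e6) / 1e9;

    printf("gflops = %10.2f, check = %10.2f\n", gflops, gflops_check);  // both 42.95
    return 0;
}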