@@ -17,11 +17,11 @@
 #include <algorithm>
 
 float tensor_sum_elements(const ggml_tensor * tensor) {
-    float sum = 0;
-    if (tensor->type == GGML_TYPE_F32) {
+    double sum = 0;
+    if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
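
The accumulator change from float to double in the hunk above guards against a real pitfall: with the matrix sizes this benchmark uses, a float running sum eventually becomes so large that adding another element no longer changes it. A minimal standalone sketch (not part of the commit, values chosen only to show the effect):

#include <cstdio>

int main() {
    float  f = 0.0f;
    double d = 0.0;
    // add 2^25 ones; the float sum stalls at 2^24, because from there on
    // the spacing between adjacent representable floats exceeds 1.0f
    for (int i = 0; i < (1 << 25); i++) {
        f += 1.0f;
        d += 1.0;
    }
    printf("float sum:  %.1f\n", f);  // 16777216.0 -- saturated at 2^24
    printf("double sum: %.1f\n", d);  // 33554432.0 -- exact
}
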
@@ -110,12 +110,15 @@ int main(int argc, char ** argv) {
 
     // printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -148,7 +151,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -165,17 +168,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -187,8 +189,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -206,7 +208,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
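
For context on the API swap in this diff: ggml_quantize_chunk dispatches on its ggml_type argument, so the benchmark no longer needs a type-specific helper such as ggml_quantize_q4_0 and can quantize into whatever qtype is configured. A minimal usage sketch against the ggml.h of this era (buffer sizes and fill values are made up for illustration):

#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int n = 32 * 1024;                 // element count, a multiple of the Q4 block size (32)
    std::vector<float>   src(n, 0.5f);       // dummy input data
    std::vector<uint8_t> dst(n);             // Q4_1 output is well under 1 byte per element
    std::vector<int64_t> hist(1 << 4, 0);    // 16-bucket histogram, as in the benchmark

    // quantize the whole buffer in one chunk: start = 0, count = n
    size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q4_1, src.data(), dst.data(), 0, n, hist.data());
    printf("%d floats -> %zu bytes of %s\n", n, bytes, ggml_type_name(GGML_TYPE_Q4_1));
    return 0;
}

Passing 0 and nelements as the diff does quantizes the entire tensor in a single call; the start/count pair exists so callers can split the work across threads.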