@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 
@@ -36,7 +37,7 @@ static float tensor_sum_elements(const ggml_tensor * tensor) {
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
             for (int k = 0; k < tensor->ne[0]; k++) {
-                sum +=  ((float *) tensor->data)[j*tensor->ne[0]+k];
+                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
             }
         }
     }
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
 
     // printf("Memsize required = %i\n", sizex*sizex);
 
+    // TODO: perform the bench for all types or for a user specified type
+    const ggml_type qtype = GGML_TYPE_Q4_1;
+
     size_t ctx_size = 0;
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
     ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
-    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
     ctx_size += 1024*1024*16;
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
     struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
     ggml_set_f32(m2, 2.0f);
 
-    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
     // printf("Creating new tensor m11xm2\n");
     struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
 
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
 
     TENSOR_DUMP(gf.nodes[0]);
 
-    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
 
     int32_t nelements = sizex*sizey;
-    int32_t ne[2] = { sizex, sizey };
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
-    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
     printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
-    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    // Let's use the F32 result from above as a reference for the quantized multiplication
     float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
 
     printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
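
The switch from `ggml_quantize_q4_0` to `ggml_quantize_chunk` is what makes the benchmark type-agnostic: the chunk function dispatches on its `ggml_type` argument, so changing the single `qtype` constant retargets both the context sizing and the quantization step. Below is a minimal standalone sketch of the same call pattern, assuming the `ggml.h` from this tree; the standalone `main`, the matrix dimensions, and the generous destination-buffer sizing are illustrative choices, not part of the patch.

```cpp
#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const ggml_type qtype = GGML_TYPE_Q4_1;  // same single knob as the benchmark

    // 32x32 source matrix in F32; the element count is a multiple of the
    // 32-element quantization block size.
    std::vector<float> src(32 * 32, 1.0f);

    // Generous upper bound for the destination: quantized data is never
    // larger than the F32 source it was produced from.
    std::vector<uint8_t> dst(src.size() * sizeof(float));

    // Histogram of quantization buckets, sized as in the benchmark.
    std::vector<int64_t> hist(1 << 4, 0);

    // ggml_quantize_chunk(type, src, dst, start, n, hist) dispatches on the
    // type tag, replacing the per-type ggml_quantize_q4_0/q4_1 entry points,
    // and returns the number of bytes written.
    const size_t bytes = ggml_quantize_chunk(qtype, src.data(), dst.data(), 0,
                                             (int) src.size(), hist.data());

    printf("quantized %d F32 values to %zu bytes as %s\n",
           (int) src.size(), bytes, ggml_type_name(qtype));
    return 0;
}
```

With this shape, extending the benchmark per the TODO above would mean looping the quantize-and-multiply tests over a list of types (or one chosen on the command line) instead of the hard-coded `GGML_TYPE_Q4_1`.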