@@ -176,7 +176,6 @@ struct cmd_params {
176176    std::vector<llama_split_mode> split_mode;
177177    std::vector<int > main_gpu;
178178    std::vector<bool > no_kv_offload;
179-     std::vector<bool > mul_mat_q;
180179    std::vector<std::vector<float >> tensor_split;
181180    std::vector<bool > use_mmap;
182181    int  reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
196195    /*  split_mode    */   {LLAMA_SPLIT_MODE_LAYER},
197196    /*  main_gpu      */   {0 },
198197    /*  no_kv_offload */   {false },
199-     /*  mul_mat_q     */   {true },
200198    /*  tensor_split  */   {std::vector<float >(llama_max_devices (), 0 .0f )},
201199    /*  use_mmap      */   {true },
202200    /*  reps          */   5 ,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
221219    printf ("   -mg, --main-gpu <i>                 (default: %s)\n "  , join (cmd_params_defaults.main_gpu , " ,"  ).c_str ());
222220    printf ("   -nkvo, --no-kv-offload <0|1>        (default: %s)\n "  , join (cmd_params_defaults.no_kv_offload , " ,"  ).c_str ());
223221    printf ("   -mmp, --mmap <0|1>                  (default: %s)\n "  , join (cmd_params_defaults.use_mmap , " ,"  ).c_str ());
224-     printf ("   -mmq, --mul-mat-q <0|1>             (default: %s)\n "  , join (cmd_params_defaults.mul_mat_q , " ,"  ).c_str ());
225222    printf ("   -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n "  );
226223    printf ("   -r, --repetitions <n>               (default: %d)\n "  , cmd_params_defaults.reps );
227224    printf ("   -o, --output <csv|json|md|sql>      (default: %s)\n "  , output_format_str (cmd_params_defaults.output_format ));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
383380            }
384381            auto  p = split<bool >(argv[i], split_delim);
385382            params.no_kv_offload .insert (params.no_kv_offload .end (), p.begin (), p.end ());
386-         } else  if  (arg == " -mmq"   || arg == " --mul-mat-q"  ) {
387-             if  (++i >= argc) {
388-                 invalid_param = true ;
389-                 break ;
390-             }
391-             auto  p = split<bool >(argv[i], split_delim);
392-             params.mul_mat_q .insert (params.mul_mat_q .end (), p.begin (), p.end ());
393383        } else  if  (arg == " -mmp"   || arg == " --mmap"  ) {
394384            if  (++i >= argc) {
395385                invalid_param = true ;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
466456    if  (params.split_mode .empty ())   { params.split_mode  = cmd_params_defaults.split_mode ; }
467457    if  (params.main_gpu .empty ())     { params.main_gpu  = cmd_params_defaults.main_gpu ; }
468458    if  (params.no_kv_offload .empty ()){ params.no_kv_offload  = cmd_params_defaults.no_kv_offload ; }
469-     if  (params.mul_mat_q .empty ())    { params.mul_mat_q  = cmd_params_defaults.mul_mat_q ; }
470459    if  (params.tensor_split .empty ()) { params.tensor_split  = cmd_params_defaults.tensor_split ; }
471460    if  (params.use_mmap .empty ())     { params.use_mmap  = cmd_params_defaults.use_mmap ; }
472461    if  (params.n_threads .empty ())    { params.n_threads  = cmd_params_defaults.n_threads ; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
486475    llama_split_mode split_mode;
487476    int  main_gpu;
488477    bool  no_kv_offload;
489-     bool  mul_mat_q;
490478    std::vector<float > tensor_split;
491479    bool  use_mmap;
492480
@@ -518,7 +506,6 @@ struct cmd_params_instance {
518506        cparams.n_batch  = n_batch;
519507        cparams.type_k  = type_k;
520508        cparams.type_v  = type_v;
521-         cparams.mul_mat_q  = mul_mat_q;
522509        cparams.offload_kqv  = !no_kv_offload;
523510
524511        return  cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
538525    for  (const  auto  & nb : params.n_batch )
539526    for  (const  auto  & tk : params.type_k )
540527    for  (const  auto  & tv : params.type_v )
541-     for  (const  auto  & mmq : params.mul_mat_q )
542528    for  (const  auto  & nkvo : params.no_kv_offload )
543529    for  (const  auto  & nt : params.n_threads ) {
544530        for  (const  auto  & n_prompt : params.n_prompt ) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
557543                /*  .split_mode   = */   sm,
558544                /*  .main_gpu     = */   mg,
559545                /*  .no_kv_offload= */   nkvo,
560-                 /*  .mul_mat_q    = */   mmq,
561546                /*  .tensor_split = */   ts,
562547                /*  .use_mmap     = */   mmp,
563548            };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
580565                /*  .split_mode   = */   sm,
581566                /*  .main_gpu     = */   mg,
582567                /*  .no_kv_offload= */   nkvo,
583-                 /*  .mul_mat_q    = */   mmq,
584568                /*  .tensor_split = */   ts,
585569                /*  .use_mmap     = */   mmp,
586570            };
@@ -616,7 +600,6 @@ struct test {
616600    llama_split_mode split_mode;
617601    int  main_gpu;
618602    bool  no_kv_offload;
619-     bool  mul_mat_q;
620603    std::vector<float > tensor_split;
621604    bool  use_mmap;
622605    int  n_prompt;
@@ -639,7 +622,6 @@ struct test {
639622        split_mode = inst.split_mode ;
640623        main_gpu = inst.main_gpu ;
641624        no_kv_offload = inst.no_kv_offload ;
642-         mul_mat_q = inst.mul_mat_q ;
643625        tensor_split = inst.tensor_split ;
644626        use_mmap = inst.use_mmap ;
645627        n_prompt = inst.n_prompt ;
@@ -713,7 +695,7 @@ struct test {
713695            " n_batch"  , " n_threads"  , " type_k"  , " type_v"  ,
714696            " n_gpu_layers"  , " split_mode"  ,
715697            " main_gpu"  , " no_kv_offload"  ,
716-             " mul_mat_q " ,  " tensor_split"  , " use_mmap"  ,
698+             " tensor_split"  , " use_mmap"  ,
717699            " n_prompt"  , " n_gen"  , " test_time"  ,
718700            " avg_ns"  , " stddev_ns"  ,
719701            " avg_ts"  , " stddev_ts" 
@@ -733,7 +715,7 @@ struct test {
733715        }
734716        if  (field == " cuda"   || field == " opencl"    || field == " vulkan"   || field == " kompute"   || field == " metal"   ||
735717            field == " gpu_blas"   || field == " blas"   || field == " sycl"   ||field == " f16_kv"   || field == " no_kv_offload"   ||
736-             field == " mul_mat_q "  || field ==  " use_mmap"  ) {
718+             field == " use_mmap"  ) {
737719            return  BOOL;
738720        }
739721        if  (field == " avg_ts"   || field == " stddev_ts"  ) {
@@ -767,7 +749,7 @@ struct test {
767749            std::to_string (n_batch), std::to_string (n_threads), ggml_type_name (type_k), ggml_type_name (type_v),
768750            std::to_string (n_gpu_layers), split_mode_str (split_mode),
769751            std::to_string (main_gpu), std::to_string (no_kv_offload),
770-             std::to_string (mul_mat_q),  tensor_split_str, std::to_string (use_mmap),
752+             tensor_split_str, std::to_string (use_mmap),
771753            std::to_string (n_prompt), std::to_string (n_gen), test_time,
772754            std::to_string (avg_ns ()), std::to_string (stdev_ns ()),
773755            std::to_string (avg_ts ()), std::to_string (stdev_ts ())
@@ -931,9 +913,6 @@ struct markdown_printer : public printer {
931913        if  (field == " n_threads"  ) {
932914            return  " threads"  ;
933915        }
934-         if  (field == " mul_mat_q"  ) {
935-             return  " mmq"  ;
936-         }
937916        if  (field == " no_kv_offload"  ) {
938917            return  " nkvo"  ;
939918        }
@@ -974,9 +953,6 @@ struct markdown_printer : public printer {
974953        if  (params.split_mode .size () > 1  || params.split_mode  != cmd_params_defaults.split_mode ) {
975954            fields.emplace_back (" split_mode"  );
976955        }
977-         if  (params.mul_mat_q .size () > 1  || params.mul_mat_q  != cmd_params_defaults.mul_mat_q ) {
978-             fields.emplace_back (" mul_mat_q"  );
979-         }
980956        if  (params.no_kv_offload .size () > 1  || params.no_kv_offload  != cmd_params_defaults.no_kv_offload ) {
981957            fields.emplace_back (" no_kv_offload"  );
982958        }
0 commit comments