@@ -597,11 +597,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        number of layers to store in VRAM\n");
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
     fprintf(stdout, "  --mtest               compute maximum memory usage\n");
     fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
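For context, the new `-nommq` / `--no-mul-mat-q` help text implies a matching change on the argument-parsing side: the flag becomes an opt-out that switches back to cuBLAS-based matrix multiplication. Below is a minimal, self-contained sketch of that pattern, not the actual llama.cpp parser; the struct, field name, and default value are assumptions for illustration only.

```cpp
// Sketch only: shows how an opt-out --no-mul-mat-q flag could be consumed.
// The params_sketch struct and its default are hypothetical, not llama.cpp's gpt_params.
#include <cstdio>
#include <cstring>

struct params_sketch {
    bool mul_mat_q = true; // custom mul_mat_q kernels enabled by default (assumption)
};

static void parse_args_sketch(int argc, char ** argv, params_sketch & params) {
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "-nommq") == 0 || std::strcmp(argv[i], "--no-mul-mat-q") == 0) {
            // fall back to cuBLAS-style matrix multiplication
            params.mul_mat_q = false;
        }
    }
}

int main(int argc, char ** argv) {
    params_sketch params;
    parse_args_sketch(argc, argv, params);
    std::fprintf(stdout, "mul_mat_q kernels %s\n", params.mul_mat_q ? "enabled" : "disabled");
    return 0;
}
```

Modelling the option as an opt-out (default `true`) matches the framing of the new help text, which treats the custom mul_mat_q kernels as the preferred path and the cuBLAS fallback as not recommended.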