@@ -671,12 +671,11 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
671671 fprintf (stdout, " number of layers to store in VRAM\n " );
672672 fprintf (stdout, " -ts SPLIT --tensor-split SPLIT\n " );
673673 fprintf (stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n " );
674- fprintf (stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n " );
675674 fprintf (stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n " );
676675 fprintf (stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n " );
677- fprintf (stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n " );
678- fprintf (stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n " );
679- fprintf (stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n " );
676+ fprintf (stdout, " -nommq, --no-mul-mat-q\n " );
677+ fprintf (stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n " );
678+ fprintf (stdout, " Not recommended since this is both slower and uses more VRAM.\n " );
680679#endif
681680 fprintf (stdout, " -m FNAME, --model FNAME\n " );
682681 fprintf (stdout, " model path (default: %s)\n " , params.model .c_str ());
@@ -867,12 +866,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
867866 LOG_WARNING (" warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n " , {});
868867#endif // GGML_USE_CUBLAS
869868 }
870- else if (arg == " --mul-mat-q" || arg == " -mmq" )
869+ else if (arg == " --no-mul-mat-q" || arg == " -nommq" )
871870 {
872871#ifdef GGML_USE_CUBLAS
873- params.mul_mat_q = true;
872+ params.mul_mat_q = false;
874873#else
875- LOG_WARNING (" warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n " , {});
874+ LOG_WARNING (" warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n " , {});
876875#endif // GGML_USE_CUBLAS
877876 }
878877 else if (arg == " --main-gpu" || arg == " -mg" )
0 commit comments