@@ -597,11 +597,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                         number of layers to store in VRAM\n");
     fprintf(stdout, "   -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                         how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "   -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "   -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "   -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
-    fprintf(stdout, "                         Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
-    fprintf(stdout, "                         is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
+    fprintf(stdout, "   -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "   -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
+    fprintf(stdout, "   -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                         use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                         Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
     fprintf(stdout, "   --mtest               compute maximum memory usage\n");
     fprintf(stdout, "   --export              export the computation graph to 'llama.ggml'\n");
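For context, a minimal sketch of the parser-side handling this usage text implies: the old opt-in `-mmq` flag becomes the default, and the new `-nommq` / `--no-mul-mat-q` flag opts back out to cuBLAS. This is an illustrative sketch, not the commit's actual parsing code; the struct `gpt_params_sketch` and helper `parse_mmq_flag` are hypothetical stand-ins, assuming the flag flips a `mul_mat_q` boolean as the params struct in llama.cpp of this era does.

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-in for the params struct this flag would toggle.
struct gpt_params_sketch {
    bool mul_mat_q = true; // custom mul_mat_q kernels are now the default
};

// Illustrative handler for the new flag from the usage text above.
// Returns true if the argument was recognized and consumed.
bool parse_mmq_flag(const std::string & arg, gpt_params_sketch & params) {
    if (arg == "-nommq" || arg == "--no-mul-mat-q") {
#ifdef GGML_USE_CUBLAS
        params.mul_mat_q = false; // fall back to cuBLAS (slower, more VRAM)
#else
        fprintf(stderr, "warning: compiled without cuBLAS; -nommq has no effect\n");
#endif // GGML_USE_CUBLAS
        return true;
    }
    return false;
}

int main(int argc, char ** argv) {
    gpt_params_sketch params;
    for (int i = 1; i < argc; ++i) {
        parse_mmq_flag(argv[i], params);
    }
    printf("mul_mat_q: %s\n", params.mul_mat_q ? "enabled (default)" : "disabled");
    return 0;
}
```

Inverting the flag this way keeps existing command lines working: scripts that never passed `-mmq` silently pick up the faster kernels, while anyone who needs the old cuBLAS path states that explicitly.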