@@ -176,7 +176,6 @@ struct cmd_params {
176176 std::vector<llama_split_mode> split_mode;
177177 std::vector<int > main_gpu;
178178 std::vector<bool > no_kv_offload;
179- std::vector<bool > mul_mat_q;
180179 std::vector<std::vector<float >> tensor_split;
181180 std::vector<bool > use_mmap;
182181 int reps;
@@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = {
196195 /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
197196 /* main_gpu */ {0 },
198197 /* no_kv_offload */ {false },
199- /* mul_mat_q */ {true },
200198 /* tensor_split */ {std::vector<float >(llama_max_devices (), 0 .0f )},
201199 /* use_mmap */ {true },
202200 /* reps */ 5 ,
@@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) {
221219 printf (" -mg, --main-gpu <i> (default: %s)\n " , join (cmd_params_defaults.main_gpu , " ," ).c_str ());
222220 printf (" -nkvo, --no-kv-offload <0|1> (default: %s)\n " , join (cmd_params_defaults.no_kv_offload , " ," ).c_str ());
223221 printf (" -mmp, --mmap <0|1> (default: %s)\n " , join (cmd_params_defaults.use_mmap , " ," ).c_str ());
224- printf (" -mmq, --mul-mat-q <0|1> (default: %s)\n " , join (cmd_params_defaults.mul_mat_q , " ," ).c_str ());
225222 printf (" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n " );
226223 printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
227224 printf (" -o, --output <csv|json|md|sql> (default: %s)\n " , output_format_str (cmd_params_defaults.output_format ));
@@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
383380 }
384381 auto p = split<bool >(argv[i], split_delim);
385382 params.no_kv_offload .insert (params.no_kv_offload .end (), p.begin (), p.end ());
386- } else if (arg == " -mmq" || arg == " --mul-mat-q" ) {
387- if (++i >= argc) {
388- invalid_param = true ;
389- break ;
390- }
391- auto p = split<bool >(argv[i], split_delim);
392- params.mul_mat_q .insert (params.mul_mat_q .end (), p.begin (), p.end ());
393383 } else if (arg == " -mmp" || arg == " --mmap" ) {
394384 if (++i >= argc) {
395385 invalid_param = true ;
@@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
466456 if (params.split_mode .empty ()) { params.split_mode = cmd_params_defaults.split_mode ; }
467457 if (params.main_gpu .empty ()) { params.main_gpu = cmd_params_defaults.main_gpu ; }
468458 if (params.no_kv_offload .empty ()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload ; }
469- if (params.mul_mat_q .empty ()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q ; }
470459 if (params.tensor_split .empty ()) { params.tensor_split = cmd_params_defaults.tensor_split ; }
471460 if (params.use_mmap .empty ()) { params.use_mmap = cmd_params_defaults.use_mmap ; }
472461 if (params.n_threads .empty ()) { params.n_threads = cmd_params_defaults.n_threads ; }
@@ -486,7 +475,6 @@ struct cmd_params_instance {
486475 llama_split_mode split_mode;
487476 int main_gpu;
488477 bool no_kv_offload;
489- bool mul_mat_q;
490478 std::vector<float > tensor_split;
491479 bool use_mmap;
492480
@@ -518,7 +506,6 @@ struct cmd_params_instance {
518506 cparams.n_batch = n_batch;
519507 cparams.type_k = type_k;
520508 cparams.type_v = type_v;
521- cparams.mul_mat_q = mul_mat_q;
522509 cparams.offload_kqv = !no_kv_offload;
523510
524511 return cparams;
@@ -538,7 +525,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
538525 for (const auto & nb : params.n_batch )
539526 for (const auto & tk : params.type_k )
540527 for (const auto & tv : params.type_v )
541- for (const auto & mmq : params.mul_mat_q )
542528 for (const auto & nkvo : params.no_kv_offload )
543529 for (const auto & nt : params.n_threads ) {
544530 for (const auto & n_prompt : params.n_prompt ) {
@@ -557,7 +543,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
557543 /* .split_mode = */ sm,
558544 /* .main_gpu = */ mg,
559545 /* .no_kv_offload= */ nkvo,
560- /* .mul_mat_q = */ mmq,
561546 /* .tensor_split = */ ts,
562547 /* .use_mmap = */ mmp,
563548 };
@@ -580,7 +565,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
580565 /* .split_mode = */ sm,
581566 /* .main_gpu = */ mg,
582567 /* .no_kv_offload= */ nkvo,
583- /* .mul_mat_q = */ mmq,
584568 /* .tensor_split = */ ts,
585569 /* .use_mmap = */ mmp,
586570 };
@@ -639,7 +623,6 @@ struct test {
639623 split_mode = inst.split_mode ;
640624 main_gpu = inst.main_gpu ;
641625 no_kv_offload = inst.no_kv_offload ;
642- mul_mat_q = inst.mul_mat_q ;
643626 tensor_split = inst.tensor_split ;
644627 use_mmap = inst.use_mmap ;
645628 n_prompt = inst.n_prompt ;
@@ -974,9 +957,6 @@ struct markdown_printer : public printer {
974957 if (params.split_mode .size () > 1 || params.split_mode != cmd_params_defaults.split_mode ) {
975958 fields.emplace_back (" split_mode" );
976959 }
977- if (params.mul_mat_q .size () > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q ) {
978- fields.emplace_back (" mul_mat_q" );
979- }
980960 if (params.no_kv_offload .size () > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload ) {
981961 fields.emplace_back (" no_kv_offload" );
982962 }
0 commit comments