diff --git "a/docs/source/Instruction/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" "b/docs/source/Instruction/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" index a9f2a33152..0cc4945bff 100644 --- "a/docs/source/Instruction/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" +++ "b/docs/source/Instruction/LLM\345\276\256\350\260\203\346\226\207\346\241\243.md" @@ -59,12 +59,12 @@ sft_args = SftArguments( dataset=[f'{DatasetName.blossom_math_zh}#2000'], output_dir='output') result = sft_main(sft_args) -best_model_checkpoint = result['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = result['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') torch.cuda.empty_cache() infer_args = InferArguments( - ckpt_dir=best_model_checkpoint, + ckpt_dir=last_model_checkpoint, load_dataset_config=True) # merge_lora(infer_args, device_map='cpu') result = infer_main(infer_args) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index e7c37f6e6b..8d3f0daede 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -116,44 +116,44 @@ |qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| |qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, 
v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| |qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| -|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| -|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| -|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| -|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| -|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| -|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| -|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| -|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| -|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| -|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| -|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| -|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| -|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| -|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| -|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| -|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| -|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| -|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| -|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| -|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| -|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| -|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| -|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| -|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| -|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| 
-|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| -|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| -|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| +|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| +|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| +|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| +|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| +|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, 
v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| +|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| +|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| +|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| +|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| +|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| +|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| 
+|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| +|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| +|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| +|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| +|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| +|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| 
+|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| +|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| +|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| +|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| +|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| +|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| +|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, 
v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| +|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| +|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| +|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| +|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| |qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| |qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| |qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| 
-|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| -|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| -|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| +|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| +|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| +|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| |qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| 
-|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| +|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| |qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| -|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| +|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| |chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| 
|chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| diff --git "a/docs/source/LLM/Qwen1.5\345\205\250\346\265\201\347\250\213\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/LLM/Qwen1.5\345\205\250\346\265\201\347\250\213\346\234\200\344\275\263\345\256\236\350\267\265.md" index 6dab1796fd..4083f8a0ec 100644 --- "a/docs/source/LLM/Qwen1.5\345\205\250\346\265\201\347\250\213\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/LLM/Qwen1.5\345\205\250\346\265\201\347\250\213\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -198,8 +198,8 @@ sft_args = SftArguments( model_name=['小黄', 'Xiao Huang'], model_author=['魔搭', 'ModelScope']) output = sft_main(sft_args) -best_model_checkpoint = output['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = output['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') ``` 如果你想要在3090的机器中进行训练, 你可以**降低max_length**为1024, 使用模型并行, 或者使用deepspeed-zero3. 
diff --git "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" index 06427a3518..c990a3fd7b 100644 --- "a/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source/LLM/\350\207\252\346\210\221\350\256\244\347\237\245\345\276\256\350\260\203\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -119,8 +119,8 @@ sft_args = SftArguments( model_name=['小黄', 'Xiao Huang'], model_author=['魔搭', 'ModelScope']) output = sft_main(sft_args) -best_model_checkpoint = output['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = output['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') """Out[0] [INFO:swift] The logging file will be saved in: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/logging.jsonl @@ -157,7 +157,7 @@ Train: 100%|██████████████████████ [INFO:swift] best_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 [INFO:swift] images_dir: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/images [INFO:swift] End time of running main: 2024-06-07 10:18:41.386561 -best_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 +last_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 """ ``` @@ -197,7 +197,7 @@ swift sft \ ``` ## 微调后推理 -你需要设置`best_model_checkpoint`的值, 该值会在sft的最后被打印出来. +你需要设置`last_model_checkpoint`的值, 该值会在sft的最后被打印出来. 
使用python: ```python @@ -206,8 +206,8 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' from swift.llm import InferArguments, merge_lora, infer_main -best_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' -infer_args = InferArguments(ckpt_dir=best_model_checkpoint) +last_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' +infer_args = InferArguments(ckpt_dir=last_model_checkpoint) merge_lora(infer_args, device_map='cpu') result = infer_main(infer_args) @@ -281,8 +281,8 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' from swift.llm import AppUIArguments, merge_lora, app_ui_main -best_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' -app_ui_args = AppUIArguments(ckpt_dir=best_model_checkpoint) +last_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' +app_ui_args = AppUIArguments(ckpt_dir=last_model_checkpoint) merge_lora(app_ui_args, device_map='cpu') result = app_ui_main(app_ui_args) ``` diff --git a/docs/source_en/Instruction/LLM-fine-tuning.md b/docs/source_en/Instruction/LLM-fine-tuning.md index a548b45641..d63764c6aa 100644 --- a/docs/source_en/Instruction/LLM-fine-tuning.md +++ b/docs/source_en/Instruction/LLM-fine-tuning.md @@ -55,12 +55,12 @@ sft_args = SftArguments( dataset=[f'{DatasetName.blossom_math_zh}#2000'], output_dir='output') result = sft_main(sft_args) -best_model_checkpoint = result['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = result['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') torch.cuda.empty_cache() infer_args = InferArguments( - ckpt_dir=best_model_checkpoint, + ckpt_dir=last_model_checkpoint, load_dataset_config=True) # merge_lora(infer_args, device_map='cpu') result = infer_main(infer_args) diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md index e499219f98..397b982746 100644 --- a/docs/source_en/Instruction/Supported-models-datasets.md 
+++ b/docs/source_en/Instruction/Supported-models-datasets.md @@ -116,44 +116,44 @@ The table below introduces all models supported by SWIFT: |qwen2_5-14b|[qwen/Qwen2.5-14B](https://modelscope.cn/models/qwen/Qwen2.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B)| |qwen2_5-32b|[qwen/Qwen2.5-32B](https://modelscope.cn/models/qwen/Qwen2.5-32B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B)| |qwen2_5-72b|[qwen/Qwen2.5-72B](https://modelscope.cn/models/qwen/Qwen2.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B](https://huggingface.co/Qwen/Qwen2.5-72B)| -|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| -|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| -|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| -|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| -|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| -|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| -|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| -|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| -|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| -|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| -|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| 
-|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| -|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| -|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| -|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| -|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| -|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| 
-|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| -|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| -|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| -|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| -|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| -|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| -|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, 
v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| -|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| -|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| -|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| -|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| +|qwen2_5-0_5b-instruct|[qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)| +|qwen2_5-1_5b-instruct|[qwen/Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)| +|qwen2_5-3b-instruct|[qwen/Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary)|q_proj, k_proj, 
v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)| +|qwen2_5-7b-instruct|[qwen/Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)| +|qwen2_5-14b-instruct|[qwen/Qwen2.5-14B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)| +|qwen2_5-32b-instruct|[qwen/Qwen2.5-32B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)| +|qwen2_5-72b-instruct|[qwen/Qwen2.5-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct)| +|qwen2_5-0_5b-instruct-gptq-int4|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int4)| +|qwen2_5-1_5b-instruct-gptq-int4|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int4)| +|qwen2_5-3b-instruct-gptq-int4|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int4)| +|qwen2_5-7b-instruct-gptq-int4|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int4)| +|qwen2_5-14b-instruct-gptq-int4|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int4)| +|qwen2_5-32b-instruct-gptq-int4|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int4)| +|qwen2_5-72b-instruct-gptq-int4|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int4)| +|qwen2_5-0_5b-instruct-gptq-int8|[qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GPTQ-Int8)| +|qwen2_5-1_5b-instruct-gptq-int8|[qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, 
transformers>=4.37|-|[Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GPTQ-Int8)| +|qwen2_5-3b-instruct-gptq-int8|[qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GPTQ-Int8)| +|qwen2_5-7b-instruct-gptq-int8|[qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GPTQ-Int8)| +|qwen2_5-14b-instruct-gptq-int8|[qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-GPTQ-Int8)| +|qwen2_5-32b-instruct-gptq-int8|[qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8)| +|qwen2_5-72b-instruct-gptq-int8|[qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|auto_gptq>=0.5, transformers>=4.37|-|[Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GPTQ-Int8)| +|qwen2_5-0_5b-instruct-awq|[qwen/Qwen2.5-0.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-0.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-AWQ)| 
+|qwen2_5-1_5b-instruct-awq|[qwen/Qwen2.5-1.5B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-1.5B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-AWQ)| +|qwen2_5-3b-instruct-awq|[qwen/Qwen2.5-3B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-3B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-AWQ)| +|qwen2_5-7b-instruct-awq|[qwen/Qwen2.5-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-AWQ)| +|qwen2_5-14b-instruct-awq|[qwen/Qwen2.5-14B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-14B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-14B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-AWQ)| +|qwen2_5-32b-instruct-awq|[qwen/Qwen2.5-32B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-32B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-32B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct-AWQ)| +|qwen2_5-72b-instruct-awq|[qwen/Qwen2.5-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2.5-72B-Instruct-AWQ/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✘|✘|transformers>=4.37, autoawq|-|[Qwen/Qwen2.5-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-AWQ)| |qwen2_5-math-1_5b|[qwen/Qwen2.5-Math-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B)| 
|qwen2_5-math-7b|[qwen/Qwen2.5-Math-7B](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B)| |qwen2_5-math-72b|[qwen/Qwen2.5-Math-72B](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B](https://huggingface.co/Qwen/Qwen2.5-Math-72B)| -|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| -|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| -|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| +|qwen2_5-math-1_5b-instruct|[qwen/Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct)| +|qwen2_5-math-7b-instruct|[qwen/Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct)| 
+|qwen2_5-math-72b-instruct|[qwen/Qwen2.5-Math-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-72B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Math-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct)| |qwen2_5-coder-1_5b|[qwen/Qwen2.5-Coder-1.5B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B)| -|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| +|qwen2_5-coder-1_5b-instruct|[qwen/Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)| |qwen2_5-coder-7b|[qwen/Qwen2.5-Coder-7B](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B](https://huggingface.co/Qwen/Qwen2.5-Coder-7B)| -|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| +|qwen2_5-coder-7b-instruct|[qwen/Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary)|q_proj, k_proj, v_proj|qwen2_5|✔|✔|✔|✘|transformers>=4.37|-|[Qwen/Qwen2.5-Coder-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct)| 
|chatglm2-6b|[ZhipuAI/chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)| |chatglm2-6b-32k|[ZhipuAI/chatglm2-6b-32k](https://modelscope.cn/models/ZhipuAI/chatglm2-6b-32k/summary)|query_key_value|chatglm2|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm2-6b-32k](https://huggingface.co/THUDM/chatglm2-6b-32k)| |chatglm3-6b-base|[ZhipuAI/chatglm3-6b-base](https://modelscope.cn/models/ZhipuAI/chatglm3-6b-base/summary)|query_key_value|chatglm-generation|✘|✔|✘|✘|transformers<4.42|-|[THUDM/chatglm3-6b-base](https://huggingface.co/THUDM/chatglm3-6b-base)| diff --git a/docs/source_en/LLM/Qwen1.5-best-practice.md b/docs/source_en/LLM/Qwen1.5-best-practice.md index ca3d14b856..1cdcda6f6d 100644 --- a/docs/source_en/LLM/Qwen1.5-best-practice.md +++ b/docs/source_en/LLM/Qwen1.5-best-practice.md @@ -196,8 +196,8 @@ sft_args = SftArguments( model_name=['小黄', 'Xiao Huang'], model_author=['魔搭', 'ModelScope']) output = sft_main(sft_args) -best_model_checkpoint = output['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = output['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') ``` If you want to train on a 3090 machine, you can **reduce max_length** to 1024, use model parallelism, or use deepspeed-zero3. 
diff --git a/docs/source_en/LLM/Self-cognition-best-practice.md b/docs/source_en/LLM/Self-cognition-best-practice.md index cb4c2663d6..6eee03876a 100644 --- a/docs/source_en/LLM/Self-cognition-best-practice.md +++ b/docs/source_en/LLM/Self-cognition-best-practice.md @@ -122,8 +122,8 @@ sft_args = SftArguments( model_name=['小黄', 'Xiao Huang'], model_author=['魔搭', 'ModelScope']) output = sft_main(sft_args) -best_model_checkpoint = output['best_model_checkpoint'] -print(f'best_model_checkpoint: {best_model_checkpoint}') +last_model_checkpoint = output['last_model_checkpoint'] +print(f'last_model_checkpoint: {last_model_checkpoint}') """Out[0] [INFO:swift] The logging file will be saved in: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/logging.jsonl @@ -160,7 +160,7 @@ Train: 100%|██████████████████████ [INFO:swift] best_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 [INFO:swift] images_dir: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/images [INFO:swift] End time of running main: 2024-06-07 10:18:41.386561 -best_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 +last_model_checkpoint: /xxx/output/qwen2-7b-instruct/v2-20240607-101038/checkpoint-93 """ ``` @@ -200,7 +200,7 @@ swift sft \ ``` ## Inference After Fine-Tuning -You need to set the value of `best_model_checkpoint`, which will be printed out at the end of the sft. +You need to set the value of `last_model_checkpoint`, which is printed at the end of SFT training. 
Using Python: ```python @@ -209,8 +209,8 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' from swift.llm import InferArguments, merge_lora, infer_main -best_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' -infer_args = InferArguments(ckpt_dir=best_model_checkpoint) +last_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' +infer_args = InferArguments(ckpt_dir=last_model_checkpoint) merge_lora(infer_args, device_map='cpu') result = infer_main(infer_args) @@ -271,8 +271,8 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' from swift.llm import AppUIArguments, merge_lora, app_ui_main -best_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' -app_ui_args = AppUIArguments(ckpt_dir=best_model_checkpoint) +last_model_checkpoint = 'qwen2-7b-instruct/vx-xxx/checkpoint-xxx' +app_ui_args = AppUIArguments(ckpt_dir=last_model_checkpoint) merge_lora(app_ui_args, device_map='cpu') result = app_ui_main(app_ui_args) ``` diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py index 834ea21456..28c9c90ae3 100644 --- a/swift/llm/utils/model.py +++ b/swift/llm/utils/model.py @@ -3478,7 +3478,7 @@ def get_model_tokenizer_qwen2_chat(model_dir: str, f'qwen2_5-{model_size_lower}-instruct', f'qwen/Qwen2.5-{model_size}-Instruct', LoRATM.llama, - TemplateType.qwen, + TemplateType.qwen2_5, get_model_tokenizer_qwen2_chat, support_flash_attn=True, support_vllm=True, @@ -3492,7 +3492,7 @@ def get_model_tokenizer_qwen2_chat(model_dir: str, f'qwen2_5-{model_size_lower}-instruct-{quant_type_lower}', f'qwen/Qwen2.5-{model_size}-Instruct-{quant_type}', LoRATM.llama, - TemplateType.qwen, + TemplateType.qwen2_5, get_model_tokenizer_qwen2_chat, support_flash_attn=True, support_vllm=True, @@ -3505,7 +3505,7 @@ def get_model_tokenizer_qwen2_chat(model_dir: str, f'qwen2_5-{model_size_lower}-instruct-awq', f'qwen/Qwen2.5-{model_size}-Instruct-AWQ', LoRATM.llama, - TemplateType.qwen, + TemplateType.qwen2_5, get_model_tokenizer_qwen2_chat, support_flash_attn=True, support_vllm=True, 
@@ -3531,7 +3531,7 @@ def get_model_tokenizer_qwen2_chat(model_dir: str, f'qwen2_5-math-{model_size_lower}-instruct', f'qwen/Qwen2.5-Math-{model_size}-Instruct', LoRATM.llama, - TemplateType.qwen, + TemplateType.qwen2_5, get_model_tokenizer_qwen2_chat, support_flash_attn=True, support_vllm=True, @@ -3556,7 +3556,7 @@ def get_model_tokenizer_qwen2_chat(model_dir: str, f'qwen2_5-coder-{model_size_lower}-instruct', f'qwen/Qwen2.5-Coder-{model_size}-Instruct', LoRATM.llama, - TemplateType.qwen, + TemplateType.qwen2_5, get_model_tokenizer_qwen2_chat, support_flash_attn=True, support_vllm=True, diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 8aa7919805..cff2d12dca 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -45,6 +45,7 @@ class TemplateType: # chat default = 'default' qwen = 'qwen' + qwen2_5 = 'qwen2_5' qwen_vl = 'qwen-vl' qwen_audio = 'qwen-audio' qwen2_audio = 'qwen2-audio' @@ -1269,7 +1270,12 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: ] +class Qwen2_5Template(QwenTemplate): + system = 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' + + register_template(TemplateType.qwen, QwenTemplate()) +register_template(TemplateType.qwen2_5, Qwen2_5Template()) class QwenVLTemplate(_QwenVLTemplateMixin, QwenTemplate):