diff --git a/README.md b/README.md
index c21e758690..7b653b58cb 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,8 @@ You can contact us and communicate with us by adding our group:
|
## 🎉 News
-- 2024.09.24: Support for training and deploying llama3.2 series models. Experience it using `swift infer --model_type llama3_2-1b-instruct`.
+- 2024.09.26: Support for training and deploying llama3.2-vision series models. Experience it using `swift infer --model_type llama3_2-11b-vision-instruct`.
+- 2024.09.26: Support for training and deploying llama3.2 series models. Experience it using `swift infer --model_type llama3_2-1b-instruct`.
- 2024.09.25: Support for training to deployment with got-ocr2. Best practices can be found [here](https://github.com/modelscope/ms-swift/issues/2122).
- 2024.09.24: Support for training and deploying llama3_1-8b-omni. Experience it using `swift infer --model_type llama3_1-8b-omni`.
- 2024.09.23: Support for training and deploying pixtral-12b. Experience it using `swift infer --model_type pixtral-12b --dtype fp16`.
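
For readers who prefer the Python API over the CLI calls quoted in the news items above, here is a minimal sketch of trying the model type added in this PR. It assumes ms-swift's documented `InferArguments`/`infer_main` entrypoints; adjust to the CLI form (`swift infer --model_type llama3_2-11b-vision-instruct`) if preferred.

```python
# Minimal sketch: programmatic equivalent of
# `swift infer --model_type llama3_2-11b-vision-instruct`.
# Assumes ms-swift exposes InferArguments/infer_main as in its docs;
# the model_type string is the one introduced in this PR.
from swift.llm import InferArguments, infer_main

if __name__ == '__main__':
    infer_main(InferArguments(model_type='llama3_2-11b-vision-instruct'))
```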
@@ -623,6 +624,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
|------------------------------------------------------------|----------------------------------------------------------------------------------------|--------------------|---------------------------------------|--------------------------|
| Qwen-VL<br>Qwen2-VL | [Tongyi Qwen vision model](https://github.com/QwenLM) | Chinese<br>English | 2B-72B<br>including quantized versions | base model<br>chat model |
| Qwen-Audio<br>Qwen2-Audio | [Tongyi Qwen speech model](https://github.com/QwenLM) | Chinese<br>English | 7B | base model<br>chat model |
+| Llama3.2-Vision | [Llama3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) | English | 11B-90B | base model<br>chat model |
| YI-VL | [01AI's YI series vision models](https://github.com/01-ai) | Chinese<br>English | 6B-34B | chat model |
| XComposer2<br>XComposer2.5 | [Pujiang AI Lab InternLM vision model](https://github.com/InternLM/InternLM-XComposer) | Chinese<br>English | 7B | chat model |
| DeepSeek-VL | [DeepSeek series vision models](https://github.com/deepseek-ai) | Chinese<br>English | 1.3B-7B | chat model |
diff --git a/README_CN.md b/README_CN.md
index 18c04d82e4..23eb681bd2 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -56,6 +56,7 @@ SWIFT具有丰富全面的文档,请查看我们的文档网站:
## 🎉 新闻
+- 2024.09.26: 支持llama3.2-vision系列模型的训练到部署. 使用`swift infer --model_type llama3_2-11b-vision-instruct`进行体验.
- 2024.09.26: 支持llama3.2系列模型的训练到部署. 使用`swift infer --model_type llama3_2-1b-instruct`进行体验.
- 2024.09.25: 支持got-ocr2的训练到部署. 最佳实践可以查看[这里](https://github.com/modelscope/ms-swift/issues/2122).
- 2024.09.24: 支持llama3_1-8b-omni的训练与部署. 使用`swift infer --model_type llama3_1-8b-omni`进行体验.
@@ -616,6 +617,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
|---------------------------------------------------------|----------------------------------------------------------------------------|----------|------------------|------------------|
| Qwen-VL<br>Qwen2-VL | [通义千问视觉模型](https://github.com/QwenLM) | 中文<br>英文 | 2B-72B<br>包含量化版本 | base模型<br>chat模型 |
| Qwen-Audio<br>Qwen2-Audio | [通义千问语音模型](https://github.com/QwenLM) | 中文<br>英文 | 7B | base模型<br>chat模型 |
+| Llama3.2-Vision | [Llama3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) | 英文 | 11B-90B | base模型<br>chat模型 |
| YI-VL | [01AI的YI系列视觉模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B | chat模型 |
| XComposer2<br>XComposer2.5 | [浦江实验室书生浦语视觉模型](https://github.com/InternLM/InternLM-XComposer) | 中文<br>英文 | 7B | chat模型 |
| DeepSeek-VL | [幻方系列视觉模型](https://github.com/deepseek-ai) | 中文<br>英文 | 1.3B-7B | chat模型 |
diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index 8bb9e56b5b..eabde47c86 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -199,10 +199,10 @@
|llama3_1-405b-instruct-awq|[LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✔|✘|transformers>=4.43, autoawq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|
|llama3_1-405b-instruct-gptq-int4|[LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, auto_gptq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|
|llama3_1-405b-instruct-bnb|[LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, bitsandbytes|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|
-|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
-|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
-|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
-|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
+|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
+|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
+|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
+|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
|reflection-llama_3_1-70b|[LLM-Research/Reflection-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Reflection-Llama-3.1-70B/summary)|q_proj, k_proj, v_proj|reflection|✔|✔|✘|✘|transformers>=4.43|-|[mattshumer/Reflection-Llama-3.1-70B](https://huggingface.co/mattshumer/Reflection-Llama-3.1-70B)|
|longwriter-glm4-9b|[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary)|query_key_value|chatglm4|✔|✔|✔|✘|transformers>=4.42|-|[THUDM/LongWriter-glm4-9b](https://huggingface.co/THUDM/LongWriter-glm4-9b)|
|longwriter-llama3_1-8b|[ZhipuAI/LongWriter-llama3.1-8b](https://modelscope.cn/models/ZhipuAI/LongWriter-llama3.1-8b/summary)|q_proj, k_proj, v_proj|longwriter-llama3|✔|✔|✔|✘|transformers>=4.43|-|[THUDM/LongWriter-llama3.1-8b](https://huggingface.co/THUDM/LongWriter-llama3.1-8b)|
@@ -424,26 +424,30 @@
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
-|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
-|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
-|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
-|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
-|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
-|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
-|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
-|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
-|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
-|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
-|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
-|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
-|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
-|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
-|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
-|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
-|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
+|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
+|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
+|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
+|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
+|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
+|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
+|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
+|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
+|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
+|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
+|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
+|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
+|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
+|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
+|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
+|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
|glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
+|llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
+|llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|
+|llama3_2-90b-vision-instruct|[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
|llama3_1-8b-omni|[ICTNLP/Llama-3.1-8B-Omni](https://modelscope.cn/models/ICTNLP/Llama-3.1-8B-Omni/summary)|^(model.layers\|model.speech_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_1-omni|✔|✘|✘|✘|whisper, openai-whisper|audio|[ICTNLP/Llama-3.1-8B-Omni](https://huggingface.co/ICTNLP/Llama-3.1-8B-Omni)|
-|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
+|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
|llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)|
|llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)|
|llava1_6-mistral-7b-instruct|[swift/llava-v1.6-mistral-7b-hf](https://modelscope.cn/models/swift/llava-v1.6-mistral-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-mistral|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)|
@@ -454,9 +458,9 @@
|llama3-llava-next-8b-hf|[swift/llama3-llava-next-8b-hf](https://modelscope.cn/models/swift/llama3-llava-next-8b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-llava-next-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
|llava-next-72b-hf|[AI-ModelScope/llava-next-72b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-72b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
|llava-next-110b-hf|[AI-ModelScope/llava-next-110b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-110b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
-|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
-|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
-|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
+|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
+|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
+|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
|llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3-llava-next|✔|✘|✘|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)|
|llava-next-72b|[AI-Modelscope/llava-next-72b](https://modelscope.cn/models/AI-Modelscope/llava-next-72b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)|
|llava-next-110b|[AI-Modelscope/llava-next-110b](https://modelscope.cn/models/AI-Modelscope/llava-next-110b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)|
@@ -497,7 +501,7 @@
|minicpm-v-v2-chat|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v|✔|✘|✘|✘|timm, transformers<4.42|vision|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
-|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45.0.dev0|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index a010194aa6..afe28bca02 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -199,10 +199,10 @@ The table below introcudes all models supported by SWIFT:
|llama3_1-405b-instruct-awq|[LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✔|✘|transformers>=4.43, autoawq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|
|llama3_1-405b-instruct-gptq-int4|[LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, auto_gptq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|
|llama3_1-405b-instruct-bnb|[LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, bitsandbytes|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|
-|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
-|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
-|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
-|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
+|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
+|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
+|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
+|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
|reflection-llama_3_1-70b|[LLM-Research/Reflection-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Reflection-Llama-3.1-70B/summary)|q_proj, k_proj, v_proj|reflection|✔|✔|✘|✘|transformers>=4.43|-|[mattshumer/Reflection-Llama-3.1-70B](https://huggingface.co/mattshumer/Reflection-Llama-3.1-70B)|
|longwriter-glm4-9b|[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary)|query_key_value|chatglm4|✔|✔|✔|✘|transformers>=4.42|-|[THUDM/LongWriter-glm4-9b](https://huggingface.co/THUDM/LongWriter-glm4-9b)|
|longwriter-llama3_1-8b|[ZhipuAI/LongWriter-llama3.1-8b](https://modelscope.cn/models/ZhipuAI/LongWriter-llama3.1-8b/summary)|q_proj, k_proj, v_proj|longwriter-llama3|✔|✔|✔|✘|transformers>=4.43|-|[THUDM/LongWriter-llama3.1-8b](https://huggingface.co/THUDM/LongWriter-llama3.1-8b)|
@@ -424,26 +424,30 @@ The table below introcudes all models supported by SWIFT:
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
-|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
-|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
-|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
-|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
-|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
-|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
-|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
-|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
-|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
-|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
-|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
-|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
-|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
-|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
-|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
-|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
-|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
+|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
+|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
+|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
+|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
+|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
+|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
+|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
+|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
+|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
+|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
+|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
+|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
+|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
+|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
+|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
+|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
|glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
+|llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
+|llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|
+|llama3_2-90b-vision-instruct|[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
|llama3_1-8b-omni|[ICTNLP/Llama-3.1-8B-Omni](https://modelscope.cn/models/ICTNLP/Llama-3.1-8B-Omni/summary)|^(model.layers\|model.speech_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_1-omni|✔|✘|✘|✘|whisper, openai-whisper|audio|[ICTNLP/Llama-3.1-8B-Omni](https://huggingface.co/ICTNLP/Llama-3.1-8B-Omni)|
-|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
+|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
|llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)|
|llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)|
|llava1_6-mistral-7b-instruct|[swift/llava-v1.6-mistral-7b-hf](https://modelscope.cn/models/swift/llava-v1.6-mistral-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-mistral|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)|
@@ -454,9 +458,9 @@ The table below introcudes all models supported by SWIFT:
|llama3-llava-next-8b-hf|[swift/llama3-llava-next-8b-hf](https://modelscope.cn/models/swift/llama3-llava-next-8b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-llava-next-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
|llava-next-72b-hf|[AI-ModelScope/llava-next-72b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-72b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
|llava-next-110b-hf|[AI-ModelScope/llava-next-110b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-110b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
-|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
-|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
-|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
+|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
+|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
+|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
|llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3-llava-next|✔|✘|✘|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)|
|llava-next-72b|[AI-Modelscope/llava-next-72b](https://modelscope.cn/models/AI-Modelscope/llava-next-72b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)|
|llava-next-110b|[AI-Modelscope/llava-next-110b](https://modelscope.cn/models/AI-Modelscope/llava-next-110b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)|
@@ -497,7 +501,7 @@ The table below introduces all models supported by SWIFT:
|minicpm-v-v2-chat|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v|✔|✘|✘|✘|timm, transformers<4.42|vision|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
-|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45.0.dev0|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index d01b360afb..c9bea3528d 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -21,8 +21,8 @@
from .accelerator import ta_accelerate
from .tuner import prepare_model
from .utils import (TEMPLATE_MAPPING, LazyLLMDataset, PtArguments, RLHFArguments, SftArguments, Template, dataset_map,
- dynamic_vit_gradient_checkpointing, get_dataset, get_model_tokenizer, get_template, get_time_info,
- print_example, set_generation_config, sort_by_max_length, stat_dataset)
+ deep_getattr, dynamic_vit_gradient_checkpointing, get_dataset, get_mllm_arch, get_model_tokenizer,
+ get_template, get_time_info, print_example, set_generation_config, sort_by_max_length, stat_dataset)
logger = get_logger()
@@ -265,6 +265,12 @@ def prepare_model_template_train(args, msg: Optional[Dict[str, Any]] = None):
model.config.use_cache = False # fix transformers==4.36
logger.info('Setting model.config.use_cache: False')
model.enable_input_require_grads()
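+        # Multimodal models: mirror enable_input_require_grads() on every vision tower that supports it.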
+ mllm_arch = get_mllm_arch(args.model_type)
+ if mllm_arch is not None:
+ for vision_tower_name in mllm_arch.vision_tower:
+ vision_tower = deep_getattr(model, vision_tower_name)
+ if hasattr(vision_tower, 'enable_input_require_grads'):
+ vision_tower.enable_input_require_grads()
if use_torchacc():
model.config.use_cache = False
diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py
index 129981b377..3fb7c22a54 100644
--- a/swift/llm/utils/__init__.py
+++ b/swift/llm/utils/__init__.py
@@ -22,12 +22,12 @@
ModelList, UsageInfo, XRequestConfig, random_uuid)
from .template import (DEFAULT_SYSTEM, TEMPLATE_MAPPING, History, KTOTemplateMixin, Prompt, RLHFTemplateMixin,
StopWords, Template, TemplateType, get_env_args, get_template, register_template)
-from .utils import (LazyLLMDataset, LLMDataset, dataset_map, download_dataset, dynamic_vit_gradient_checkpointing,
- find_all_linears, find_embedding, find_ln, get_max_model_len, get_time_info, history_to_messages,
- inference, inference_stream, is_lmdeploy_available, is_megatron_available, is_quant_model,
- is_vllm_available, limit_history_length, messages_join_observation, messages_to_history,
- print_example, safe_tokenizer_decode, set_generation_config, sort_by_max_length, stat_dataset,
- to_device)
+from .utils import (LazyLLMDataset, LLMDataset, dataset_map, deep_getattr, download_dataset,
+ dynamic_vit_gradient_checkpointing, find_all_linears, find_embedding, find_ln, get_max_model_len,
+ get_mllm_arch, get_time_info, history_to_messages, inference, inference_stream,
+ is_lmdeploy_available, is_megatron_available, is_quant_model, is_vllm_available,
+ limit_history_length, messages_join_observation, messages_to_history, print_example,
+ safe_tokenizer_decode, set_generation_config, sort_by_max_length, stat_dataset, to_device)
logger = get_logger()
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 41a4691fd7..2eca790c61 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -31,7 +31,7 @@
from .model import (MODEL_MAPPING, dtype_mapping, get_additional_saved_files, get_default_lora_target_modules,
get_default_template_type)
from .template import TEMPLATE_MAPPING
-from .utils import is_liger_available, is_lmdeploy_available, is_quant_model, is_vllm_available
+from .utils import get_mllm_arch, is_liger_available, is_lmdeploy_available, is_quant_model, is_vllm_available
logger = get_logger()
DATASET_TYPE = Union[HfDataset, HfIterableDataset]
@@ -1048,16 +1048,12 @@ def __post_init__(self) -> None:
if self.eval_steps is None:
self.eval_steps = 50
elif self.sft_type == 'full':
- from swift.utils.module_mapping import MODEL_KEYS_MAPPING
- lora_target_modules = model_info.get('lora_target_modules') # model_group
- model_arch = None
- if isinstance(lora_target_modules, str):
- model_arch = MODEL_KEYS_MAPPING[lora_target_modules]
- if model_arch:
- if self.freeze_vit and model_arch.vision_tower:
- self.freeze_parameters += model_arch.vision_tower
- if model_arch.generator:
- self.freeze_parameters += model_arch.generator
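+            # Full-parameter training of multimodal models: freeze the vision tower when freeze_vit is set, and always freeze any generator modules.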
+ mllm_arch = get_mllm_arch(self.model_type)
+ if mllm_arch:
+ if self.freeze_vit and mllm_arch.vision_tower:
+ self.freeze_parameters += mllm_arch.vision_tower
+ if mllm_arch.generator:
+ self.freeze_parameters += mllm_arch.generator
assert 0 <= self.freeze_parameters_ratio <= 1
assert self.quantization_bit == 0, 'Full parameter fine-tuning does not support quantization.'
assert self.dtype != 'fp16', ("Fine-tuning with dtype=='fp16' can lead to NaN issues. "
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index ccd1f9c8be..3799aee022 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -265,6 +265,12 @@ class ModelType:
llama3_2_1b_instruct = 'llama3_2-1b-instruct'
llama3_2_3b = 'llama3_2-3b'
llama3_2_3b_instruct = 'llama3_2-3b-instruct'
+ # llama3.2-vision
+    llama3_2_11b_vision = 'llama3_2-11b-vision'
+ llama3_2_11b_vision_instruct = 'llama3_2-11b-vision-instruct'
+ llama3_2_90b_vision = 'llama3_2-90b-vision'
+ llama3_2_90b_vision_instruct = 'llama3_2-90b-vision-instruct'
+
# omni
llama3_1_8b_omni = 'llama3_1-8b-omni'
# reflection
@@ -644,6 +650,7 @@ class LoRATM(NamedTuple):
mplug_owl3 = 'mplug_owl3'
llama3_1_omni = 'llama3_1_omni'
got_ocr2 = 'got_ocr2'
+ llama3_2_vision = 'llama3_2_vision'
# default lora target modules for nlp llms.
minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
baichuan = ['W_pack']
@@ -6194,6 +6201,56 @@ def get_model_tokenizer_llava_hf(model_dir: str, *args, **kwargs):
return model, tokenizer
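+# Llama3.2-Vision (Mllama) registrations: base and instruct variants at 11B and 90B, all requiring transformers>=4.45.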
+@register_model(
+ ModelType.llama3_2_11b_vision,
+ 'LLM-Research/Llama-3.2-11B-Vision',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision_generation,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-11B-Vision')
+@register_model(
+ ModelType.llama3_2_11b_vision_instruct,
+ 'LLM-Research/Llama-3.2-11B-Vision-Instruct',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-11B-Vision-Instruct')
+@register_model(
+ ModelType.llama3_2_90b_vision,
+ 'LLM-Research/Llama-3.2-90B-Vision',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision_generation,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-90B-Vision')
+@register_model(
+ ModelType.llama3_2_90b_vision_instruct,
+ 'LLM-Research/Llama-3.2-90B-Vision-Instruct',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-90B-Vision-Instruct')
+def get_model_tokenizer_llama3_2_vision(*args, **kwargs):
+ from transformers import MllamaForConditionalGeneration
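+    # Load with MllamaForConditionalGeneration instead of the default auto class, then reuse the generic llava-hf loader.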
+ kwargs['automodel_class'] = MllamaForConditionalGeneration
+ return get_model_tokenizer_llava_hf(*args, **kwargs)
+
+
@register_model(
ModelType.llava1_5_13b_instruct,
'swift/llava-1.5-13b-hf',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index b4638d9124..0f3ad2a6a8 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -64,6 +64,8 @@ class TemplateType:
llama3 = 'llama3'
llama3_1_omni = 'llama3_1-omni'
llama3_2 = 'llama3_2'
+ llama3_2_vision = 'llama3_2-vision'
+ llama3_2_vision_generation = 'llama3_2-vision-generation'
reflection = 'reflection'
longwriter_llama3 = 'longwriter-llama3'
# llava-hf
@@ -1931,6 +1933,64 @@ class Llama3_2Template(Llama3_2TemplateMixin, Template):
register_template(TemplateType.llama3_2, Llama3_2Template())
+class Llama3_2VisionTemplateMixin:
+
+ def replace_tag(self, media_type, index, example) -> List[Context]:
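+        # mllama only accepts image inputs; each image is represented by a single <|image|> token.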
+ assert media_type == 'image'
+ return ['<|image|>']
+
+ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ from transformers.models.mllama.processing_mllama import (get_cross_attention_token_mask,
+ convert_sparse_cross_attention_mask_to_dense)
+ inputs, _ = super()._encode(example)
+ if len(inputs) == 0:
+ return inputs, {}
+ images = example['images']
+ if images:
+ input_ids = inputs['input_ids']
+ processor = self.tokenizer.processor
+ image_features = processor.image_processor(images, return_tensors='pt')
+ num_tiles = image_features.pop('num_tiles')
+ inputs.update(image_features)
+
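+            # Expand the sparse <|image|> token positions into a dense cross-attention mask over the image tiles.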
+ cross_attention_token_mask = [get_cross_attention_token_mask(input_ids, processor.image_token_id)]
+ cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
+ cross_attention_token_mask,
+ num_tiles=num_tiles,
+ max_num_tiles=processor.image_processor.max_image_tiles,
+ length=len(input_ids),
+ )
+ inputs['cross_attention_mask'] = torch.tensor(cross_attention_mask)
+
+ return inputs, {}
+
+ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+ res = super().data_collator(batch, padding_to)
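+        # Concatenate the aspect-ratio tensors across the batch and pad the cross-attention masks to the longest sequence.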
+ for key in ['aspect_ratio_ids', 'aspect_ratio_mask']:
+ value = [b[key] for b in batch if b.get(key) is not None]
+ if value:
+ res[key] = torch.concat(value)
+
+ cross_attention_mask = [
+ b['cross_attention_mask'][0] for b in batch if b.get('cross_attention_mask') is not None
+ ]
+ if cross_attention_mask:
+ res['cross_attention_mask'] = self.pad_sequence(cross_attention_mask, 0, self.padding_side)
+ return res
+
+
+class Llama3_2VisionTemplate(Llama3_2VisionTemplateMixin, Llama3Template):
+ pass
+
+
+class Llama3_2VisionGenerationTemplate(Llama3_2VisionTemplateMixin, DefaultGenerationTemplate):
+ pass
+
+
+register_template(TemplateType.llama3_2_vision, Llama3_2VisionTemplate(), lazy_tokenize=True)
+register_template(TemplateType.llama3_2_vision_generation, Llama3_2VisionGenerationTemplate(), lazy_tokenize=True)
+
+
class Llama3_1OmniTemplate(Llama3Template):
system = ('You are a helpful language and speech assistant. '
'You are able to understand the speech content that the user provides, '
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index 481d998860..5a211ab728 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -36,7 +36,7 @@
from swift.hub import ModelScopeConfig
from swift.utils import get_dist_setting, get_logger, is_ddp_plus_mp, stat_array, upper_bound, use_torchacc
-from swift.utils.module_mapping import MODEL_KEYS_MAPPING
+from swift.utils.module_mapping import MODEL_KEYS_MAPPING, MultiModelKeys
from .template import History, StopWords, StopWordsCriteria, Template
DATASET_TYPE = Union[HfDataset, HfIterableDataset]
@@ -458,16 +458,20 @@ def deep_getattr(model, attr: str):
return model
-def dynamic_vit_gradient_checkpointing(model, model_type: str) -> None:
- from swift.utils.module_mapping import MODEL_KEYS_MAPPING
+def get_mllm_arch(model_type: str) -> Optional[MultiModelKeys]:
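+    # Multimodal models register lora_target_modules as a string key into MODEL_KEYS_MAPPING; plain LLMs use a module list, so return None for them.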
from .model import MODEL_MAPPING
model_info = MODEL_MAPPING[model_type]
lora_target_modules = model_info.get('lora_target_modules') # model_group
-
if not isinstance(lora_target_modules, str):
+ return None
+ return MODEL_KEYS_MAPPING[lora_target_modules]
+
+
+def dynamic_vit_gradient_checkpointing(model, model_type: str) -> None:
+ mllm_arch = get_mllm_arch(model_type)
+ if mllm_arch is None:
return
- vision_tower_list = MODEL_KEYS_MAPPING[lora_target_modules].vision_tower
- for vision_tower_name in vision_tower_list:
+ for vision_tower_name in mllm_arch.vision_tower:
vision_tower = deep_getattr(model, vision_tower_name)
module_list = _find_module_list(vision_tower)
if module_list is None:
diff --git a/swift/utils/module_mapping.py b/swift/utils/module_mapping.py
index 6e4b3aac93..6c0bf6b405 100644
--- a/swift/utils/module_mapping.py
+++ b/swift/utils/module_mapping.py
@@ -296,6 +296,12 @@ def __post_init__(self):
vision_tower='model.vision_tower_high',
)
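+# Llama3.2-Vision (Mllama): language backbone, multimodal projector and vision encoder module prefixes.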
+LLAMA3_2_VISION = MultiModelKeys(
+ language_model='language_model',
+ connector='multi_modal_projector',
+ vision_tower='vision_model',
+)
+
MODEL_KEYS_MAPPING = OrderedDict([
# MLLM here
('qwen_audio', QWEN_AUDIO_KEYS),
@@ -317,6 +323,7 @@ def __post_init__(self):
('mplug_owl3', MPLUG_OWL3_KEYS),
('llama3_1_omni', LLAMA3_1_OMNI),
('got_ocr2', GOT_OCR2),
+ ('llama3_2_vision', LLAMA3_2_VISION),
# LLM begins here
('llama', LLAMA_KEYS),
('mistral', LLAMA_KEYS),