diff --git a/README.md b/README.md
index c21e758690..7b653b58cb 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,8 @@ You can contact us and communicate with us by adding our group:
|
## 🎉 News
-- 2024.09.24: Support for training and deploying llama3.2 series models. Experience it using `swift infer --model_type llama3_2-1b-instruct`.
+- 2024.09.26: Support for training and deploying llama3.2-vision series models. Experience it using `swift infer --model_type llama3_2-11b-vision-instruct`.
+- 2024.09.26: Support for training and deploying llama3.2 series models. Experience it using `swift infer --model_type llama3_2-1b-instruct`.
- 2024.09.25: Support for training to deployment with got-ocr2. Best practices can be found [here](https://github.com/modelscope/ms-swift/issues/2122).
- 2024.09.24: Support for training and deploying llama3_1-8b-omni. Experience it using `swift infer --model_type llama3_1-8b-omni`.
- 2024.09.23: Support for training and deploying pixtral-12b. Experience it using `swift infer --model_type pixtral-12b --dtype fp16`.
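
For readers who prefer the Python API over the CLI calls quoted in the news items above, here is a minimal sketch of trying the model type added in this PR. It assumes ms-swift's documented `InferArguments`/`infer_main` entrypoints; adjust to the CLI form (`swift infer --model_type llama3_2-11b-vision-instruct`) if preferred.

```python
# Minimal sketch: programmatic equivalent of
# `swift infer --model_type llama3_2-11b-vision-instruct`.
# Assumes ms-swift exposes InferArguments/infer_main as in its docs;
# the model_type string is the one introduced in this PR.
from swift.llm import InferArguments, infer_main

if __name__ == '__main__':
    infer_main(InferArguments(model_type='llama3_2-11b-vision-instruct'))
```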
@@ -623,6 +624,7 @@ The complete list of supported models and datasets can be found at [Supported Mo
|------------------------------------------------------------|----------------------------------------------------------------------------------------|--------------------|---------------------------------------|--------------------------|
| Qwen-VL<br>Qwen2-VL | [Tongyi Qwen vision model](https://github.com/QwenLM) | Chinese<br>English | 2B-72B<br>including quantized versions | base model<br>chat model |
| Qwen-Audio<br>Qwen2-Audio | [Tongyi Qwen speech model](https://github.com/QwenLM) | Chinese<br>English | 7B | base model<br>chat model |
+| Llama3.2-Vision | [Llama3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) | English | 11B-90B | base model<br>chat model |
| YI-VL | [01AI's YI series vision models](https://github.com/01-ai) | Chinese<br>English | 6B-34B | chat model |
| XComposer2<br>XComposer2.5 | [Pujiang AI Lab InternLM vision model](https://github.com/InternLM/InternLM-XComposer) | Chinese<br>English | 7B | chat model |
| DeepSeek-VL | [DeepSeek series vision models](https://github.com/deepseek-ai) | Chinese<br>English | 1.3B-7B | chat model |
diff --git a/README_CN.md b/README_CN.md
index 18c04d82e4..23eb681bd2 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -56,6 +56,7 @@ SWIFT具有丰富全面的文档,请查看我们的文档网站:
## 🎉 新闻
+- 2024.09.26: 支持llama3.2-vision系列模型的训练到部署. 使用`swift infer --model_type llama3_2-11b-vision-instruct`进行体验.
- 2024.09.26: 支持llama3.2系列模型的训练到部署. 使用`swift infer --model_type llama3_2-1b-instruct`进行体验.
- 2024.09.25: 支持got-ocr2的训练到部署. 最佳实践可以查看[这里](https://github.com/modelscope/ms-swift/issues/2122).
- 2024.09.24: 支持llama3_1-8b-omni的训练与部署. 使用`swift infer --model_type llama3_1-8b-omni`进行体验.
@@ -616,6 +617,7 @@ CUDA_VISIBLE_DEVICES=0 swift deploy \
|---------------------------------------------------------|----------------------------------------------------------------------------|----------|------------------|------------------|
| Qwen-VL<br>Qwen2-VL | [通义千问视觉模型](https://github.com/QwenLM) | 中文<br>英文 | 2B-72B<br>包含量化版本 | base模型<br>chat模型 |
| Qwen-Audio<br>Qwen2-Audio | [通义千问语音模型](https://github.com/QwenLM) | 中文<br>英文 | 7B | base模型<br>chat模型 |
+| Llama3.2-Vision | [Llama3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf) | 英文 | 11B-90B | base模型<br>chat模型 |
| YI-VL | [01AI的YI系列视觉模型](https://github.com/01-ai) | 中文<br>英文 | 6B-34B | chat模型 |
| XComposer2<br>XComposer2.5 | [浦江实验室书生浦语视觉模型](https://github.com/InternLM/InternLM-XComposer) | 中文<br>英文 | 7B | chat模型 |
| DeepSeek-VL | [幻方系列视觉模型](https://github.com/deepseek-ai) | 中文<br>英文 | 1.3B-7B | chat模型 |
diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
index 8bb9e56b5b..eabde47c86 100644
--- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md"
@@ -199,10 +199,10 @@
|llama3_1-405b-instruct-awq|[LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✔|✘|transformers>=4.43, autoawq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|
|llama3_1-405b-instruct-gptq-int4|[LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, auto_gptq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|
|llama3_1-405b-instruct-bnb|[LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, bitsandbytes|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|
-|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
-|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
-|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
-|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
+|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
+|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
+|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
+|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
|reflection-llama_3_1-70b|[LLM-Research/Reflection-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Reflection-Llama-3.1-70B/summary)|q_proj, k_proj, v_proj|reflection|✔|✔|✘|✘|transformers>=4.43|-|[mattshumer/Reflection-Llama-3.1-70B](https://huggingface.co/mattshumer/Reflection-Llama-3.1-70B)|
|longwriter-glm4-9b|[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary)|query_key_value|chatglm4|✔|✔|✔|✘|transformers>=4.42|-|[THUDM/LongWriter-glm4-9b](https://huggingface.co/THUDM/LongWriter-glm4-9b)|
|longwriter-llama3_1-8b|[ZhipuAI/LongWriter-llama3.1-8b](https://modelscope.cn/models/ZhipuAI/LongWriter-llama3.1-8b/summary)|q_proj, k_proj, v_proj|longwriter-llama3|✔|✔|✔|✘|transformers>=4.43|-|[THUDM/LongWriter-llama3.1-8b](https://huggingface.co/THUDM/LongWriter-llama3.1-8b)|
@@ -424,26 +424,30 @@
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
-|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
-|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
-|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
-|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
-|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
-|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
-|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
-|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
-|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
-|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
-|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
-|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
-|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
-|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
-|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
-|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
-|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
+|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
+|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
+|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
+|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
+|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
+|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
+|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
+|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
+|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
+|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
+|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
+|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
+|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
+|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
+|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
+|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
|glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
+|llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
+|llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|
+|llama3_2-90b-vision-instruct|[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
|llama3_1-8b-omni|[ICTNLP/Llama-3.1-8B-Omni](https://modelscope.cn/models/ICTNLP/Llama-3.1-8B-Omni/summary)|^(model.layers\|model.speech_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_1-omni|✔|✘|✘|✘|whisper, openai-whisper|audio|[ICTNLP/Llama-3.1-8B-Omni](https://huggingface.co/ICTNLP/Llama-3.1-8B-Omni)|
-|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
+|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
|llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)|
|llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)|
|llava1_6-mistral-7b-instruct|[swift/llava-v1.6-mistral-7b-hf](https://modelscope.cn/models/swift/llava-v1.6-mistral-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-mistral|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)|
@@ -454,9 +458,9 @@
|llama3-llava-next-8b-hf|[swift/llama3-llava-next-8b-hf](https://modelscope.cn/models/swift/llama3-llava-next-8b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-llava-next-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
|llava-next-72b-hf|[AI-ModelScope/llava-next-72b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-72b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
|llava-next-110b-hf|[AI-ModelScope/llava-next-110b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-110b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
-|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
-|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
-|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
+|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
+|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
+|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
|llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3-llava-next|✔|✘|✘|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)|
|llava-next-72b|[AI-Modelscope/llava-next-72b](https://modelscope.cn/models/AI-Modelscope/llava-next-72b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)|
|llava-next-110b|[AI-Modelscope/llava-next-110b](https://modelscope.cn/models/AI-Modelscope/llava-next-110b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)|
@@ -497,7 +501,7 @@
|minicpm-v-v2-chat|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v|✔|✘|✘|✘|timm, transformers<4.42|vision|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
-|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45.0.dev0|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
diff --git a/docs/source_en/Instruction/Supported-models-datasets.md b/docs/source_en/Instruction/Supported-models-datasets.md
index a010194aa6..afe28bca02 100644
--- a/docs/source_en/Instruction/Supported-models-datasets.md
+++ b/docs/source_en/Instruction/Supported-models-datasets.md
@@ -199,10 +199,10 @@ The table below introcudes all models supported by SWIFT:
|llama3_1-405b-instruct-awq|[LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-AWQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✔|✘|transformers>=4.43, autoawq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4)|
|llama3_1-405b-instruct-gptq-int4|[LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, auto_gptq|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4)|
|llama3_1-405b-instruct-bnb|[LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://modelscope.cn/models/LLM-Research/Meta-Llama-3.1-405B-Instruct-BNB-NF4/summary)|q_proj, k_proj, v_proj|llama3|✔|✔|✘|✘|transformers>=4.43, bitsandbytes|-|[hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4](https://huggingface.co/hugging-quants/Meta-Llama-3.1-405B-Instruct-BNB-NF4)|
-|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
-|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
-|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
-|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.43|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
+|llama3_2-1b|[LLM-Research/Llama-3.2-1B](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)|
+|llama3_2-1b-instruct|[LLM-Research/Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)|
+|llama3_2-3b|[LLM-Research/Llama-3.2-3B](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)|
+|llama3_2-3b-instruct|[LLM-Research/Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary)|q_proj, k_proj, v_proj|llama3_2|✔|✔|✔|✘|transformers>=4.45|-|[meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)|
|reflection-llama_3_1-70b|[LLM-Research/Reflection-Llama-3.1-70B](https://modelscope.cn/models/LLM-Research/Reflection-Llama-3.1-70B/summary)|q_proj, k_proj, v_proj|reflection|✔|✔|✘|✘|transformers>=4.43|-|[mattshumer/Reflection-Llama-3.1-70B](https://huggingface.co/mattshumer/Reflection-Llama-3.1-70B)|
|longwriter-glm4-9b|[ZhipuAI/LongWriter-glm4-9b](https://modelscope.cn/models/ZhipuAI/LongWriter-glm4-9b/summary)|query_key_value|chatglm4|✔|✔|✔|✘|transformers>=4.42|-|[THUDM/LongWriter-glm4-9b](https://huggingface.co/THUDM/LongWriter-glm4-9b)|
|longwriter-llama3_1-8b|[ZhipuAI/LongWriter-llama3.1-8b](https://modelscope.cn/models/ZhipuAI/LongWriter-llama3.1-8b/summary)|q_proj, k_proj, v_proj|longwriter-llama3|✔|✔|✔|✘|transformers>=4.43|-|[THUDM/LongWriter-llama3.1-8b](https://huggingface.co/THUDM/LongWriter-llama3.1-8b)|
@@ -424,26 +424,30 @@ The table below introcudes all models supported by SWIFT:
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-vl|✔|✔|✘|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio-generation|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|^(transformer.h)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen-audio|✔|✘|✘|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
-|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
-|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45.0.dev0|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
-|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
-|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
-|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
-|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
-|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
-|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
-|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
-|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
-|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
-|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
-|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
-|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
-|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
-|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
-|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45.0.dev0, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
+|qwen2-audio-7b|[qwen/Qwen2-Audio-7B](https://modelscope.cn/models/qwen/Qwen2-Audio-7B/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio-generation|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
+|qwen2-audio-7b-instruct|[qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-Audio-7B-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-audio|✔|✘|✘|✘|librosa, transformers>=4.45|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
+|qwen2-vl-2b|[qwen/Qwen2-VL-2B](https://modelscope.cn/models/qwen/Qwen2-VL-2B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B](https://huggingface.co/Qwen/Qwen2-VL-2B)|
+|qwen2-vl-2b-instruct|[qwen/Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct)|
+|qwen2-vl-2b-instruct-gptq-int4|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4)|
+|qwen2-vl-2b-instruct-gptq-int8|[qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8)|
+|qwen2-vl-2b-instruct-awq|[qwen/Qwen2-VL-2B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-2B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-AWQ)|
+|qwen2-vl-7b|[qwen/Qwen2-VL-7B](https://modelscope.cn/models/qwen/Qwen2-VL-7B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B](https://huggingface.co/Qwen/Qwen2-VL-7B)|
+|qwen2-vl-7b-instruct|[qwen/Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)|
+|qwen2-vl-7b-instruct-gptq-int4|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4)|
+|qwen2-vl-7b-instruct-gptq-int8|[qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8)|
+|qwen2-vl-7b-instruct-awq|[qwen/Qwen2-VL-7B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-7B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct-AWQ)|
+|qwen2-vl-72b|[qwen/Qwen2-VL-72B](https://modelscope.cn/models/qwen/Qwen2-VL-72B/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl-generation|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B](https://huggingface.co/Qwen/Qwen2-VL-72B)|
+|qwen2-vl-72b-instruct|[qwen/Qwen2-VL-72B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils|vision, video|[Qwen/Qwen2-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct)|
+|qwen2-vl-72b-instruct-gptq-int4|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4)|
+|qwen2-vl-72b-instruct-gptq-int8|[qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, auto_gptq>=0.5|vision, video|[Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8)|
+|qwen2-vl-72b-instruct-awq|[qwen/Qwen2-VL-72B-Instruct-AWQ](https://modelscope.cn/models/qwen/Qwen2-VL-72B-Instruct-AWQ/summary)|^(model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|qwen2-vl|✔|✔|✘|✘|transformers>=4.45, qwen_vl_utils, autoawq|vision, video|[Qwen/Qwen2-VL-72B-Instruct-AWQ](https://huggingface.co/Qwen/Qwen2-VL-72B-Instruct-AWQ)|
|glm4v-9b-chat|[ZhipuAI/glm-4v-9b](https://modelscope.cn/models/ZhipuAI/glm-4v-9b/summary)|^(transformer.encoder)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|glm4v|✘|✘|✘|✘|transformers>=4.42|vision|[THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b)|
+|llama3_2-11b-vision|[LLM-Research/Llama-3.2-11B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)|
+|llama3_2-11b-vision-instruct|[LLM-Research/Llama-3.2-11B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-11B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)|
+|llama3_2-90b-vision|[LLM-Research/Llama-3.2-90B-Vision](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision-generation|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)|
+|llama3_2-90b-vision-instruct|[LLM-Research/Llama-3.2-90B-Vision-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-90B-Vision-Instruct/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_2-vision|✔|✔|✘|✘|transformers>=4.45|vision|[meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct)|
|llama3_1-8b-omni|[ICTNLP/Llama-3.1-8B-Omni](https://modelscope.cn/models/ICTNLP/Llama-3.1-8B-Omni/summary)|^(model.layers\|model.speech_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3_1-omni|✔|✘|✘|✘|whisper, openai-whisper|audio|[ICTNLP/Llama-3.1-8B-Omni](https://huggingface.co/ICTNLP/Llama-3.1-8B-Omni)|
-|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
+|idefics3-8b-llama3|[AI-ModelScope/Idefics3-8B-Llama3](https://modelscope.cn/models/AI-ModelScope/Idefics3-8B-Llama3/summary)|^(model.text_model\|model.connector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|idefics3|✔|✘|✘|✘|transformers>=4.45|vision|[HuggingFaceM4/Idefics3-8B-Llama3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3)|
|llava1_5-7b-instruct|[swift/llava-1.5-7b-hf](https://modelscope.cn/models/swift/llava-1.5-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)|
|llava1_5-13b-instruct|[swift/llava-1.5-13b-hf](https://modelscope.cn/models/swift/llava-1.5-13b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava1_5|✔|✔|✘|✘|transformers>=4.36|vision|[llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf)|
|llava1_6-mistral-7b-instruct|[swift/llava-v1.6-mistral-7b-hf](https://modelscope.cn/models/swift/llava-v1.6-mistral-7b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-mistral|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)|
@@ -454,9 +458,9 @@ The table below introcudes all models supported by SWIFT:
|llama3-llava-next-8b-hf|[swift/llama3-llava-next-8b-hf](https://modelscope.cn/models/swift/llama3-llava-next-8b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-llava-next-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)|
|llava-next-72b-hf|[AI-ModelScope/llava-next-72b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-72b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf)|
|llava-next-110b-hf|[AI-ModelScope/llava-next-110b-hf](https://modelscope.cn/models/AI-ModelScope/llava-next-110b-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama-qwen-hf|✔|✔|✘|✘|transformers>=4.39|vision|[llava-hf/llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf)|
-|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
-|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
-|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45.0.dev0|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
+|llava-onevision-qwen2-0_5b-ov|[AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-0.5b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-0.5b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf)|
+|llava-onevision-qwen2-7b-ov|[AI-ModelScope/llava-onevision-qwen2-7b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-7b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-7b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-7b-ov-hf)|
+|llava-onevision-qwen2-72b-ov|[AI-ModelScope/llava-onevision-qwen2-72b-ov-hf](https://modelscope.cn/models/AI-ModelScope/llava-onevision-qwen2-72b-ov-hf/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-onevision-qwen|✔|✘|✘|✘|transformers>=4.45|vision, video|[llava-hf/llava-onevision-qwen2-72b-ov-hf](https://huggingface.co/llava-hf/llava-onevision-qwen2-72b-ov-hf)|
|llama3-llava-next-8b|[AI-Modelscope/llama3-llava-next-8b](https://modelscope.cn/models/AI-Modelscope/llama3-llava-next-8b/summary)|^(model.layers\|model.mm_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llama3-llava-next|✔|✘|✘|✘||vision|[lmms-lab/llama3-llava-next-8b](https://huggingface.co/lmms-lab/llama3-llava-next-8b)|
|llava-next-72b|[AI-Modelscope/llava-next-72b](https://modelscope.cn/models/AI-Modelscope/llava-next-72b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-72b](https://huggingface.co/lmms-lab/llava-next-72b)|
|llava-next-110b|[AI-Modelscope/llava-next-110b](https://modelscope.cn/models/AI-Modelscope/llava-next-110b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|llava-qwen|✔|✘|✘|✘||vision|[lmms-lab/llava-next-110b](https://huggingface.co/lmms-lab/llava-next-110b)|
@@ -497,7 +501,7 @@ The table below introduces all models supported by SWIFT:
|minicpm-v-v2-chat|[OpenBMB/MiniCPM-V-2](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v|✔|✘|✘|✘|timm, transformers<4.42|vision|[openbmb/MiniCPM-V-2](https://huggingface.co/openbmb/MiniCPM-V-2)|
|minicpm-v-v2_5-chat|[OpenBMB/MiniCPM-Llama3-V-2_5](https://modelscope.cn/models/OpenBMB/MiniCPM-Llama3-V-2_5/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_5|✔|✔|✘|✘|timm, transformers>=4.36|vision|[openbmb/MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5)|
|minicpm-v-v2_6-chat|[OpenBMB/MiniCPM-V-2_6](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6/summary)|^(llm\|resampler)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|minicpm-v-v2_6|✔|✔|✘|✘|timm, transformers>=4.36|vision, video|[openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6)|
-|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45.0.dev0|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
+|pixtral-12b|[AI-ModelScope/pixtral-12b](https://modelscope.cn/models/AI-ModelScope/pixtral-12b/summary)|^(language_model\|multi_modal_projector)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|pixtral|✘|✘|✘|✘|transformers>=4.45|vision|[mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b)|
|mplug-owl2-chat|[iic/mPLUG-Owl2](https://modelscope.cn/models/iic/mPLUG-Owl2/summary)|q_proj, k_proj.multiway.0, k_proj.multiway.1, v_proj.multiway.0, v_proj.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[MAGAer13/mplug-owl2-llama2-7b](https://huggingface.co/MAGAer13/mplug-owl2-llama2-7b)|
|mplug-owl2_1-chat|[iic/mPLUG-Owl2.1](https://modelscope.cn/models/iic/mPLUG-Owl2.1/summary)|c_attn.multiway.0, c_attn.multiway.1|mplug-owl2|✔|✘|✘|✘|transformers<4.35, icecream|vision|[Mizukiluke/mplug_owl_2_1](https://huggingface.co/Mizukiluke/mplug_owl_2_1)|
|mplug-owl3-7b-chat|[iic/mPLUG-Owl3-7B-240728](https://modelscope.cn/models/iic/mPLUG-Owl3-7B-240728/summary)|^(language_model\|vision2text_model)(?!.\*(lm_head\|output\|emb\|wte\|shared)).\*|mplug_owl3|✔|✘|✘|✘|transformers>=4.36, icecream|vision, video|[mPLUG/mPLUG-Owl3-7B-240728](https://huggingface.co/mPLUG/mPLUG-Owl3-7B-240728)|
diff --git a/swift/llm/sft.py b/swift/llm/sft.py
index d01b360afb..c9bea3528d 100644
--- a/swift/llm/sft.py
+++ b/swift/llm/sft.py
@@ -21,8 +21,8 @@
from .accelerator import ta_accelerate
from .tuner import prepare_model
from .utils import (TEMPLATE_MAPPING, LazyLLMDataset, PtArguments, RLHFArguments, SftArguments, Template, dataset_map,
- dynamic_vit_gradient_checkpointing, get_dataset, get_model_tokenizer, get_template, get_time_info,
- print_example, set_generation_config, sort_by_max_length, stat_dataset)
+ deep_getattr, dynamic_vit_gradient_checkpointing, get_dataset, get_mllm_arch, get_model_tokenizer,
+ get_template, get_time_info, print_example, set_generation_config, sort_by_max_length, stat_dataset)
logger = get_logger()
@@ -265,6 +265,12 @@ def prepare_model_template_train(args, msg: Optional[Dict[str, Any]] = None):
model.config.use_cache = False # fix transformers==4.36
logger.info('Setting model.config.use_cache: False')
model.enable_input_require_grads()
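+        # Multimodal models: mirror enable_input_require_grads() on every vision tower that supports it.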
+ mllm_arch = get_mllm_arch(args.model_type)
+ if mllm_arch is not None:
+ for vision_tower_name in mllm_arch.vision_tower:
+ vision_tower = deep_getattr(model, vision_tower_name)
+ if hasattr(vision_tower, 'enable_input_require_grads'):
+ vision_tower.enable_input_require_grads()
if use_torchacc():
model.config.use_cache = False
diff --git a/swift/llm/utils/__init__.py b/swift/llm/utils/__init__.py
index 129981b377..3fb7c22a54 100644
--- a/swift/llm/utils/__init__.py
+++ b/swift/llm/utils/__init__.py
@@ -22,12 +22,12 @@
ModelList, UsageInfo, XRequestConfig, random_uuid)
from .template import (DEFAULT_SYSTEM, TEMPLATE_MAPPING, History, KTOTemplateMixin, Prompt, RLHFTemplateMixin,
StopWords, Template, TemplateType, get_env_args, get_template, register_template)
-from .utils import (LazyLLMDataset, LLMDataset, dataset_map, download_dataset, dynamic_vit_gradient_checkpointing,
- find_all_linears, find_embedding, find_ln, get_max_model_len, get_time_info, history_to_messages,
- inference, inference_stream, is_lmdeploy_available, is_megatron_available, is_quant_model,
- is_vllm_available, limit_history_length, messages_join_observation, messages_to_history,
- print_example, safe_tokenizer_decode, set_generation_config, sort_by_max_length, stat_dataset,
- to_device)
+from .utils import (LazyLLMDataset, LLMDataset, dataset_map, deep_getattr, download_dataset,
+ dynamic_vit_gradient_checkpointing, find_all_linears, find_embedding, find_ln, get_max_model_len,
+ get_mllm_arch, get_time_info, history_to_messages, inference, inference_stream,
+ is_lmdeploy_available, is_megatron_available, is_quant_model, is_vllm_available,
+ limit_history_length, messages_join_observation, messages_to_history, print_example,
+ safe_tokenizer_decode, set_generation_config, sort_by_max_length, stat_dataset, to_device)
logger = get_logger()
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
index 41a4691fd7..2eca790c61 100644
--- a/swift/llm/utils/argument.py
+++ b/swift/llm/utils/argument.py
@@ -31,7 +31,7 @@
from .model import (MODEL_MAPPING, dtype_mapping, get_additional_saved_files, get_default_lora_target_modules,
get_default_template_type)
from .template import TEMPLATE_MAPPING
-from .utils import is_liger_available, is_lmdeploy_available, is_quant_model, is_vllm_available
+from .utils import get_mllm_arch, is_liger_available, is_lmdeploy_available, is_quant_model, is_vllm_available
logger = get_logger()
DATASET_TYPE = Union[HfDataset, HfIterableDataset]
@@ -1048,16 +1048,12 @@ def __post_init__(self) -> None:
if self.eval_steps is None:
self.eval_steps = 50
elif self.sft_type == 'full':
- from swift.utils.module_mapping import MODEL_KEYS_MAPPING
- lora_target_modules = model_info.get('lora_target_modules') # model_group
- model_arch = None
- if isinstance(lora_target_modules, str):
- model_arch = MODEL_KEYS_MAPPING[lora_target_modules]
- if model_arch:
- if self.freeze_vit and model_arch.vision_tower:
- self.freeze_parameters += model_arch.vision_tower
- if model_arch.generator:
- self.freeze_parameters += model_arch.generator
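+            # Full-parameter training of multimodal models: freeze the vision tower when freeze_vit is set, and always freeze any generator modules.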
+ mllm_arch = get_mllm_arch(self.model_type)
+ if mllm_arch:
+ if self.freeze_vit and mllm_arch.vision_tower:
+ self.freeze_parameters += mllm_arch.vision_tower
+ if mllm_arch.generator:
+ self.freeze_parameters += mllm_arch.generator
assert 0 <= self.freeze_parameters_ratio <= 1
assert self.quantization_bit == 0, 'Full parameter fine-tuning does not support quantization.'
assert self.dtype != 'fp16', ("Fine-tuning with dtype=='fp16' can lead to NaN issues. "
diff --git a/swift/llm/utils/model.py b/swift/llm/utils/model.py
index ccd1f9c8be..3799aee022 100644
--- a/swift/llm/utils/model.py
+++ b/swift/llm/utils/model.py
@@ -265,6 +265,12 @@ class ModelType:
llama3_2_1b_instruct = 'llama3_2-1b-instruct'
llama3_2_3b = 'llama3_2-3b'
llama3_2_3b_instruct = 'llama3_2-3b-instruct'
+ # llama3.2-vision
+    llama3_2_11b_vision = 'llama3_2-11b-vision'
+ llama3_2_11b_vision_instruct = 'llama3_2-11b-vision-instruct'
+ llama3_2_90b_vision = 'llama3_2-90b-vision'
+ llama3_2_90b_vision_instruct = 'llama3_2-90b-vision-instruct'
+
# omni
llama3_1_8b_omni = 'llama3_1-8b-omni'
# reflection
@@ -644,6 +650,7 @@ class LoRATM(NamedTuple):
mplug_owl3 = 'mplug_owl3'
llama3_1_omni = 'llama3_1_omni'
got_ocr2 = 'got_ocr2'
+ llama3_2_vision = 'llama3_2_vision'
# default lora target modules for nlp llms.
minicpm3 = ['q_a_proj', 'q_b_proj', 'kv_a_proj_with_mqa', 'kv_b_proj']
baichuan = ['W_pack']
@@ -6194,6 +6201,56 @@ def get_model_tokenizer_llava_hf(model_dir: str, *args, **kwargs):
return model, tokenizer
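+# Llama3.2-Vision (Mllama) registrations: base and instruct variants at 11B and 90B, all requiring transformers>=4.45.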
+@register_model(
+ ModelType.llama3_2_11b_vision,
+ 'LLM-Research/Llama-3.2-11B-Vision',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision_generation,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-11B-Vision')
+@register_model(
+ ModelType.llama3_2_11b_vision_instruct,
+ 'LLM-Research/Llama-3.2-11B-Vision-Instruct',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-11B-Vision-Instruct')
+@register_model(
+ ModelType.llama3_2_90b_vision,
+ 'LLM-Research/Llama-3.2-90B-Vision',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision_generation,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-90B-Vision')
+@register_model(
+ ModelType.llama3_2_90b_vision_instruct,
+ 'LLM-Research/Llama-3.2-90B-Vision-Instruct',
+ LoRATM.llama3_2_vision,
+ TemplateType.llama3_2_vision,
+ support_flash_attn=True,
+ support_vllm=True,
+ ignore_file_pattern=['*.pth'],
+ requires=['transformers>=4.45'],
+ tags=['multi-modal', 'vision'],
+ hf_model_id='meta-llama/Llama-3.2-90B-Vision-Instruct')
+def get_model_tokenizer_llama3_2_vision(*args, **kwargs):
+ from transformers import MllamaForConditionalGeneration
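+    # Load with MllamaForConditionalGeneration instead of the default auto class, then reuse the generic llava-hf loader.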
+ kwargs['automodel_class'] = MllamaForConditionalGeneration
+ return get_model_tokenizer_llava_hf(*args, **kwargs)
+
+
@register_model(
ModelType.llava1_5_13b_instruct,
'swift/llava-1.5-13b-hf',
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index b4638d9124..0f3ad2a6a8 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -64,6 +64,8 @@ class TemplateType:
llama3 = 'llama3'
llama3_1_omni = 'llama3_1-omni'
llama3_2 = 'llama3_2'
+ llama3_2_vision = 'llama3_2-vision'
+ llama3_2_vision_generation = 'llama3_2-vision-generation'
reflection = 'reflection'
longwriter_llama3 = 'longwriter-llama3'
# llava-hf
@@ -1931,6 +1933,64 @@ class Llama3_2Template(Llama3_2TemplateMixin, Template):
register_template(TemplateType.llama3_2, Llama3_2Template())
+class Llama3_2VisionTemplateMixin:
+
+ def replace_tag(self, media_type, index, example) -> List[Context]:
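+        # mllama only accepts image inputs; each image is represented by a single <|image|> token.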
+ assert media_type == 'image'
+ return ['<|image|>']
+
+ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ from transformers.models.mllama.processing_mllama import (get_cross_attention_token_mask,
+ convert_sparse_cross_attention_mask_to_dense)
+ inputs, _ = super()._encode(example)
+ if len(inputs) == 0:
+ return inputs, {}
+ images = example['images']
+ if images:
+ input_ids = inputs['input_ids']
+ processor = self.tokenizer.processor
+ image_features = processor.image_processor(images, return_tensors='pt')
+ num_tiles = image_features.pop('num_tiles')
+ inputs.update(image_features)
+
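+            # Expand the sparse <|image|> token positions into a dense cross-attention mask over the image tiles.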
+ cross_attention_token_mask = [get_cross_attention_token_mask(input_ids, processor.image_token_id)]
+ cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
+ cross_attention_token_mask,
+ num_tiles=num_tiles,
+ max_num_tiles=processor.image_processor.max_image_tiles,
+ length=len(input_ids),
+ )
+ inputs['cross_attention_mask'] = torch.tensor(cross_attention_mask)
+
+ return inputs, {}
+
+ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+ res = super().data_collator(batch, padding_to)
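+        # Concatenate the aspect-ratio tensors across the batch and pad the cross-attention masks to the longest sequence.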
+ for key in ['aspect_ratio_ids', 'aspect_ratio_mask']:
+ value = [b[key] for b in batch if b.get(key) is not None]
+ if value:
+ res[key] = torch.concat(value)
+
+ cross_attention_mask = [
+ b['cross_attention_mask'][0] for b in batch if b.get('cross_attention_mask') is not None
+ ]
+ if cross_attention_mask:
+ res['cross_attention_mask'] = self.pad_sequence(cross_attention_mask, 0, self.padding_side)
+ return res
+
+
+class Llama3_2VisionTemplate(Llama3_2VisionTemplateMixin, Llama3Template):
+ pass
+
+
+class Llama3_2VisionGenerationTemplate(Llama3_2VisionTemplateMixin, DefaultGenerationTemplate):
+ pass
+
+
+register_template(TemplateType.llama3_2_vision, Llama3_2VisionTemplate(), lazy_tokenize=True)
+register_template(TemplateType.llama3_2_vision_generation, Llama3_2VisionGenerationTemplate(), lazy_tokenize=True)
+
+
class Llama3_1OmniTemplate(Llama3Template):
system = ('You are a helpful language and speech assistant. '
'You are able to understand the speech content that the user provides, '
diff --git a/swift/llm/utils/utils.py b/swift/llm/utils/utils.py
index 481d998860..5a211ab728 100644
--- a/swift/llm/utils/utils.py
+++ b/swift/llm/utils/utils.py
@@ -36,7 +36,7 @@
from swift.hub import ModelScopeConfig
from swift.utils import get_dist_setting, get_logger, is_ddp_plus_mp, stat_array, upper_bound, use_torchacc
-from swift.utils.module_mapping import MODEL_KEYS_MAPPING
+from swift.utils.module_mapping import MODEL_KEYS_MAPPING, MultiModelKeys
from .template import History, StopWords, StopWordsCriteria, Template
DATASET_TYPE = Union[HfDataset, HfIterableDataset]
@@ -458,16 +458,20 @@ def deep_getattr(model, attr: str):
return model
-def dynamic_vit_gradient_checkpointing(model, model_type: str) -> None:
- from swift.utils.module_mapping import MODEL_KEYS_MAPPING
+def get_mllm_arch(model_type: str) -> Optional[MultiModelKeys]:
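+    # Multimodal models register lora_target_modules as a string key into MODEL_KEYS_MAPPING; plain LLMs use a module list, so return None for them.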
from .model import MODEL_MAPPING
model_info = MODEL_MAPPING[model_type]
lora_target_modules = model_info.get('lora_target_modules') # model_group
-
if not isinstance(lora_target_modules, str):
+ return None
+ return MODEL_KEYS_MAPPING[lora_target_modules]
+
+
+def dynamic_vit_gradient_checkpointing(model, model_type: str) -> None:
+ mllm_arch = get_mllm_arch(model_type)
+ if mllm_arch is None:
return
- vision_tower_list = MODEL_KEYS_MAPPING[lora_target_modules].vision_tower
- for vision_tower_name in vision_tower_list:
+ for vision_tower_name in mllm_arch.vision_tower:
vision_tower = deep_getattr(model, vision_tower_name)
module_list = _find_module_list(vision_tower)
if module_list is None:
diff --git a/swift/utils/module_mapping.py b/swift/utils/module_mapping.py
index 6e4b3aac93..6c0bf6b405 100644
--- a/swift/utils/module_mapping.py
+++ b/swift/utils/module_mapping.py
@@ -296,6 +296,12 @@ def __post_init__(self):
vision_tower='model.vision_tower_high',
)
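+# Llama3.2-Vision (Mllama): language backbone, multimodal projector and vision encoder module prefixes.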
+LLAMA3_2_VISION = MultiModelKeys(
+ language_model='language_model',
+ connector='multi_modal_projector',
+ vision_tower='vision_model',
+)
+
MODEL_KEYS_MAPPING = OrderedDict([
# MLLM here
('qwen_audio', QWEN_AUDIO_KEYS),
@@ -317,6 +323,7 @@ def __post_init__(self):
('mplug_owl3', MPLUG_OWL3_KEYS),
('llama3_1_omni', LLAMA3_1_OMNI),
('got_ocr2', GOT_OCR2),
+ ('llama3_2_vision', LLAMA3_2_VISION),
# LLM begins here
('llama', LLAMA_KEYS),
('mistral', LLAMA_KEYS),