From 6e3fa6d2eecacdcad201df424d0586860cc07f1d Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 16:33:38 +0800 Subject: [PATCH 01/13] support paligemma2 (#2735) --- README.md | 6 +++--- README_CN.md | 6 +++--- ...253\351\200\237\345\274\200\345\247\213.md" | 4 ++-- ...214\346\225\260\346\215\256\351\233\206.md" | 11 +++++++++++ docs/source_en/GetStarted/Quick-start.md | 4 ++-- .../Supported-models-and-datasets.md | 11 +++++++++++ swift/llm/model/model/gemma.py | 17 +++++++++++++++++ swift/llm/template/template/gemma.py | 2 +- tests/test_align/test_template/test_vision.py | 18 +++++++++++++++--- 9 files changed, 65 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5083d6f14..82fff525f 100644 --- a/README.md +++ b/README.md @@ -55,13 +55,13 @@ You can contact us and communicate with us by adding our group: ## 📝 Introduction -🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of over 400 large models and 100+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, Llama3.2, GLM4, Internlm2.5, Yi1.5, Mistral, DeepSeek, Baichuan2, Gemma2, and TeleChat2. The multi-modal LLMs include models such as Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. +🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 400+ large models and 150+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, Llama3.3, GLM4, Internlm2.5, Yi1.5, Mistral, DeepSeek2.5, Baichuan2, Gemma2, and TeleChat2. The multi-modal LLMs include models such as Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. -🍔 In addition, ms-swift gathers the latest training technologies, including LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger. ms-swift supports accelerating the inference, evaluation, and deployment modules using vLLM and LMDeploy. To help researchers and developers fine-tune and apply large models more easily, ms-swift also provides a Gradio-based Web-UI interface and a wealth of best practices. +🍔 In addition, ms-swift gathers the latest training technologies, including LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and supports the quantization of large models and multi-modal large models using technologies such as GPTQ, AWQ, and BNB. To help researchers and developers fine-tune and apply large models more easily, ms-swift also provides a Gradio-based Web-UI interface and a wealth of best practices. **Why choose ms-swift?** -- 🍎 **Model Types**: Supports 400+ large language models and **100+ multi-modal large models** and all-to-all models, **providing a comprehensive solution from training to deployment**. 
+- 🍎 **Model Types**: Supports 400+ large language models and **150+ multi-modal large models** and all-to-all models, **providing a comprehensive solution from training to deployment**. - **Dataset Types**: Comes with 150+ pre-training, fine-tuning, human alignment, multi-modal datasets, and supports custom datasets. - **Hardware Support**: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, etc. - 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel. diff --git a/README_CN.md b/README_CN.md index 92140cb02..4e204b7d9 100644 --- a/README_CN.md +++ b/README_CN.md @@ -53,12 +53,12 @@ | ## 📝 简介 -🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持400+大模型与100+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中LLM包括:Qwen2.5、Llama3.2、GLM4、Internlm2.5、Yi1.5、Mistral、DeepSeek、Baichuan2、Gemma2、TeleChat2等模型,多模态LLM包括:Qwen2-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 +🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持450+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen2.5、Llama3.3、GLM4、Internlm2.5、Yi1.5、Mistral、DeepSeek2.5、Baichuan2、Gemma2、TeleChat2等模型,多模态大模型包括:Qwen2-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 -🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速。为了帮助研究者和开发者更轻松地微调和应用大模型,ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。 +🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速,并支持使用GPTQ、AWQ、BNB等技术对大模型和多模态大模型进行量化。为了帮助研究者和开发者更轻松地微调和应用大模型,ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。 **为什么选择ms-swift?** -- 🍎 **模型类型**:支持400+纯文本大模型、**100+多模态大模型**,All-to-All全模态模型的**训练到部署全流程**。 +- 🍎 **模型类型**:支持400+纯文本大模型、**150+多模态大模型**,All-to-All全模态模型的**训练到部署全流程**。 - **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种类型的数据集,并支持自定义数据集。 - **硬件支持**:CPU、RTX系列、T4/V100、A10/A100/H100、Ascend NPU等。 - 🍊 **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 diff --git "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 1e1e8961e..306b0a62f 100644 --- "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -1,8 +1,8 @@ # 快速开始 -ms-swift是魔搭社区提供的大模型与多模态大模型训练部署框架,现已支持400+大模型与100+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。模型开发者可以在ms-swift框架中一站式完成围绕大模型的各类需求。目前ms-swift的主要能力包含: +ms-swift是魔搭社区提供的大模型与多模态大模型训练部署框架,现已支持400+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。模型开发者可以在ms-swift框架中一站式完成围绕大模型的各类需求。目前ms-swift的主要能力包含: -- 🍎 模型类型:支持400+纯文本大模型、100+多模态大模型,All-to-All全模态模型的训练到部署全流程。 +- 🍎 模型类型:支持400+纯文本大模型、150+多模态大模型,All-to-All全模态模型的训练到部署全流程。 - 数据集类型:内置150+预训练、微调、人类对齐、多模态等各种类型的数据集,并支持自定义数据集。 - 硬件支持:CPU、RTX系列、T4/V100、A10/A100/H100、Ascend NPU等。 - 🍊 轻量训练:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" 
"b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 66b9a8027..89061f4cb 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -603,6 +603,17 @@ |[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| |[AI-ModelScope/paligemma-3b-mix-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)| |[AI-ModelScope/paligemma-3b-mix-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448)| +|[AI-ModelScope/paligemma2-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-224](https://huggingface.co/google/paligemma2-3b-pt-224)| +|[AI-ModelScope/paligemma2-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-448](https://huggingface.co/google/paligemma2-3b-pt-448)| +|[AI-ModelScope/paligemma2-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-896](https://huggingface.co/google/paligemma2-3b-pt-896)| +|[AI-ModelScope/paligemma2-10b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-224](https://huggingface.co/google/paligemma2-10b-pt-224)| +|[AI-ModelScope/paligemma2-10b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-448](https://huggingface.co/google/paligemma2-10b-pt-448)| +|[AI-ModelScope/paligemma2-10b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-896](https://huggingface.co/google/paligemma2-10b-pt-896)| +|[AI-ModelScope/paligemma2-28b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-224](https://huggingface.co/google/paligemma2-28b-pt-224)| +|[AI-ModelScope/paligemma2-28b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-448](https://huggingface.co/google/paligemma2-28b-pt-448)| +|[AI-ModelScope/paligemma2-28b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-896](https://huggingface.co/google/paligemma2-28b-pt-896)| +|[AI-ModelScope/paligemma2-3b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-ft-docci-448](https://huggingface.co/google/paligemma2-3b-ft-docci-448)| 
+|[AI-ModelScope/paligemma2-10b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-ft-docci-448](https://huggingface.co/google/paligemma2-10b-ft-docci-448)| |[LLM-Research/Molmo-7B-O-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-O-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-O-0924](https://huggingface.co/allenai/Molmo-7B-O-0924)| |[LLM-Research/Molmo-7B-D-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-D-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924)| |[LLM-Research/Molmo-72B-0924](https://modelscope.cn/models/LLM-Research/Molmo-72B-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-72B-0924](https://huggingface.co/allenai/Molmo-72B-0924)| diff --git a/docs/source_en/GetStarted/Quick-start.md b/docs/source_en/GetStarted/Quick-start.md index 415bbfb7b..47861d52e 100644 --- a/docs/source_en/GetStarted/Quick-start.md +++ b/docs/source_en/GetStarted/Quick-start.md @@ -1,8 +1,8 @@ # Quick Start -ms-swift is a comprehensive training and deployment framework for large language models and multimodal large models, provided by the ModelScope Community. It currently supports the training (CPT, SFT, RLHF), inference, evaluation, quantization, and deployment of over 400 LLM and over 100 MLLM. Model developers can fulfill all kinds of needs related to large models in a single platform within the ms-swift framework. The main capabilities of ms-swift include: +ms-swift is a comprehensive training and deployment framework for large language models and multimodal large models, provided by the ModelScope Community. It currently supports the training (CPT, SFT, RLHF), inference, evaluation, quantization, and deployment of 400+ LLM and 150+ MLLM. Model developers can fulfill all kinds of needs related to large models in a single platform within the ms-swift framework. The main capabilities of ms-swift include: -- 🍎 Model Types: Supports the full process from training to deployment of over 400 text-based large models and over 100 multimodal large models, including All-to-All all-modality models. +- 🍎 Model Types: Supports the full process from training to deployment of 400+ text-based large models and 150+ multimodal large models, including All-to-All all-modality models. - Dataset Types: Comes with more than 150 pre-built datasets for pre-training, fine-tuning, human alignment, multimodal, and supports custom datasets. - Hardware Support: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, and others. - 🍊 Lightweight Training: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel, and more. 
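For orientation, a minimal fine-tuning invocation that exercises the capabilities listed in the quick-start excerpt above might look like the sketch below. It is illustrative only and not part of this patch: the model ID, the dataset ID with its `#500` sampling suffix, and the output directory are assumptions borrowed from the example scripts that appear later in this patch series.

# Illustrative LoRA SFT sketch (IDs and paths are assumptions, not part of the diff above)
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen2.5-1.5B-Instruct \
    --train_type lora \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \
    --output_dir output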
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 71990dcec..94bfb0e38 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -603,6 +603,17 @@ The table below introduces the models integrated with ms-swift: |[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| |[AI-ModelScope/paligemma-3b-mix-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)| |[AI-ModelScope/paligemma-3b-mix-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448)| +|[AI-ModelScope/paligemma2-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-224](https://huggingface.co/google/paligemma2-3b-pt-224)| +|[AI-ModelScope/paligemma2-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-448](https://huggingface.co/google/paligemma2-3b-pt-448)| +|[AI-ModelScope/paligemma2-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-896](https://huggingface.co/google/paligemma2-3b-pt-896)| +|[AI-ModelScope/paligemma2-10b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-224](https://huggingface.co/google/paligemma2-10b-pt-224)| +|[AI-ModelScope/paligemma2-10b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-448](https://huggingface.co/google/paligemma2-10b-pt-448)| +|[AI-ModelScope/paligemma2-10b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-896](https://huggingface.co/google/paligemma2-10b-pt-896)| +|[AI-ModelScope/paligemma2-28b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-224](https://huggingface.co/google/paligemma2-28b-pt-224)| +|[AI-ModelScope/paligemma2-28b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-448](https://huggingface.co/google/paligemma2-28b-pt-448)| +|[AI-ModelScope/paligemma2-28b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-896](https://huggingface.co/google/paligemma2-28b-pt-896)| +|[AI-ModelScope/paligemma2-3b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-ft-docci-448](https://huggingface.co/google/paligemma2-3b-ft-docci-448)| 
+|[AI-ModelScope/paligemma2-10b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-ft-docci-448](https://huggingface.co/google/paligemma2-10b-ft-docci-448)| |[LLM-Research/Molmo-7B-O-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-O-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-O-0924](https://huggingface.co/allenai/Molmo-7B-O-0924)| |[LLM-Research/Molmo-7B-D-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-D-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924)| |[LLM-Research/Molmo-72B-0924](https://modelscope.cn/models/LLM-Research/Molmo-72B-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-72B-0924](https://huggingface.co/allenai/Molmo-72B-0924)| diff --git a/swift/llm/model/model/gemma.py b/swift/llm/model/model/gemma.py index 8d5b8c2dc..7ae3ba91b 100644 --- a/swift/llm/model/model/gemma.py +++ b/swift/llm/model/model/gemma.py @@ -28,9 +28,26 @@ def get_model_tokenizer_paligemma_vision(model_dir: str, Model('AI-ModelScope/paligemma-3b-pt-224', 'google/paligemma-3b-pt-224'), Model('AI-ModelScope/paligemma-3b-pt-448', 'google/paligemma-3b-pt-448'), Model('AI-ModelScope/paligemma-3b-pt-896', 'google/paligemma-3b-pt-896'), + ]), + ModelGroup([ Model('AI-ModelScope/paligemma-3b-mix-224', 'google/paligemma-3b-mix-224'), Model('AI-ModelScope/paligemma-3b-mix-448', 'google/paligemma-3b-mix-448'), ]), + ModelGroup([ + Model('AI-ModelScope/paligemma2-3b-pt-224', 'google/paligemma2-3b-pt-224'), + Model('AI-ModelScope/paligemma2-3b-pt-448', 'google/paligemma2-3b-pt-448'), + Model('AI-ModelScope/paligemma2-3b-pt-896', 'google/paligemma2-3b-pt-896'), + Model('AI-ModelScope/paligemma2-10b-pt-224', 'google/paligemma2-10b-pt-224'), + Model('AI-ModelScope/paligemma2-10b-pt-448', 'google/paligemma2-10b-pt-448'), + Model('AI-ModelScope/paligemma2-10b-pt-896', 'google/paligemma2-10b-pt-896'), + Model('AI-ModelScope/paligemma2-28b-pt-224', 'google/paligemma2-28b-pt-224'), + Model('AI-ModelScope/paligemma2-28b-pt-448', 'google/paligemma2-28b-pt-448'), + Model('AI-ModelScope/paligemma2-28b-pt-896', 'google/paligemma2-28b-pt-896'), + ]), + ModelGroup([ + Model('AI-ModelScope/paligemma2-3b-ft-docci-448', 'google/paligemma2-3b-ft-docci-448'), + Model('AI-ModelScope/paligemma2-10b-ft-docci-448', 'google/paligemma2-10b-ft-docci-448'), + ]), ], TemplateType.paligemma, get_model_tokenizer_paligemma_vision, diff --git a/swift/llm/template/template/gemma.py b/swift/llm/template/template/gemma.py index 24c1d4936..dabf3644f 100644 --- a/swift/llm/template/template/gemma.py +++ b/swift/llm/template/template/gemma.py @@ -42,7 +42,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded['token_type_ids'] = [0] * len(encoded['input_ids']) if raw_image: model_inputs = processor(text=inputs.to_history()['query'], images=raw_image[0], return_tensors='pt') - encoded['pixel_values'] = model_inputs['pixel_values'] + encoded['pixel_values'] = model_inputs['pixel_values'].to(self.config.torch_dtype) return encoded diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py index bb53b1978..6191fe450 100644 --- a/tests/test_align/test_template/test_vision.py +++ b/tests/test_align/test_template/test_vision.py @@ -180,8 +180,19 @@ def test_ovis1_6(): def test_paligemma(): - pt_engine = PtEngine('AI-ModelScope/paligemma-3b-pt-224') - _infer_model(pt_engine, 
messages=[{'role': 'user', 'content': 'caption en'}]) + pt_engine = PtEngine('AI-ModelScope/paligemma-3b-mix-224') + response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': 'detect cat'}]) + assert response == ' cat' + + +def test_paligemma2(): + pt_engine = PtEngine('AI-ModelScope/paligemma2-3b-ft-docci-448', torch_dtype=torch.bfloat16) + response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': 'caption en'}]) + assert response == ( + 'A close up view of a white kitten with black stripes on its head and body. The kitten is looking straight ' + 'ahead with its light blue eyes. The kitten has a pink nose and mouth. The kitten is sitting on a white ' + 'surface. A white light is shining on the kitten and the white surface. A shadow is being cast underneath ' + 'the kitten and the white surface.') def test_pixtral(): @@ -299,6 +310,7 @@ def test_doc_owl2(): # test_minicpmv() # test_got_ocr() # test_paligemma() + test_paligemma2() # test_pixtral() # test_llama_vision() # test_llava_hf() @@ -314,4 +326,4 @@ def test_doc_owl2(): # test_mplug_owl2() # test_molmo() # test_molmoe() - test_doc_owl2() + # test_doc_owl2() From 64cede0f413fd386d98b2e33996be7ea0f5805f3 Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 17:10:18 +0800 Subject: [PATCH 02/13] fix windows (#2733) --- ...\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +- .../Instruction/Command-line-parameters.md | 2 +- examples/export/quantize/awq.sh | 2 +- examples/export/quantize/gptq.sh | 2 +- swift/llm/argument/export_args.py | 2 +- swift/llm/model/register.py | 9 ++++++--- tests/export/quant.py | 13 ++++++++++++- 7 files changed, 23 insertions(+), 9 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 283912e13..bb867063f 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -365,7 +365,7 @@ RLHF参数继承于[训练参数](#训练参数) - 🔥output_dir: 导出结果存储路径,默认为None - 🔥quant_method: 可选为'gptq', 'awq',默认为None -- quant_n_samples: gptq/awq的校验集抽样数,默认为256 +- quant_n_samples: gptq/awq的校验集抽样数,默认为128 - max_length: 校准集的max_length, 默认值2048 - quant_batch_size: 量化batch_size,默认为1 - group_size: 量化group大小,默认为128 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index fc959e977..ef0236a8c 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -367,7 +367,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum - 🔥output_dir: Path for storing export results, default is None. - 🔥quant_method: Options are 'gptq' and 'awq', default is None. -- quant_n_samples: Sampling size for the validation set in gptq/awq, default is 256. +- quant_n_samples: Sampling size for the validation set in gptq/awq, default is 128. - max_length: Max length for the calibration set, default value is 2048. - quant_batch_size: Quantization batch size, default is 1. - group_size: Group size for quantization, default is 128. 
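Read together, the export parameters documented above (with `quant_n_samples` lowered to 128 by this patch) correspond to an invocation along the lines of the sketch below. It mirrors the `examples/export/quantize` scripts modified just after this point; the quantized output directory is an illustrative assumption.

# AWQ quantization sketch using the documented defaults (output path is an assumption)
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --model Qwen/Qwen2.5-1.5B-Instruct \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \
    --quant_method awq \
    --quant_bits 4 \
    --quant_n_samples 128 \
    --quant_batch_size 1 \
    --max_length 2048 \
    --output_dir Qwen2.5-1.5B-Instruct-AWQ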
diff --git a/examples/export/quantize/awq.sh b/examples/export/quantize/awq.sh index 571b4717d..379ad270e 100644 --- a/examples/export/quantize/awq.sh +++ b/examples/export/quantize/awq.sh @@ -3,7 +3,7 @@ swift export \ --model Qwen/Qwen2.5-1.5B-Instruct \ --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \ AI-ModelScope/alpaca-gpt4-data-en#500 \ - --quant_n_samples 256 \ + --quant_n_samples 128 \ --quant_batch_size 1 \ --max_length 2048 \ --quant_method awq \ diff --git a/examples/export/quantize/gptq.sh b/examples/export/quantize/gptq.sh index 7e207f205..f53d251bd 100644 --- a/examples/export/quantize/gptq.sh +++ b/examples/export/quantize/gptq.sh @@ -5,7 +5,7 @@ swift export \ --model Qwen/Qwen2.5-1.5B-Instruct \ --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \ AI-ModelScope/alpaca-gpt4-data-en#500 \ - --quant_n_samples 256 \ + --quant_n_samples 128 \ --quant_batch_size 1 \ --max_length 2048 \ --quant_method gptq \ diff --git a/swift/llm/argument/export_args.py b/swift/llm/argument/export_args.py index f5e24ca49..7a7f39595 100644 --- a/swift/llm/argument/export_args.py +++ b/swift/llm/argument/export_args.py @@ -33,7 +33,7 @@ class ExportArguments(MergeArguments, BaseArguments): # awq/gptq quant_method: Literal['awq', 'gptq', 'bnb'] = None - quant_n_samples: int = 256 + quant_n_samples: int = 128 max_length: int = 2048 quant_batch_size: int = 1 group_size: int = 128 diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 702f869fd..e81716c56 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import platform import re from copy import deepcopy from dataclasses import asdict, dataclass, field @@ -333,9 +334,11 @@ def get_model_name(model_id_or_path: str) -> Optional[str]: model_id_or_path = model_id_or_path.rstrip('/') match_ = re.search('/models--.+?--(.+?)/snapshots/', model_id_or_path) if match_ is not None: - model_name = match_.group(1) - else: - model_name = model_id_or_path.rsplit('/', 1)[-1] + return match_.group(1) + + model_name = model_id_or_path.rsplit('/', 1)[-1] + if platform.system().lower() == 'windows': + model_name = model_name.rsplit('\\', 1)[-1] # compat modelscope snapshot_download model_name = model_name.replace('___', '.') return model_name diff --git a/tests/export/quant.py b/tests/export/quant.py index 563244418..1c0b41dbd 100644 --- a/tests/export/quant.py +++ b/tests/export/quant.py @@ -24,6 +24,16 @@ def test_vlm_quant(quant_method: Literal['gptq', 'awq'] = 'awq'): quant_method=quant_method)) +def test_audio_quant(quant_method: Literal['gptq', 'awq'] = 'awq'): + from swift.llm import export_main, ExportArguments + export_main( + ExportArguments( + model='Qwen/Qwen2-Audio-7B-Instruct', + quant_bits=4, + dataset=['speech_asr/speech_asr_aishell1_trainsets:validation#1000'], + quant_method=quant_method)) + + def test_vlm_bnb_quant(): from swift.llm import export_main, ExportArguments, infer_main, InferArguments export_main(ExportArguments(model='Qwen/Qwen2-VL-7B-Instruct', quant_bits=4, quant_method='bnb')) @@ -34,4 +44,5 @@ def test_vlm_bnb_quant(): if __name__ == '__main__': # test_llm_quant('gptq') # test_vlm_quant('gptq') - test_vlm_bnb_quant() + test_audio_quant('gptq') + # test_vlm_bnb_quant() From f17ca92808697388b4b901e2d0bd0d5e53fa8414 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:41:13 +0800 Subject: [PATCH 03/13] support mm llamapro (#2738) --- 
swift/tuners/llamapro.py | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/swift/tuners/llamapro.py b/swift/tuners/llamapro.py index e20e7ab94..ab343d8bc 100644 --- a/swift/tuners/llamapro.py +++ b/swift/tuners/llamapro.py @@ -1,12 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from copy import deepcopy -from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields from typing import Optional import torch from torch import nn -from swift.llm import MODEL_ARCH_MAPPING, ModelKeys +from swift.llm import MODEL_ARCH_MAPPING, HfConfigFactory, ModelKeys from swift.utils.logger import get_logger from .utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput @@ -46,11 +46,9 @@ class LLaMAPro(SwiftAdapter): @staticmethod def prepare_model(model: nn.Module, config: LLaMAProConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `LLaMAProConfig`""" - num_hidden_layers = None - if hasattr(model.config, 'num_hidden_layers'): - num_hidden_layers = model.config.num_hidden_layers - elif hasattr(model.config, 'num_layers'): - num_hidden_layers = model.config.num_layers + num_hidden_layers = HfConfigFactory.get_config_attr(model.config, 'num_hidden_layers') + if num_hidden_layers is None: + num_hidden_layers = HfConfigFactory.get_config_attr(model.config, 'num_layers') assert num_hidden_layers is not None, 'Cannot find num of layers config' assert num_hidden_layers % config.num_new_blocks == 0, f'Model layers {num_hidden_layers} ' \ @@ -60,8 +58,26 @@ def prepare_model(model: nn.Module, config: LLaMAProConfig, adapter_name: str) - num_stride = num_hidden_layers // config.num_groups - # We only support decoder only model for now. - module_list = LLaMAPro._find_module_list(config, model) + try: + module_list = LLaMAPro._find_module_list(config, model) + except AssertionError as e: + model_type = LLaMAPro.search_correct_model_type(model) + if model_type is None: + language_model_name = SwiftAdapter.get_model_key_mapping(config.model_type, config).language_model + if language_model_name: + if isinstance(language_model_name, str): + language_model_name = [language_model_name] + language_model = model.get_submodule(language_model_name[0]) + model_type = LLaMAPro.search_correct_model_type(language_model) + if model_type: + model = language_model + + if model_type: + config.model_type = model_type + module_list = LLaMAPro._find_module_list(config, model) + else: + raise e + new_module_list = nn.ModuleList() new_module_idx = [] for idx, module in enumerate(module_list): @@ -107,7 +123,10 @@ def _update_module_attr(config: LLaMAProConfig, module_list): if model_type in ('llama', 'mistral', 'qwen2', 'yi', 'gemma', 'deepseek', 'openbuddy', 'xverse', 'orion', 'bluelm', 'ziya', 'skywork', 'deepseek-v2', 'minicpm', 'phi3', 'internlm2'): for idx, module in enumerate(module_list): - getattr(module, attention).layer_idx = idx + try: + getattr(module, attention).layer_idx = idx + except AttributeError: + getattr(module, 'cross_attn').layer_idx = idx elif model_type in ('chatglm', 'glm4'): for idx, module in enumerate(module_list): getattr(module, attention).layer_number = idx @@ -135,6 +154,34 @@ def get_model_key_mapping(cls, model_type, config) -> ModelKeys: 'LLaMAPro only support models with o_proj and down_proj components.' 
return model_key_mapping + @classmethod + def search_correct_model_type(cls, module: nn.Module): + for arch_name, arch_type in MODEL_ARCH_MAPPING.items(): + arch_type: ModelKeys + if getattr(arch_type, 'module_list') is None: + # Need to be a LLM arch + continue + + matched = True + for f in fields(arch_type): + arch_str = getattr(arch_type, f.name) + if f.name == 'arch_name' or arch_str is None: + continue + + arch_str = arch_str.replace('{}', '0') + try: + sub_module = module.get_submodule(arch_str) + if sub_module is None: + matched = False + except AttributeError: + matched = False + + if not matched: + break + + if matched: + return arch_name + @staticmethod def _update_module_weight(config: LLaMAProConfig, module_list, new_module_idx): model_key_mapping = LLaMAPro.get_model_key_mapping(config.model_type, config) From c1f10f4c099913c7f3f35e5f4ec67e67b4b95f09 Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 19:35:54 +0800 Subject: [PATCH 04/13] support AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B (#2739) --- ...14\346\225\260\346\215\256\351\233\206.md" | 5 +++-- .../Supported-models-and-datasets.md | 5 +++-- swift/llm/model/constant.py | 4 +++- swift/llm/model/model/llm.py | 17 ++++++++++++++- swift/llm/template/constant.py | 4 +++- swift/llm/template/template/llm.py | 12 +++++++++++ swift/version.py | 2 +- tests/test_align/test_template/test_llm.py | 21 ++++++++++++++++++- 8 files changed, 61 insertions(+), 9 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 89061f4cb..0ae7bd93f 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -418,6 +418,9 @@ |[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| |[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| +|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)| +|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| +|[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)| |[IEITYuan/Yuan2.0-2B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-2B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| |[IEITYuan/Yuan2.0-51B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-51B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| 
|[IEITYuan/Yuan2.0-102B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-102B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| @@ -451,8 +454,6 @@ |[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| |[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| -|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|-| -|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 94bfb0e38..56397f528 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -418,6 +418,9 @@ The table below introduces the models integrated with ms-swift: |[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| |[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| +|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)| +|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| +|[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)| |[IEITYuan/Yuan2.0-2B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-2B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| |[IEITYuan/Yuan2.0-51B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-51B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| |[IEITYuan/Yuan2.0-102B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-102B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| @@ -451,8 +454,6 @@ The table below introduces the models integrated with ms-swift: 
|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| |[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| -|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|-| -|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 173f4fce9..4ac9de3d7 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -78,6 +78,9 @@ class LLMModelType: gemma = 'gemma' gemma2 = 'gemma2' + skywork = 'skywork' + skywork_o1 = 'skywork_o1' + yuan2 = 'yuan2' orion = 'orion' xverse = 'xverse' @@ -89,7 +92,6 @@ class LLMModelType: grok = 'grok' mamba = 'mamba' polylm = 'polylm' - skywork = 'skywork' aya = 'aya' diff --git a/swift/llm/model/model/llm.py b/swift/llm/model/model/llm.py index e46189b38..dfc4639aa 100644 --- a/swift/llm/model/model/llm.py +++ b/swift/llm/model/model/llm.py @@ -80,7 +80,7 @@ def get_skywork_model_tokenizer(model_dir: str, LLMModelType.skywork, [ ModelGroup([ - Model('skywork/Skywork-13B-base'), + Model('skywork/Skywork-13B-base', 'skywork/Skywork-13B-base'), Model('skywork/Skywork-13B-chat'), ]), ], @@ -90,6 +90,21 @@ def get_skywork_model_tokenizer(model_dir: str, model_arch=ModelArch.llama, )) +register_model( + ModelMeta( + LLMModelType.skywork_o1, + [ + ModelGroup([ + Model('AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B', 'Skywork/Skywork-o1-Open-Llama-3.1-8B'), + ]), + ], + TemplateType.skywork_o1, + get_model_tokenizer_with_flash_attn, + architectures=['LlamaForCausalLM'], + requires=['transformers>=4.43'], + model_arch=ModelArch.llama, + )) + def get_model_tokenizer_yuan(model_dir: str, model_info: ModelInfo, diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py index 9046bc9c0..963ad75de 100644 --- a/swift/llm/template/constant.py +++ b/swift/llm/template/constant.py @@ -50,6 +50,9 @@ class LLMTemplateType: codefuse = 'codefuse' codefuse_codellama = 'codefuse_codellama' + skywork = 'skywork' + skywork_o1 = 'skywork_o1' + mistral_nemo = 'mistral_nemo' zephyr = 'zephyr' wizardlm2 = 'wizardlm2' @@ -59,7 +62,6 @@ class LLMTemplateType: yuan = 'yuan' xverse = 'xverse' - skywork = 'skywork' bluelm = 'bluelm' orion = 'orion' diff --git a/swift/llm/template/template/llm.py b/swift/llm/template/template/llm.py index b0376a19f..0cd125d8a 100644 --- a/swift/llm/template/template/llm.py +++ b/swift/llm/template/template/llm.py @@ -5,6 +5,7 @@ from ..constant import LLMTemplateType from ..register import TemplateMeta, register_template from ..utils import Prompt +from .llama import 
Llama3_2TemplateMeta from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta register_template( @@ -70,6 +71,17 @@ chat_sep=None, suffix=['[SEP]'])) +register_template( + Llama3_2TemplateMeta( + LLMTemplateType.skywork_o1, + default_system=( + 'You are Skywork-o1, a thinking model developed by Skywork AI, specializing in solving complex problems ' + "involving mathematics, coding, and logical reasoning through deep thought. When faced with a user's " + 'request, you first engage in a lengthy and in-depth thinking process to explore possible solutions to ' + 'the problem. After completing your thoughts, you then provide a detailed explanation of the solution ' + 'process in your response.'), + )) + register_template( TemplateMeta( LLMTemplateType.bluelm, diff --git a/swift/version.py b/swift/version.py index 9a09a9153..2691e62f9 100644 --- a/swift/version.py +++ b/swift/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '3.0.0.dev0' +__version__ = '3.1.0.dev0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index c43c67365..b1f8272be 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -140,6 +140,24 @@ def test_megrez(): assert res == res2, f'res: {res}, res2: {res2}' +def test_skywork_o1(): + pt_engine = PtEngine('AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B') + res = _infer_model( + pt_engine, + messages=[{ + 'role': + 'user', + 'content': + ('Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits ' + 'all her apples equally among herself and her 2 siblings. How many apples does each person get?') + }]) + assert res == ("To solve the problem, let's break it down into a series of logical steps:\n\n1. **Initial Number " + 'of Apples**: Jane starts with 12 apples.\n2. **Apples Given Away**: Jane gives 4 apples to her ' + 'friend Mark. So, the number of apples she has now is:\n \\[\n 12 - 4 = 8\n \\]\n3. **Apples ' + 'Bought**: Jane then buys 1 more apple. So, the number of apples she has now is:\n \\[\n ' + '8 + 1 = 9\n \\]\n4. 
**Apples Split Equally') + + if __name__ == '__main__': from swift.llm import PtEngine, RequestConfig, get_template, get_model_tokenizer, VllmEngine from swift.utils import get_logger, seed_everything @@ -158,4 +176,5 @@ def test_megrez(): # test_glm_edge() # test_llama() # test_openbuddy() - test_megrez() + # test_megrez() + test_skywork_o1() From f913bca710852b25060d798d588c1609d0af96c2 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 24 Dec 2024 10:29:52 +0800 Subject: [PATCH 05/13] Fix windows encoding gbk (#2741) --- scripts/benchmark/exp_utils.py | 8 ++++---- scripts/benchmark/generate_report.py | 2 +- scripts/utils/run_model_info.py | 4 ++-- swift/hub/hub.py | 2 +- swift/llm/argument/base_args/model_args.py | 2 +- swift/llm/dataset/register.py | 2 +- swift/llm/export/merge_lora.py | 2 +- swift/llm/export/ollama.py | 2 +- swift/plugin/loss_scale.py | 2 +- swift/tuners/base.py | 21 +++++++++++---------- swift/tuners/peft.py | 5 +++-- swift/tuners/utils.py | 4 ++-- swift/ui/base.py | 4 ++-- swift/ui/llm_infer/llm_infer.py | 2 +- swift/ui/llm_infer/runtime.py | 4 ++-- swift/ui/llm_train/runtime.py | 6 +++--- 16 files changed, 37 insertions(+), 35 deletions(-) diff --git a/scripts/benchmark/exp_utils.py b/scripts/benchmark/exp_utils.py index 8caa020ac..5e1e9f8ba 100644 --- a/scripts/benchmark/exp_utils.py +++ b/scripts/benchmark/exp_utils.py @@ -106,7 +106,7 @@ def assert_gpu_not_overlap(self): def run(self, exp: Experiment): if os.path.exists(os.path.join(exp.input_args.save_dir, exp.name + '.json')): - with open(os.path.join(exp.input_args.save_dir, exp.name + '.json'), 'r') as f: + with open(os.path.join(exp.input_args.save_dir, exp.name + '.json'), 'r', encoding='utf-8') as f: _json = json.load(f) if exp.eval_dataset and 'eval_result' not in _json['record']: if not exp.do_eval: @@ -238,7 +238,7 @@ def _find_free_gpu(self, n): def prepare_experiments(self, args: Any): experiments = [] for config_file in args.config: - with open(config_file, 'r') as f: + with open(config_file, 'r', encoding='utf-8') as f: group = os.path.basename(config_file) group = group[:-5] content = json.load(f) @@ -275,7 +275,7 @@ def prepare_experiments(self, args: Any): def _get_metric(exp: Experiment): if exp.do_eval: if os.path.isfile(os.path.join('exp', f'{exp.name}.eval.log')): - with open(os.path.join('exp', f'{exp.name}.eval.log'), 'r') as f: + with open(os.path.join('exp', f'{exp.name}.eval.log'), 'r', encoding='utf-8') as f: for line in f.readlines(): if 'Final report:' in line: return json.loads(line.split('Final report:')[1].replace('\'', '"')) @@ -301,7 +301,7 @@ def _get_metric(exp: Experiment): logging_dir = exp.runtime.get('logging_dir') logging_file = os.path.join(logging_dir, '..', 'logging.jsonl') if os.path.isfile(logging_file): - with open(logging_file, 'r') as f: + with open(logging_file, 'r', encoding='utf-8') as f: for line in f.readlines(): if 'model_info' in line: return json.loads(line) diff --git a/scripts/benchmark/generate_report.py b/scripts/benchmark/generate_report.py index 28332f70d..a02037360 100644 --- a/scripts/benchmark/generate_report.py +++ b/scripts/benchmark/generate_report.py @@ -284,7 +284,7 @@ def generate_export_report(outputs: List[ModelOutput]): def parse_output(file): - with open(file, 'r') as f: + with open(file, 'r', encoding='utf-8') as f: content = json.load(f) name = content['name'] diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py index 8095ca7e9..11c07698e 100644 --- a/scripts/utils/run_model_info.py +++ 
b/scripts/utils/run_model_info.py @@ -54,7 +54,7 @@ def get_model_info_table(): result[i] += text[i] for i, fpath in enumerate(fpaths): - with open(fpath, 'r') as f: + with open(fpath, 'r', encoding='utf-8') as f: text = f.read() llm_start_idx = text.find('| Model ID |') mllm_start_idx = text[llm_start_idx + 1:].find('| Model ID |') + llm_start_idx + 1 @@ -62,7 +62,7 @@ def get_model_info_table(): mllm_end_idx = text.find(end_words[i][1]) output = text[:llm_start_idx] + result[0] + '\n\n' + text[llm_end_idx:mllm_start_idx] + result[ 1] + '\n\n' + text[mllm_end_idx:] - with open(fpath, 'w') as f: + with open(fpath, 'w', encoding='utf-8') as f: f.write(output) diff --git a/swift/hub/hub.py b/swift/hub/hub.py index e684ba1fd..704cfa39e 100644 --- a/swift/hub/hub.py +++ b/swift/hub/hub.py @@ -256,7 +256,7 @@ def push_to_hub(cls, if commit_description: commit_message = commit_message + '\n' + commit_description if not os.path.exists(os.path.join(folder_path, 'configuration.json')): - with open(os.path.join(folder_path, 'configuration.json'), 'w') as f: + with open(os.path.join(folder_path, 'configuration.json'), 'w', encoding='utf-8') as f: f.write('{"framework": "pytorch", "task": "text-generation", "allow_remote": true}') if ignore_patterns: ignore_patterns = [p for p in ignore_patterns if p != '_*'] diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py index c397238d9..bf61e07c4 100644 --- a/swift/llm/argument/base_args/model_args.py +++ b/swift/llm/argument/base_args/model_args.py @@ -52,7 +52,7 @@ def parse_to_dict(value: Union[str, Dict, None], strict: bool = True) -> Union[s value = {} elif isinstance(value, str): if os.path.exists(value): # local path - with open(value, 'r') as f: + with open(value, 'r', encoding='utf-8') as f: value = json.load(f) else: # json str try: diff --git a/swift/llm/dataset/register.py b/swift/llm/dataset/register.py index 62de5fd27..c7e4700f7 100644 --- a/swift/llm/dataset/register.py +++ b/swift/llm/dataset/register.py @@ -157,7 +157,7 @@ def register_dataset_info(dataset_info: Union[str, List[str], None] = None) -> L if os.path.isfile(dataset_path): log_msg = dataset_path base_dir = os.path.dirname(dataset_path) - with open(dataset_path, 'r') as f: + with open(dataset_path, 'r', encoding='utf-8') as f: dataset_info = json.load(f) else: dataset_info = json.loads(dataset_info) # json diff --git a/swift/llm/export/merge_lora.py b/swift/llm/export/merge_lora.py index 266397dfc..a00ef26f6 100644 --- a/swift/llm/export/merge_lora.py +++ b/swift/llm/export/merge_lora.py @@ -60,7 +60,7 @@ def merge_lora(args: ExportArguments, device_map=None, replace_if_exists=False) '{base_model}', base_model) try: yamlfile = os.path.join(tempdir, 'mergekit.yaml') - with open(yamlfile, 'w') as f: + with open(yamlfile, 'w', encoding='utf-8') as f: f.write(merge_yaml) logger.info(f'Merging with config: {merge_yaml}') os.system(f'mergekit-yaml {yamlfile} {mergekit_path}') diff --git a/swift/llm/export/ollama.py b/swift/llm/export/ollama.py index c1e4d44ea..90e3fbcab 100644 --- a/swift/llm/export/ollama.py +++ b/swift/llm/export/ollama.py @@ -36,7 +36,7 @@ def export_to_ollama(args: ExportArguments): pt_engine = PtEngine.from_model_template(model, template) logger.info(f'Using model_dir: {pt_engine.model_dir}') template_meta = template.template_meta - with open(os.path.join(args.output_dir, 'Modelfile'), 'w') as f: + with open(os.path.join(args.output_dir, 'Modelfile'), 'w', encoding='utf-8') as f: f.write(f'FROM {pt_engine.model_dir}\n') 
f.write(f'TEMPLATE """{{{{ if .System }}}}' f'{replace_and_concat(template, template_meta.system_prefix, "{{SYSTEM}}", "{{ .System }}")}' diff --git a/swift/plugin/loss_scale.py b/swift/plugin/loss_scale.py index f5e9f5b92..275d2e0e4 100644 --- a/swift/plugin/loss_scale.py +++ b/swift/plugin/loss_scale.py @@ -69,7 +69,7 @@ def __init__(self): if self.loss_scale_config is not None: path = os.path.dirname(os.path.abspath(__file__)) config_path = os.path.join(path, 'agent', self.loss_scale_config) - with open(config_path, 'r') as json_file: + with open(config_path, 'r', encoding='utf-8') as json_file: self.loss_scale_map = json.load(json_file) else: self.loss_scale_map = None diff --git a/swift/tuners/base.py b/swift/tuners/base.py index dc9d47747..654e7c5be 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -322,7 +322,7 @@ def from_pretrained(cls, raise ValueError(f'Please pass in a local dir or a model id, not a local file: {model_dir}') extra_state_keys = kwargs.pop('extra_state_keys', None) if extra_state_keys is None and os.path.isfile(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME)): - with open(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME), 'r') as file: + with open(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME), 'r', encoding='utf-8') as file: _json = json.load(file) extra_state_keys = _json.get('extra_state_keys') if adapter_name is None: @@ -340,7 +340,7 @@ def from_pretrained(cls, logger.warning(f'{_name} is not a valid tuner') continue - with open(config_file, 'r') as file: + with open(config_file, 'r', encoding='utf-8') as file: json_object = json.load(file) if SWIFT_TYPE_KEY not in json_object: @@ -395,7 +395,7 @@ def create_or_update_model_card(self, output_dir: str): if not os.path.exists(os.path.join(output_dir, 'README.md')): lines = [] else: - with open(os.path.join(output_dir, 'README.md'), 'r') as f: + with open(os.path.join(output_dir, 'README.md'), 'r', encoding='utf-8') as f: lines = f.readlines() quantization_config = None @@ -426,7 +426,7 @@ def create_or_update_model_card(self, output_dir: str): lines.append(f'{base_model_heading}\n\n- BaseModel Class {self.base_model.__class__.__name__}\n') # write the lines back to README.md - with open(os.path.join(output_dir, 'README.md'), 'w') as f: + with open(os.path.join(output_dir, 'README.md'), 'w', encoding='utf-8') as f: f.writelines(lines) def add_weighted_adapter( @@ -587,13 +587,14 @@ def save_pretrained(self, os.makedirs(os.path.join(save_directory, self.EXTRA_STATE_DIR), exist_ok=True) self._save_state_dict(output_state_dict, os.path.join(save_directory, self.EXTRA_STATE_DIR), safe_serialization) - with open(os.path.join(save_directory, self.EXTRA_STATE_DIR, CONFIG_NAME), 'w') as file: + with open( + os.path.join(save_directory, self.EXTRA_STATE_DIR, CONFIG_NAME), 'w', encoding='utf-8') as file: json.dump({'extra_state_keys': self.extra_state_keys}, file) else: logger.error('Full parameter training, save_extra_states will be ignored') if not os.path.exists(os.path.join(save_directory, 'configuration.json')): - with open(os.path.join(save_directory, 'configuration.json'), 'w') as f: + with open(os.path.join(save_directory, 'configuration.json'), 'w', encoding='utf-8') as f: f.write('{}') @staticmethod @@ -776,7 +777,7 @@ def has_custom_content(_json): return not LoRAConfig(**_json).can_be_saved_to_peft() for adapter in adapter_names: - with open(os.path.join(ckpt_dir, adapter, CONFIG_NAME)) as f: + with open(os.path.join(ckpt_dir, adapter, CONFIG_NAME), encoding='utf-8') 
as f: _json = json.load(f) if has_custom_content(_json): raise AssertionError('Cannot transfer to peft format, ' @@ -802,7 +803,7 @@ def has_custom_content(_json): state_dict = new_state_dict SwiftModel._save_state_dict(state_dict, os.path.join(output_dir, adapter), safe_serialization) from swift import LoRAConfig - with open(os.path.join(output_dir, adapter, CONFIG_NAME)) as f: + with open(os.path.join(output_dir, adapter, CONFIG_NAME), encoding='utf-8') as f: _json = json.load(f) peft_config = LoRAConfig(**_json).to_peft_config() peft_config.save_pretrained(os.path.join(output_dir, adapter)) @@ -836,7 +837,7 @@ def from_pretrained(model: Union[nn.Module, SwiftModel, PeftModel], model_id = snapshot_download(model_id, revision=revision) is_peft_model = False if os.path.exists(os.path.join(model_id, CONFIG_NAME)): - with open(os.path.join(model_id, CONFIG_NAME), 'r') as f: + with open(os.path.join(model_id, CONFIG_NAME), 'r', encoding='utf-8') as f: _json = json.load(f) is_peft_model = SWIFT_TYPE_KEY not in _json @@ -845,7 +846,7 @@ def from_pretrained(model: Union[nn.Module, SwiftModel, PeftModel], if isinstance(adapter_name, list) else list(adapter_name.keys())[0] _name = _name or '' if os.path.exists(os.path.join(model_id, _name, CONFIG_NAME)): - with open(os.path.join(model_id, _name, CONFIG_NAME), 'r') as f: + with open(os.path.join(model_id, _name, CONFIG_NAME), 'r', encoding='utf-8') as f: _json = json.load(f) is_peft_model = SWIFT_TYPE_KEY not in _json and 'extra_state_keys' not in _json if is_peft_model: diff --git a/swift/tuners/peft.py b/swift/tuners/peft.py index a01b28d87..c8cdd0708 100644 --- a/swift/tuners/peft.py +++ b/swift/tuners/peft.py @@ -61,7 +61,7 @@ def save_pretrained(self, save_directory: str, **kwargs) -> None: 'lorap_lr_ratio': self.lorap_lr_ratio, 'lorap_emb_lr': self.lorap_emb_lr, } - with open(os.path.join(save_directory, 'additional_config.json'), 'w') as f: + with open(os.path.join(save_directory, 'additional_config.json'), 'w', encoding='utf-8') as f: json.dump(additional_args, f) @classmethod @@ -75,7 +75,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional self = LoraConfig(**self.to_dict()) if os.path.isfile(os.path.join(pretrained_model_name_or_path, 'additional_config.json')): - with open(os.path.join(pretrained_model_name_or_path, 'additional_config.json'), 'r') as f: + with open( + os.path.join(pretrained_model_name_or_path, 'additional_config.json'), 'r', encoding='utf-8') as f: _json = json.load(f) for key, value in _json.items(): setattr(self, key, value) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 72d5904f2..bb23221bc 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -59,7 +59,7 @@ def save_pretrained(self, save_directory, **kwargs): output_path = os.path.join(save_directory, CONFIG_NAME) # save it - with open(output_path, 'w') as writer: + with open(output_path, 'w', encoding='utf-8') as writer: writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) @classmethod @@ -103,7 +103,7 @@ def from_json_file(cls, path_json_file, **kwargs): path_json_file (`str`): The path to the json file. 
""" - with open(path_json_file, 'r') as file: + with open(path_json_file, 'r', encoding='utf-8') as file: json_object = json.load(file) return json_object diff --git a/swift/ui/base.py b/swift/ui/base.py index 5ee66c01a..ed6c389ac 100644 --- a/swift/ui/base.py +++ b/swift/ui/base.py @@ -138,7 +138,7 @@ def save_cache(cls, key, value): timestamp = str(int(time.time())) key = key.replace('/', '-') filename = os.path.join(cls.cache_dir, key + '-' + timestamp) - with open(filename, 'w') as f: + with open(filename, 'w', encoding='utf-8') as f: json.dump(value, f) @classmethod @@ -161,7 +161,7 @@ def load_cache(cls, key, timestamp) -> BaseArguments: timestamp = int(dt_object.timestamp()) key = key.replace('/', '-') filename = key + '-' + str(timestamp) - with open(os.path.join(cls.cache_dir, filename), 'r') as f: + with open(os.path.join(cls.cache_dir, filename), 'r', encoding='utf-8') as f: return json.load(f) @classmethod diff --git a/swift/ui/llm_infer/llm_infer.py b/swift/ui/llm_infer/llm_infer.py index 07dbd9a43..65ac9f202 100644 --- a/swift/ui/llm_infer/llm_infer.py +++ b/swift/ui/llm_infer/llm_infer.py @@ -230,7 +230,7 @@ def deploy(cls, *args): model = kwargs.get('model') if os.path.exists(model) and os.path.exists(os.path.join(model, 'args.json')): kwargs['ckpt_dir'] = kwargs.pop('model') - with open(os.path.join(kwargs['ckpt_dir'], 'args.json'), 'r') as f: + with open(os.path.join(kwargs['ckpt_dir'], 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) kwargs['model_type'] = _json['model_type'] kwargs['train_type'] = _json['train_type'] diff --git a/swift/ui/llm_infer/runtime.py b/swift/ui/llm_infer/runtime.py index 6dbb37812..6b086cf2b 100644 --- a/swift/ui/llm_infer/runtime.py +++ b/swift/ui/llm_infer/runtime.py @@ -131,7 +131,7 @@ def wait(cls, task): latest_data = '' lines = collections.deque(maxlen=int(os.environ.get('MAX_LOG_LINES', 50))) try: - with open(log_file, 'r') as input: + with open(log_file, 'r', encoding='utf-8') as input: input.seek(offset) fail_cnt = 0 while True: @@ -268,7 +268,7 @@ def task_changed(cls, task, base_tab): ret.append(gr.update()) train_type = None if is_custom_path: - with open(os.path.join(all_args['ckpt_dir'], 'args.json'), 'r') as f: + with open(os.path.join(all_args['ckpt_dir'], 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) train_type = _json.get('train_type') return ret + [gr.update(value=None), [all_args.get('model_type'), all_args.get('template_type'), train_type]] diff --git a/swift/ui/llm_train/runtime.py b/swift/ui/llm_train/runtime.py index 218d0d6f2..a8a02c198 100644 --- a/swift/ui/llm_train/runtime.py +++ b/swift/ui/llm_train/runtime.py @@ -319,7 +319,7 @@ def wait(cls, logging_dir, task): latest_data = '' lines = collections.deque(maxlen=int(os.environ.get('MAX_LOG_LINES', 50))) try: - with open(log_file, 'r') as input: + with open(log_file, 'r', encoding='utf-8') as input: input.seek(offset) fail_cnt = 0 while True: @@ -451,8 +451,8 @@ def parse_info_from_cmdline(task): all_args[splits[0]] = splits[1] output_dir = all_args['output_dir'] - if os.path.exists(os.path.join(output_dir, 'sft_args.json')): - with open(os.path.join(output_dir, 'sft_args.json'), 'r') as f: + if os.path.exists(os.path.join(output_dir, 'args.json')): + with open(os.path.join(output_dir, 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) for key in all_args.keys(): all_args[key] = _json.get(key) From 00c2eaa97c6891684f34a00b556ca47d20103ab8 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 24 Dec 2024 10:52:01 +0800 Subject: 
[PATCH 06/13] fix docs multimodal; fix pretrain mllm (#2742) --- ...11\346\225\260\346\215\256\351\233\206.md" | 20 +++++++++++++---- .../source_en/Customization/Custom-dataset.md | 22 +++++++++++++++---- swift/llm/template/base.py | 1 + swift/llm/template/template/gemma.py | 2 +- tests/test_align/test_template/test_vision.py | 7 +++--- 5 files changed, 39 insertions(+), 13 deletions(-) diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index da454cc19..aaa018144 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -34,7 +34,7 @@ query-response格式: ## 推荐数据集格式 -以下给出ms-swift的推荐数据集格式: +以下给出ms-swift的推荐数据集格式,其中system字段是可选的,默认使用template中定义的`default_system`。 ### 预训练 @@ -69,11 +69,23 @@ query-response格式: ### 多模态 -对于多模态数据集,和上述任务的格式相同。区别在于增加了`images`, `videos`, `audios`几个key,分别代表多模态资源: +对于多模态数据集,和上述任务的格式相同。区别在于增加了`images`, `videos`, `audios`几个key,分别代表多模态资源,`` `