From 6e3fa6d2eecacdcad201df424d0586860cc07f1d Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 16:33:38 +0800 Subject: [PATCH 01/13] support paligemma2 (#2735) --- README.md | 6 +++--- README_CN.md | 6 +++--- ...253\351\200\237\345\274\200\345\247\213.md" | 4 ++-- ...214\346\225\260\346\215\256\351\233\206.md" | 11 +++++++++++ docs/source_en/GetStarted/Quick-start.md | 4 ++-- .../Supported-models-and-datasets.md | 11 +++++++++++ swift/llm/model/model/gemma.py | 17 +++++++++++++++++ swift/llm/template/template/gemma.py | 2 +- tests/test_align/test_template/test_vision.py | 18 +++++++++++++++--- 9 files changed, 65 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 5083d6f14..82fff525f 100644 --- a/README.md +++ b/README.md @@ -55,13 +55,13 @@ You can contact us and communicate with us by adding our group: ## 📝 Introduction -🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of over 400 large models and 100+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, Llama3.2, GLM4, Internlm2.5, Yi1.5, Mistral, DeepSeek, Baichuan2, Gemma2, and TeleChat2. The multi-modal LLMs include models such as Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. +🍲 ms-swift is an official framework provided by the ModelScope community for fine-tuning and deploying large language models and multi-modal large models. It currently supports the training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 400+ large models and 150+ multi-modal large models. These large language models (LLMs) include models such as Qwen2.5, Llama3.3, GLM4, Internlm2.5, Yi1.5, Mistral, DeepSeek2.5, Baichuan2, Gemma2, and TeleChat2. The multi-modal LLMs include models such as Qwen2-VL, Qwen2-Audio, Llama3.2-Vision, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, and GOT-OCR2. -🍔 In addition, ms-swift gathers the latest training technologies, including LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger. ms-swift supports accelerating the inference, evaluation, and deployment modules using vLLM and LMDeploy. To help researchers and developers fine-tune and apply large models more easily, ms-swift also provides a Gradio-based Web-UI interface and a wealth of best practices. +🍔 In addition, ms-swift gathers the latest training technologies, including LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger. ms-swift supports acceleration of inference, evaluation, and deployment modules using vLLM and LMDeploy, and supports the quantization of large models and multi-modal large models using technologies such as GPTQ, AWQ, and BNB. To help researchers and developers fine-tune and apply large models more easily, ms-swift also provides a Gradio-based Web-UI interface and a wealth of best practices. **Why choose ms-swift?** -- 🍎 **Model Types**: Supports 400+ large language models and **100+ multi-modal large models** and all-to-all models, **providing a comprehensive solution from training to deployment**. 
+- 🍎 **Model Types**: Supports 400+ large language models and **150+ multi-modal large models** and all-to-all models, **providing a comprehensive solution from training to deployment**. - **Dataset Types**: Comes with 150+ pre-training, fine-tuning, human alignment, multi-modal datasets, and supports custom datasets. - **Hardware Support**: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, etc. - 🍊 **Lightweight Training**: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel. diff --git a/README_CN.md b/README_CN.md index 92140cb02..4e204b7d9 100644 --- a/README_CN.md +++ b/README_CN.md @@ -53,12 +53,12 @@ | ## 📝 简介 -🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持400+大模型与100+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中LLM包括:Qwen2.5、Llama3.2、GLM4、Internlm2.5、Yi1.5、Mistral、DeepSeek、Baichuan2、Gemma2、TeleChat2等模型,多模态LLM包括:Qwen2-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 +🍲 ms-swift是魔搭社区提供的大模型与多模态大模型微调部署框架,现已支持450+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。其中大模型包括:Qwen2.5、Llama3.3、GLM4、Internlm2.5、Yi1.5、Mistral、DeepSeek2.5、Baichuan2、Gemma2、TeleChat2等模型,多模态大模型包括:Qwen2-VL、Qwen2-Audio、Llama3.2-Vision、Llava、InternVL2.5、MiniCPM-V-2.6、GLM4v、Xcomposer2.5、Yi-VL、DeepSeek-VL2、Phi3.5-Vision、GOT-OCR2等模型。 -🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速。为了帮助研究者和开发者更轻松地微调和应用大模型,ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。 +🍔 除此之外,ms-swift汇集了最新的训练技术,包括LoRA、QLoRA、Llama-Pro、LongLoRA、GaLore、Q-GaLore、LoRA+、LISA、DoRA、FourierFt、ReFT、UnSloth、和Liger等。ms-swift支持使用vLLM和LMDeploy对推理、评测和部署模块进行加速,并支持使用GPTQ、AWQ、BNB等技术对大模型和多模态大模型进行量化。为了帮助研究者和开发者更轻松地微调和应用大模型,ms-swift还提供了基于Gradio的Web-UI界面及丰富的最佳实践。 **为什么选择ms-swift?** -- 🍎 **模型类型**:支持400+纯文本大模型、**100+多模态大模型**,All-to-All全模态模型的**训练到部署全流程**。 +- 🍎 **模型类型**:支持400+纯文本大模型、**150+多模态大模型**,All-to-All全模态模型的**训练到部署全流程**。 - **数据集类型**:内置150+预训练、微调、人类对齐、多模态等各种类型的数据集,并支持自定义数据集。 - **硬件支持**:CPU、RTX系列、T4/V100、A10/A100/H100、Ascend NPU等。 - 🍊 **轻量训练**:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 diff --git "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 1e1e8961e..306b0a62f 100644 --- "a/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source/GetStarted/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -1,8 +1,8 @@ # 快速开始 -ms-swift是魔搭社区提供的大模型与多模态大模型训练部署框架,现已支持400+大模型与100+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。模型开发者可以在ms-swift框架中一站式完成围绕大模型的各类需求。目前ms-swift的主要能力包含: +ms-swift是魔搭社区提供的大模型与多模态大模型训练部署框架,现已支持400+大模型与150+多模态大模型的训练(预训练、微调、人类对齐)、推理、评测、量化与部署。模型开发者可以在ms-swift框架中一站式完成围绕大模型的各类需求。目前ms-swift的主要能力包含: -- 🍎 模型类型:支持400+纯文本大模型、100+多模态大模型,All-to-All全模态模型的训练到部署全流程。 +- 🍎 模型类型:支持400+纯文本大模型、150+多模态大模型,All-to-All全模态模型的训练到部署全流程。 - 数据集类型:内置150+预训练、微调、人类对齐、多模态等各种类型的数据集,并支持自定义数据集。 - 硬件支持:CPU、RTX系列、T4/V100、A10/A100/H100、Ascend NPU等。 - 🍊 轻量训练:支持了LoRA、QLoRA、DoRA、LoRA+、ReFT、RS-LoRA、LLaMAPro、Adapter、GaLore、Q-Galore、LISA、UnSloth、Liger-Kernel等轻量微调方式。 diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" 
"b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 66b9a8027..89061f4cb 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -603,6 +603,17 @@ |[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| |[AI-ModelScope/paligemma-3b-mix-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)| |[AI-ModelScope/paligemma-3b-mix-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448)| +|[AI-ModelScope/paligemma2-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-224](https://huggingface.co/google/paligemma2-3b-pt-224)| +|[AI-ModelScope/paligemma2-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-448](https://huggingface.co/google/paligemma2-3b-pt-448)| +|[AI-ModelScope/paligemma2-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-896](https://huggingface.co/google/paligemma2-3b-pt-896)| +|[AI-ModelScope/paligemma2-10b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-224](https://huggingface.co/google/paligemma2-10b-pt-224)| +|[AI-ModelScope/paligemma2-10b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-448](https://huggingface.co/google/paligemma2-10b-pt-448)| +|[AI-ModelScope/paligemma2-10b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-896](https://huggingface.co/google/paligemma2-10b-pt-896)| +|[AI-ModelScope/paligemma2-28b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-224](https://huggingface.co/google/paligemma2-28b-pt-224)| +|[AI-ModelScope/paligemma2-28b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-448](https://huggingface.co/google/paligemma2-28b-pt-448)| +|[AI-ModelScope/paligemma2-28b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-896](https://huggingface.co/google/paligemma2-28b-pt-896)| +|[AI-ModelScope/paligemma2-3b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-ft-docci-448](https://huggingface.co/google/paligemma2-3b-ft-docci-448)| 
+|[AI-ModelScope/paligemma2-10b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-ft-docci-448](https://huggingface.co/google/paligemma2-10b-ft-docci-448)| |[LLM-Research/Molmo-7B-O-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-O-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-O-0924](https://huggingface.co/allenai/Molmo-7B-O-0924)| |[LLM-Research/Molmo-7B-D-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-D-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924)| |[LLM-Research/Molmo-72B-0924](https://modelscope.cn/models/LLM-Research/Molmo-72B-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-72B-0924](https://huggingface.co/allenai/Molmo-72B-0924)| diff --git a/docs/source_en/GetStarted/Quick-start.md b/docs/source_en/GetStarted/Quick-start.md index 415bbfb7b..47861d52e 100644 --- a/docs/source_en/GetStarted/Quick-start.md +++ b/docs/source_en/GetStarted/Quick-start.md @@ -1,8 +1,8 @@ # Quick Start -ms-swift is a comprehensive training and deployment framework for large language models and multimodal large models, provided by the ModelScope Community. It currently supports the training (CPT, SFT, RLHF), inference, evaluation, quantization, and deployment of over 400 LLM and over 100 MLLM. Model developers can fulfill all kinds of needs related to large models in a single platform within the ms-swift framework. The main capabilities of ms-swift include: +ms-swift is a comprehensive training and deployment framework for large language models and multimodal large models, provided by the ModelScope Community. It currently supports the training (CPT, SFT, RLHF), inference, evaluation, quantization, and deployment of 400+ LLM and 150+ MLLM. Model developers can fulfill all kinds of needs related to large models in a single platform within the ms-swift framework. The main capabilities of ms-swift include: -- 🍎 Model Types: Supports the full process from training to deployment of over 400 text-based large models and over 100 multimodal large models, including All-to-All all-modality models. +- 🍎 Model Types: Supports the full process from training to deployment of 400+ text-based large models and 150+ multimodal large models, including All-to-All all-modality models. - Dataset Types: Comes with more than 150 pre-built datasets for pre-training, fine-tuning, human alignment, multimodal, and supports custom datasets. - Hardware Support: Compatible with CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, and others. - 🍊 Lightweight Training: Supports lightweight fine-tuning methods like LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel, and more. 
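For orientation, a minimal fine-tuning invocation that exercises the capabilities listed in the quick-start excerpt above might look like the sketch below. It is illustrative only and not part of this patch: the model ID, the dataset ID with its `#500` sampling suffix, and the output directory are assumptions borrowed from the example scripts that appear later in this patch series.

# Illustrative LoRA SFT sketch (IDs and paths are assumptions, not part of the diff above)
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen2.5-1.5B-Instruct \
    --train_type lora \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \
    --output_dir output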
diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 71990dcec..94bfb0e38 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -603,6 +603,17 @@ The table below introduces the models integrated with ms-swift: |[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)| |[AI-ModelScope/paligemma-3b-mix-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)| |[AI-ModelScope/paligemma-3b-mix-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-mix-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma-3b-mix-448](https://huggingface.co/google/paligemma-3b-mix-448)| +|[AI-ModelScope/paligemma2-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-224](https://huggingface.co/google/paligemma2-3b-pt-224)| +|[AI-ModelScope/paligemma2-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-448](https://huggingface.co/google/paligemma2-3b-pt-448)| +|[AI-ModelScope/paligemma2-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-pt-896](https://huggingface.co/google/paligemma2-3b-pt-896)| +|[AI-ModelScope/paligemma2-10b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-224](https://huggingface.co/google/paligemma2-10b-pt-224)| +|[AI-ModelScope/paligemma2-10b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-448](https://huggingface.co/google/paligemma2-10b-pt-448)| +|[AI-ModelScope/paligemma2-10b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-pt-896](https://huggingface.co/google/paligemma2-10b-pt-896)| +|[AI-ModelScope/paligemma2-28b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-224)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-224](https://huggingface.co/google/paligemma2-28b-pt-224)| +|[AI-ModelScope/paligemma2-28b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-448](https://huggingface.co/google/paligemma2-28b-pt-448)| +|[AI-ModelScope/paligemma2-28b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma2-28b-pt-896)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-28b-pt-896](https://huggingface.co/google/paligemma2-28b-pt-896)| +|[AI-ModelScope/paligemma2-3b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-3b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-3b-ft-docci-448](https://huggingface.co/google/paligemma2-3b-ft-docci-448)| 
+|[AI-ModelScope/paligemma2-10b-ft-docci-448](https://modelscope.cn/models/AI-ModelScope/paligemma2-10b-ft-docci-448)|paligemma|paligemma|transformers>=4.41|vision|[google/paligemma2-10b-ft-docci-448](https://huggingface.co/google/paligemma2-10b-ft-docci-448)| |[LLM-Research/Molmo-7B-O-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-O-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-O-0924](https://huggingface.co/allenai/Molmo-7B-O-0924)| |[LLM-Research/Molmo-7B-D-0924](https://modelscope.cn/models/LLM-Research/Molmo-7B-D-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-7B-D-0924](https://huggingface.co/allenai/Molmo-7B-D-0924)| |[LLM-Research/Molmo-72B-0924](https://modelscope.cn/models/LLM-Research/Molmo-72B-0924)|molmo|molmo|transformers>=4.45|vision|[allenai/Molmo-72B-0924](https://huggingface.co/allenai/Molmo-72B-0924)| diff --git a/swift/llm/model/model/gemma.py b/swift/llm/model/model/gemma.py index 8d5b8c2dc..7ae3ba91b 100644 --- a/swift/llm/model/model/gemma.py +++ b/swift/llm/model/model/gemma.py @@ -28,9 +28,26 @@ def get_model_tokenizer_paligemma_vision(model_dir: str, Model('AI-ModelScope/paligemma-3b-pt-224', 'google/paligemma-3b-pt-224'), Model('AI-ModelScope/paligemma-3b-pt-448', 'google/paligemma-3b-pt-448'), Model('AI-ModelScope/paligemma-3b-pt-896', 'google/paligemma-3b-pt-896'), + ]), + ModelGroup([ Model('AI-ModelScope/paligemma-3b-mix-224', 'google/paligemma-3b-mix-224'), Model('AI-ModelScope/paligemma-3b-mix-448', 'google/paligemma-3b-mix-448'), ]), + ModelGroup([ + Model('AI-ModelScope/paligemma2-3b-pt-224', 'google/paligemma2-3b-pt-224'), + Model('AI-ModelScope/paligemma2-3b-pt-448', 'google/paligemma2-3b-pt-448'), + Model('AI-ModelScope/paligemma2-3b-pt-896', 'google/paligemma2-3b-pt-896'), + Model('AI-ModelScope/paligemma2-10b-pt-224', 'google/paligemma2-10b-pt-224'), + Model('AI-ModelScope/paligemma2-10b-pt-448', 'google/paligemma2-10b-pt-448'), + Model('AI-ModelScope/paligemma2-10b-pt-896', 'google/paligemma2-10b-pt-896'), + Model('AI-ModelScope/paligemma2-28b-pt-224', 'google/paligemma2-28b-pt-224'), + Model('AI-ModelScope/paligemma2-28b-pt-448', 'google/paligemma2-28b-pt-448'), + Model('AI-ModelScope/paligemma2-28b-pt-896', 'google/paligemma2-28b-pt-896'), + ]), + ModelGroup([ + Model('AI-ModelScope/paligemma2-3b-ft-docci-448', 'google/paligemma2-3b-ft-docci-448'), + Model('AI-ModelScope/paligemma2-10b-ft-docci-448', 'google/paligemma2-10b-ft-docci-448'), + ]), ], TemplateType.paligemma, get_model_tokenizer_paligemma_vision, diff --git a/swift/llm/template/template/gemma.py b/swift/llm/template/template/gemma.py index 24c1d4936..dabf3644f 100644 --- a/swift/llm/template/template/gemma.py +++ b/swift/llm/template/template/gemma.py @@ -42,7 +42,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]: encoded['token_type_ids'] = [0] * len(encoded['input_ids']) if raw_image: model_inputs = processor(text=inputs.to_history()['query'], images=raw_image[0], return_tensors='pt') - encoded['pixel_values'] = model_inputs['pixel_values'] + encoded['pixel_values'] = model_inputs['pixel_values'].to(self.config.torch_dtype) return encoded diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py index bb53b1978..6191fe450 100644 --- a/tests/test_align/test_template/test_vision.py +++ b/tests/test_align/test_template/test_vision.py @@ -180,8 +180,19 @@ def test_ovis1_6(): def test_paligemma(): - pt_engine = PtEngine('AI-ModelScope/paligemma-3b-pt-224') - _infer_model(pt_engine, 
messages=[{'role': 'user', 'content': 'caption en'}]) + pt_engine = PtEngine('AI-ModelScope/paligemma-3b-mix-224') + response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': 'detect cat'}]) + assert response == ' cat' + + +def test_paligemma2(): + pt_engine = PtEngine('AI-ModelScope/paligemma2-3b-ft-docci-448', torch_dtype=torch.bfloat16) + response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': 'caption en'}]) + assert response == ( + 'A close up view of a white kitten with black stripes on its head and body. The kitten is looking straight ' + 'ahead with its light blue eyes. The kitten has a pink nose and mouth. The kitten is sitting on a white ' + 'surface. A white light is shining on the kitten and the white surface. A shadow is being cast underneath ' + 'the kitten and the white surface.') def test_pixtral(): @@ -299,6 +310,7 @@ def test_doc_owl2(): # test_minicpmv() # test_got_ocr() # test_paligemma() + test_paligemma2() # test_pixtral() # test_llama_vision() # test_llava_hf() @@ -314,4 +326,4 @@ def test_doc_owl2(): # test_mplug_owl2() # test_molmo() # test_molmoe() - test_doc_owl2() + # test_doc_owl2() From 64cede0f413fd386d98b2e33996be7ea0f5805f3 Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 17:10:18 +0800 Subject: [PATCH 02/13] fix windows (#2733) --- ...\273\244\350\241\214\345\217\202\346\225\260.md" | 2 +- .../Instruction/Command-line-parameters.md | 2 +- examples/export/quantize/awq.sh | 2 +- examples/export/quantize/gptq.sh | 2 +- swift/llm/argument/export_args.py | 2 +- swift/llm/model/register.py | 9 ++++++--- tests/export/quant.py | 13 ++++++++++++- 7 files changed, 23 insertions(+), 9 deletions(-) diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index 283912e13..bb867063f 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -365,7 +365,7 @@ RLHF参数继承于[训练参数](#训练参数) - 🔥output_dir: 导出结果存储路径,默认为None - 🔥quant_method: 可选为'gptq', 'awq',默认为None -- quant_n_samples: gptq/awq的校验集抽样数,默认为256 +- quant_n_samples: gptq/awq的校验集抽样数,默认为128 - max_length: 校准集的max_length, 默认值2048 - quant_batch_size: 量化batch_size,默认为1 - group_size: 量化group大小,默认为128 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index fc959e977..ef0236a8c 100644 --- a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -367,7 +367,7 @@ Export Arguments include the [basic arguments](#base-arguments) and [merge argum - 🔥output_dir: Path for storing export results, default is None. - 🔥quant_method: Options are 'gptq' and 'awq', default is None. -- quant_n_samples: Sampling size for the validation set in gptq/awq, default is 256. +- quant_n_samples: Sampling size for the validation set in gptq/awq, default is 128. - max_length: Max length for the calibration set, default value is 2048. - quant_batch_size: Quantization batch size, default is 1. - group_size: Group size for quantization, default is 128. 
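Read together, the export parameters documented above (with `quant_n_samples` lowered to 128 by this patch) correspond to an invocation along the lines of the sketch below. It mirrors the `examples/export/quantize` scripts modified just after this point; the quantized output directory is an illustrative assumption.

# AWQ quantization sketch using the documented defaults (output path is an assumption)
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --model Qwen/Qwen2.5-1.5B-Instruct \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \
    --quant_method awq \
    --quant_bits 4 \
    --quant_n_samples 128 \
    --quant_batch_size 1 \
    --max_length 2048 \
    --output_dir Qwen2.5-1.5B-Instruct-AWQ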
diff --git a/examples/export/quantize/awq.sh b/examples/export/quantize/awq.sh index 571b4717d..379ad270e 100644 --- a/examples/export/quantize/awq.sh +++ b/examples/export/quantize/awq.sh @@ -3,7 +3,7 @@ swift export \ --model Qwen/Qwen2.5-1.5B-Instruct \ --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \ AI-ModelScope/alpaca-gpt4-data-en#500 \ - --quant_n_samples 256 \ + --quant_n_samples 128 \ --quant_batch_size 1 \ --max_length 2048 \ --quant_method awq \ diff --git a/examples/export/quantize/gptq.sh b/examples/export/quantize/gptq.sh index 7e207f205..f53d251bd 100644 --- a/examples/export/quantize/gptq.sh +++ b/examples/export/quantize/gptq.sh @@ -5,7 +5,7 @@ swift export \ --model Qwen/Qwen2.5-1.5B-Instruct \ --dataset AI-ModelScope/alpaca-gpt4-data-zh#500 \ AI-ModelScope/alpaca-gpt4-data-en#500 \ - --quant_n_samples 256 \ + --quant_n_samples 128 \ --quant_batch_size 1 \ --max_length 2048 \ --quant_method gptq \ diff --git a/swift/llm/argument/export_args.py b/swift/llm/argument/export_args.py index f5e24ca49..7a7f39595 100644 --- a/swift/llm/argument/export_args.py +++ b/swift/llm/argument/export_args.py @@ -33,7 +33,7 @@ class ExportArguments(MergeArguments, BaseArguments): # awq/gptq quant_method: Literal['awq', 'gptq', 'bnb'] = None - quant_n_samples: int = 256 + quant_n_samples: int = 128 max_length: int = 2048 quant_batch_size: int = 1 group_size: int = 128 diff --git a/swift/llm/model/register.py b/swift/llm/model/register.py index 702f869fd..e81716c56 100644 --- a/swift/llm/model/register.py +++ b/swift/llm/model/register.py @@ -1,5 +1,6 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import os +import platform import re from copy import deepcopy from dataclasses import asdict, dataclass, field @@ -333,9 +334,11 @@ def get_model_name(model_id_or_path: str) -> Optional[str]: model_id_or_path = model_id_or_path.rstrip('/') match_ = re.search('/models--.+?--(.+?)/snapshots/', model_id_or_path) if match_ is not None: - model_name = match_.group(1) - else: - model_name = model_id_or_path.rsplit('/', 1)[-1] + return match_.group(1) + + model_name = model_id_or_path.rsplit('/', 1)[-1] + if platform.system().lower() == 'windows': + model_name = model_name.rsplit('\\', 1)[-1] # compat modelscope snapshot_download model_name = model_name.replace('___', '.') return model_name diff --git a/tests/export/quant.py b/tests/export/quant.py index 563244418..1c0b41dbd 100644 --- a/tests/export/quant.py +++ b/tests/export/quant.py @@ -24,6 +24,16 @@ def test_vlm_quant(quant_method: Literal['gptq', 'awq'] = 'awq'): quant_method=quant_method)) +def test_audio_quant(quant_method: Literal['gptq', 'awq'] = 'awq'): + from swift.llm import export_main, ExportArguments + export_main( + ExportArguments( + model='Qwen/Qwen2-Audio-7B-Instruct', + quant_bits=4, + dataset=['speech_asr/speech_asr_aishell1_trainsets:validation#1000'], + quant_method=quant_method)) + + def test_vlm_bnb_quant(): from swift.llm import export_main, ExportArguments, infer_main, InferArguments export_main(ExportArguments(model='Qwen/Qwen2-VL-7B-Instruct', quant_bits=4, quant_method='bnb')) @@ -34,4 +44,5 @@ def test_vlm_bnb_quant(): if __name__ == '__main__': # test_llm_quant('gptq') # test_vlm_quant('gptq') - test_vlm_bnb_quant() + test_audio_quant('gptq') + # test_vlm_bnb_quant() From f17ca92808697388b4b901e2d0bd0d5e53fa8414 Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Mon, 23 Dec 2024 18:41:13 +0800 Subject: [PATCH 03/13] support mm llamapro (#2738) --- 
swift/tuners/llamapro.py | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/swift/tuners/llamapro.py b/swift/tuners/llamapro.py index e20e7ab94..ab343d8bc 100644 --- a/swift/tuners/llamapro.py +++ b/swift/tuners/llamapro.py @@ -1,12 +1,12 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from copy import deepcopy -from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields from typing import Optional import torch from torch import nn -from swift.llm import MODEL_ARCH_MAPPING, ModelKeys +from swift.llm import MODEL_ARCH_MAPPING, HfConfigFactory, ModelKeys from swift.utils.logger import get_logger from .utils import ActivationMixin, SwiftAdapter, SwiftConfig, SwiftOutput @@ -46,11 +46,9 @@ class LLaMAPro(SwiftAdapter): @staticmethod def prepare_model(model: nn.Module, config: LLaMAProConfig, adapter_name: str) -> SwiftOutput: """Prepare a model with `LLaMAProConfig`""" - num_hidden_layers = None - if hasattr(model.config, 'num_hidden_layers'): - num_hidden_layers = model.config.num_hidden_layers - elif hasattr(model.config, 'num_layers'): - num_hidden_layers = model.config.num_layers + num_hidden_layers = HfConfigFactory.get_config_attr(model.config, 'num_hidden_layers') + if num_hidden_layers is None: + num_hidden_layers = HfConfigFactory.get_config_attr(model.config, 'num_layers') assert num_hidden_layers is not None, 'Cannot find num of layers config' assert num_hidden_layers % config.num_new_blocks == 0, f'Model layers {num_hidden_layers} ' \ @@ -60,8 +58,26 @@ def prepare_model(model: nn.Module, config: LLaMAProConfig, adapter_name: str) - num_stride = num_hidden_layers // config.num_groups - # We only support decoder only model for now. - module_list = LLaMAPro._find_module_list(config, model) + try: + module_list = LLaMAPro._find_module_list(config, model) + except AssertionError as e: + model_type = LLaMAPro.search_correct_model_type(model) + if model_type is None: + language_model_name = SwiftAdapter.get_model_key_mapping(config.model_type, config).language_model + if language_model_name: + if isinstance(language_model_name, str): + language_model_name = [language_model_name] + language_model = model.get_submodule(language_model_name[0]) + model_type = LLaMAPro.search_correct_model_type(language_model) + if model_type: + model = language_model + + if model_type: + config.model_type = model_type + module_list = LLaMAPro._find_module_list(config, model) + else: + raise e + new_module_list = nn.ModuleList() new_module_idx = [] for idx, module in enumerate(module_list): @@ -107,7 +123,10 @@ def _update_module_attr(config: LLaMAProConfig, module_list): if model_type in ('llama', 'mistral', 'qwen2', 'yi', 'gemma', 'deepseek', 'openbuddy', 'xverse', 'orion', 'bluelm', 'ziya', 'skywork', 'deepseek-v2', 'minicpm', 'phi3', 'internlm2'): for idx, module in enumerate(module_list): - getattr(module, attention).layer_idx = idx + try: + getattr(module, attention).layer_idx = idx + except AttributeError: + getattr(module, 'cross_attn').layer_idx = idx elif model_type in ('chatglm', 'glm4'): for idx, module in enumerate(module_list): getattr(module, attention).layer_number = idx @@ -135,6 +154,34 @@ def get_model_key_mapping(cls, model_type, config) -> ModelKeys: 'LLaMAPro only support models with o_proj and down_proj components.' 
return model_key_mapping + @classmethod + def search_correct_model_type(cls, module: nn.Module): + for arch_name, arch_type in MODEL_ARCH_MAPPING.items(): + arch_type: ModelKeys + if getattr(arch_type, 'module_list') is None: + # Need to be a LLM arch + continue + + matched = True + for f in fields(arch_type): + arch_str = getattr(arch_type, f.name) + if f.name == 'arch_name' or arch_str is None: + continue + + arch_str = arch_str.replace('{}', '0') + try: + sub_module = module.get_submodule(arch_str) + if sub_module is None: + matched = False + except AttributeError: + matched = False + + if not matched: + break + + if matched: + return arch_name + @staticmethod def _update_module_weight(config: LLaMAProConfig, module_list, new_module_idx): model_key_mapping = LLaMAPro.get_model_key_mapping(config.model_type, config) From c1f10f4c099913c7f3f35e5f4ec67e67b4b95f09 Mon Sep 17 00:00:00 2001 From: Jintao Date: Mon, 23 Dec 2024 19:35:54 +0800 Subject: [PATCH 04/13] support AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B (#2739) --- ...14\346\225\260\346\215\256\351\233\206.md" | 5 +++-- .../Supported-models-and-datasets.md | 5 +++-- swift/llm/model/constant.py | 4 +++- swift/llm/model/model/llm.py | 17 ++++++++++++++- swift/llm/template/constant.py | 4 +++- swift/llm/template/template/llm.py | 12 +++++++++++ swift/version.py | 2 +- tests/test_align/test_template/test_llm.py | 21 ++++++++++++++++++- 8 files changed, 61 insertions(+), 9 deletions(-) diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index 89061f4cb..0ae7bd93f 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -418,6 +418,9 @@ |[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| |[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| +|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)| +|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| +|[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)| |[IEITYuan/Yuan2.0-2B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-2B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| |[IEITYuan/Yuan2.0-51B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-51B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| 
|[IEITYuan/Yuan2.0-102B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-102B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| @@ -451,8 +454,6 @@ |[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| |[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| -|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|-| -|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 94bfb0e38..56397f528 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -418,6 +418,9 @@ The table below introduces the models integrated with ms-swift: |[LLM-Research/gemma-2-9b-it](https://modelscope.cn/models/LLM-Research/gemma-2-9b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-9b-it](https://huggingface.co/google/gemma-2-9b-it)| |[LLM-Research/gemma-2-27b](https://modelscope.cn/models/LLM-Research/gemma-2-27b)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)| |[LLM-Research/gemma-2-27b-it](https://modelscope.cn/models/LLM-Research/gemma-2-27b-it)|gemma2|gemma|transformers>=4.42|-|[google/gemma-2-27b-it](https://huggingface.co/google/gemma-2-27b-it)| +|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|[skywork/Skywork-13B-base](https://huggingface.co/skywork/Skywork-13B-base)| +|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| +|[AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B](https://modelscope.cn/models/AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B)|skywork_o1|skywork_o1|transformers>=4.43|-|[Skywork/Skywork-o1-Open-Llama-3.1-8B](https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B)| |[IEITYuan/Yuan2.0-2B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-2B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-2B-hf](https://huggingface.co/IEITYuan/Yuan2-2B-hf)| |[IEITYuan/Yuan2.0-51B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-51B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-51B-hf](https://huggingface.co/IEITYuan/Yuan2-51B-hf)| |[IEITYuan/Yuan2.0-102B-hf](https://modelscope.cn/models/IEITYuan/Yuan2.0-102B-hf)|yuan2|yuan|-|-|[IEITYuan/Yuan2-102B-hf](https://huggingface.co/IEITYuan/Yuan2-102B-hf)| @@ -451,8 +454,6 @@ The table below introduces the models integrated with ms-swift: 
|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-1.4b-hf](https://huggingface.co/state-spaces/mamba-1.4b-hf)| |[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf)|mamba|default|transformers>=4.39.0|-|[state-spaces/mamba-2.8b-hf](https://huggingface.co/state-spaces/mamba-2.8b-hf)| |[damo/nlp_polylm_13b_text_generation](https://modelscope.cn/models/damo/nlp_polylm_13b_text_generation)|polylm|default|-|-|[DAMO-NLP-MT/polylm-13b](https://huggingface.co/DAMO-NLP-MT/polylm-13b)| -|[skywork/Skywork-13B-base](https://modelscope.cn/models/skywork/Skywork-13B-base)|skywork|skywork|-|-|-| -|[skywork/Skywork-13B-chat](https://modelscope.cn/models/skywork/Skywork-13B-chat)|skywork|skywork|-|-|-| |[AI-ModelScope/aya-expanse-8b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-8b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-8b](https://huggingface.co/CohereForAI/aya-expanse-8b)| |[AI-ModelScope/aya-expanse-32b](https://modelscope.cn/models/AI-ModelScope/aya-expanse-32b)|aya|aya|transformers>=4.44.0|-|[CohereForAI/aya-expanse-32b](https://huggingface.co/CohereForAI/aya-expanse-32b)| diff --git a/swift/llm/model/constant.py b/swift/llm/model/constant.py index 173f4fce9..4ac9de3d7 100644 --- a/swift/llm/model/constant.py +++ b/swift/llm/model/constant.py @@ -78,6 +78,9 @@ class LLMModelType: gemma = 'gemma' gemma2 = 'gemma2' + skywork = 'skywork' + skywork_o1 = 'skywork_o1' + yuan2 = 'yuan2' orion = 'orion' xverse = 'xverse' @@ -89,7 +92,6 @@ class LLMModelType: grok = 'grok' mamba = 'mamba' polylm = 'polylm' - skywork = 'skywork' aya = 'aya' diff --git a/swift/llm/model/model/llm.py b/swift/llm/model/model/llm.py index e46189b38..dfc4639aa 100644 --- a/swift/llm/model/model/llm.py +++ b/swift/llm/model/model/llm.py @@ -80,7 +80,7 @@ def get_skywork_model_tokenizer(model_dir: str, LLMModelType.skywork, [ ModelGroup([ - Model('skywork/Skywork-13B-base'), + Model('skywork/Skywork-13B-base', 'skywork/Skywork-13B-base'), Model('skywork/Skywork-13B-chat'), ]), ], @@ -90,6 +90,21 @@ def get_skywork_model_tokenizer(model_dir: str, model_arch=ModelArch.llama, )) +register_model( + ModelMeta( + LLMModelType.skywork_o1, + [ + ModelGroup([ + Model('AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B', 'Skywork/Skywork-o1-Open-Llama-3.1-8B'), + ]), + ], + TemplateType.skywork_o1, + get_model_tokenizer_with_flash_attn, + architectures=['LlamaForCausalLM'], + requires=['transformers>=4.43'], + model_arch=ModelArch.llama, + )) + def get_model_tokenizer_yuan(model_dir: str, model_info: ModelInfo, diff --git a/swift/llm/template/constant.py b/swift/llm/template/constant.py index 9046bc9c0..963ad75de 100644 --- a/swift/llm/template/constant.py +++ b/swift/llm/template/constant.py @@ -50,6 +50,9 @@ class LLMTemplateType: codefuse = 'codefuse' codefuse_codellama = 'codefuse_codellama' + skywork = 'skywork' + skywork_o1 = 'skywork_o1' + mistral_nemo = 'mistral_nemo' zephyr = 'zephyr' wizardlm2 = 'wizardlm2' @@ -59,7 +62,6 @@ class LLMTemplateType: yuan = 'yuan' xverse = 'xverse' - skywork = 'skywork' bluelm = 'bluelm' orion = 'orion' diff --git a/swift/llm/template/template/llm.py b/swift/llm/template/template/llm.py index b0376a19f..0cd125d8a 100644 --- a/swift/llm/template/template/llm.py +++ b/swift/llm/template/template/llm.py @@ -5,6 +5,7 @@ from ..constant import LLMTemplateType from ..register import TemplateMeta, register_template from ..utils import Prompt +from .llama import 
Llama3_2TemplateMeta from .utils import DEFAULT_SYSTEM, ChatmlTemplateMeta register_template( @@ -70,6 +71,17 @@ chat_sep=None, suffix=['[SEP]'])) +register_template( + Llama3_2TemplateMeta( + LLMTemplateType.skywork_o1, + default_system=( + 'You are Skywork-o1, a thinking model developed by Skywork AI, specializing in solving complex problems ' + "involving mathematics, coding, and logical reasoning through deep thought. When faced with a user's " + 'request, you first engage in a lengthy and in-depth thinking process to explore possible solutions to ' + 'the problem. After completing your thoughts, you then provide a detailed explanation of the solution ' + 'process in your response.'), + )) + register_template( TemplateMeta( LLMTemplateType.bluelm, diff --git a/swift/version.py b/swift/version.py index 9a09a9153..2691e62f9 100644 --- a/swift/version.py +++ b/swift/version.py @@ -1,5 +1,5 @@ # Make sure to modify __release_datetime__ to release time when making official release. -__version__ = '3.0.0.dev0' +__version__ = '3.1.0.dev0' # default release datetime for branches under active development is set # to be a time far-far-away-into-the-future __release_datetime__ = '2099-10-13 08:56:12' diff --git a/tests/test_align/test_template/test_llm.py b/tests/test_align/test_template/test_llm.py index c43c67365..b1f8272be 100644 --- a/tests/test_align/test_template/test_llm.py +++ b/tests/test_align/test_template/test_llm.py @@ -140,6 +140,24 @@ def test_megrez(): assert res == res2, f'res: {res}, res2: {res2}' +def test_skywork_o1(): + pt_engine = PtEngine('AI-ModelScope/Skywork-o1-Open-Llama-3.1-8B') + res = _infer_model( + pt_engine, + messages=[{ + 'role': + 'user', + 'content': + ('Jane has 12 apples. She gives 4 apples to her friend Mark, then buys 1 more apple, and finally splits ' + 'all her apples equally among herself and her 2 siblings. How many apples does each person get?') + }]) + assert res == ("To solve the problem, let's break it down into a series of logical steps:\n\n1. **Initial Number " + 'of Apples**: Jane starts with 12 apples.\n2. **Apples Given Away**: Jane gives 4 apples to her ' + 'friend Mark. So, the number of apples she has now is:\n \\[\n 12 - 4 = 8\n \\]\n3. **Apples ' + 'Bought**: Jane then buys 1 more apple. So, the number of apples she has now is:\n \\[\n ' + '8 + 1 = 9\n \\]\n4. 
**Apples Split Equally') + + if __name__ == '__main__': from swift.llm import PtEngine, RequestConfig, get_template, get_model_tokenizer, VllmEngine from swift.utils import get_logger, seed_everything @@ -158,4 +176,5 @@ def test_megrez(): # test_glm_edge() # test_llama() # test_openbuddy() - test_megrez() + # test_megrez() + test_skywork_o1() From f913bca710852b25060d798d588c1609d0af96c2 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 24 Dec 2024 10:29:52 +0800 Subject: [PATCH 05/13] Fix windows encoding gbk (#2741) --- scripts/benchmark/exp_utils.py | 8 ++++---- scripts/benchmark/generate_report.py | 2 +- scripts/utils/run_model_info.py | 4 ++-- swift/hub/hub.py | 2 +- swift/llm/argument/base_args/model_args.py | 2 +- swift/llm/dataset/register.py | 2 +- swift/llm/export/merge_lora.py | 2 +- swift/llm/export/ollama.py | 2 +- swift/plugin/loss_scale.py | 2 +- swift/tuners/base.py | 21 +++++++++++---------- swift/tuners/peft.py | 5 +++-- swift/tuners/utils.py | 4 ++-- swift/ui/base.py | 4 ++-- swift/ui/llm_infer/llm_infer.py | 2 +- swift/ui/llm_infer/runtime.py | 4 ++-- swift/ui/llm_train/runtime.py | 6 +++--- 16 files changed, 37 insertions(+), 35 deletions(-) diff --git a/scripts/benchmark/exp_utils.py b/scripts/benchmark/exp_utils.py index 8caa020ac..5e1e9f8ba 100644 --- a/scripts/benchmark/exp_utils.py +++ b/scripts/benchmark/exp_utils.py @@ -106,7 +106,7 @@ def assert_gpu_not_overlap(self): def run(self, exp: Experiment): if os.path.exists(os.path.join(exp.input_args.save_dir, exp.name + '.json')): - with open(os.path.join(exp.input_args.save_dir, exp.name + '.json'), 'r') as f: + with open(os.path.join(exp.input_args.save_dir, exp.name + '.json'), 'r', encoding='utf-8') as f: _json = json.load(f) if exp.eval_dataset and 'eval_result' not in _json['record']: if not exp.do_eval: @@ -238,7 +238,7 @@ def _find_free_gpu(self, n): def prepare_experiments(self, args: Any): experiments = [] for config_file in args.config: - with open(config_file, 'r') as f: + with open(config_file, 'r', encoding='utf-8') as f: group = os.path.basename(config_file) group = group[:-5] content = json.load(f) @@ -275,7 +275,7 @@ def prepare_experiments(self, args: Any): def _get_metric(exp: Experiment): if exp.do_eval: if os.path.isfile(os.path.join('exp', f'{exp.name}.eval.log')): - with open(os.path.join('exp', f'{exp.name}.eval.log'), 'r') as f: + with open(os.path.join('exp', f'{exp.name}.eval.log'), 'r', encoding='utf-8') as f: for line in f.readlines(): if 'Final report:' in line: return json.loads(line.split('Final report:')[1].replace('\'', '"')) @@ -301,7 +301,7 @@ def _get_metric(exp: Experiment): logging_dir = exp.runtime.get('logging_dir') logging_file = os.path.join(logging_dir, '..', 'logging.jsonl') if os.path.isfile(logging_file): - with open(logging_file, 'r') as f: + with open(logging_file, 'r', encoding='utf-8') as f: for line in f.readlines(): if 'model_info' in line: return json.loads(line) diff --git a/scripts/benchmark/generate_report.py b/scripts/benchmark/generate_report.py index 28332f70d..a02037360 100644 --- a/scripts/benchmark/generate_report.py +++ b/scripts/benchmark/generate_report.py @@ -284,7 +284,7 @@ def generate_export_report(outputs: List[ModelOutput]): def parse_output(file): - with open(file, 'r') as f: + with open(file, 'r', encoding='utf-8') as f: content = json.load(f) name = content['name'] diff --git a/scripts/utils/run_model_info.py b/scripts/utils/run_model_info.py index 8095ca7e9..11c07698e 100644 --- a/scripts/utils/run_model_info.py +++ 
b/scripts/utils/run_model_info.py @@ -54,7 +54,7 @@ def get_model_info_table(): result[i] += text[i] for i, fpath in enumerate(fpaths): - with open(fpath, 'r') as f: + with open(fpath, 'r', encoding='utf-8') as f: text = f.read() llm_start_idx = text.find('| Model ID |') mllm_start_idx = text[llm_start_idx + 1:].find('| Model ID |') + llm_start_idx + 1 @@ -62,7 +62,7 @@ def get_model_info_table(): mllm_end_idx = text.find(end_words[i][1]) output = text[:llm_start_idx] + result[0] + '\n\n' + text[llm_end_idx:mllm_start_idx] + result[ 1] + '\n\n' + text[mllm_end_idx:] - with open(fpath, 'w') as f: + with open(fpath, 'w', encoding='utf-8') as f: f.write(output) diff --git a/swift/hub/hub.py b/swift/hub/hub.py index e684ba1fd..704cfa39e 100644 --- a/swift/hub/hub.py +++ b/swift/hub/hub.py @@ -256,7 +256,7 @@ def push_to_hub(cls, if commit_description: commit_message = commit_message + '\n' + commit_description if not os.path.exists(os.path.join(folder_path, 'configuration.json')): - with open(os.path.join(folder_path, 'configuration.json'), 'w') as f: + with open(os.path.join(folder_path, 'configuration.json'), 'w', encoding='utf-8') as f: f.write('{"framework": "pytorch", "task": "text-generation", "allow_remote": true}') if ignore_patterns: ignore_patterns = [p for p in ignore_patterns if p != '_*'] diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py index c397238d9..bf61e07c4 100644 --- a/swift/llm/argument/base_args/model_args.py +++ b/swift/llm/argument/base_args/model_args.py @@ -52,7 +52,7 @@ def parse_to_dict(value: Union[str, Dict, None], strict: bool = True) -> Union[s value = {} elif isinstance(value, str): if os.path.exists(value): # local path - with open(value, 'r') as f: + with open(value, 'r', encoding='utf-8') as f: value = json.load(f) else: # json str try: diff --git a/swift/llm/dataset/register.py b/swift/llm/dataset/register.py index 62de5fd27..c7e4700f7 100644 --- a/swift/llm/dataset/register.py +++ b/swift/llm/dataset/register.py @@ -157,7 +157,7 @@ def register_dataset_info(dataset_info: Union[str, List[str], None] = None) -> L if os.path.isfile(dataset_path): log_msg = dataset_path base_dir = os.path.dirname(dataset_path) - with open(dataset_path, 'r') as f: + with open(dataset_path, 'r', encoding='utf-8') as f: dataset_info = json.load(f) else: dataset_info = json.loads(dataset_info) # json diff --git a/swift/llm/export/merge_lora.py b/swift/llm/export/merge_lora.py index 266397dfc..a00ef26f6 100644 --- a/swift/llm/export/merge_lora.py +++ b/swift/llm/export/merge_lora.py @@ -60,7 +60,7 @@ def merge_lora(args: ExportArguments, device_map=None, replace_if_exists=False) '{base_model}', base_model) try: yamlfile = os.path.join(tempdir, 'mergekit.yaml') - with open(yamlfile, 'w') as f: + with open(yamlfile, 'w', encoding='utf-8') as f: f.write(merge_yaml) logger.info(f'Merging with config: {merge_yaml}') os.system(f'mergekit-yaml {yamlfile} {mergekit_path}') diff --git a/swift/llm/export/ollama.py b/swift/llm/export/ollama.py index c1e4d44ea..90e3fbcab 100644 --- a/swift/llm/export/ollama.py +++ b/swift/llm/export/ollama.py @@ -36,7 +36,7 @@ def export_to_ollama(args: ExportArguments): pt_engine = PtEngine.from_model_template(model, template) logger.info(f'Using model_dir: {pt_engine.model_dir}') template_meta = template.template_meta - with open(os.path.join(args.output_dir, 'Modelfile'), 'w') as f: + with open(os.path.join(args.output_dir, 'Modelfile'), 'w', encoding='utf-8') as f: f.write(f'FROM {pt_engine.model_dir}\n') 
f.write(f'TEMPLATE """{{{{ if .System }}}}' f'{replace_and_concat(template, template_meta.system_prefix, "{{SYSTEM}}", "{{ .System }}")}' diff --git a/swift/plugin/loss_scale.py b/swift/plugin/loss_scale.py index f5e9f5b92..275d2e0e4 100644 --- a/swift/plugin/loss_scale.py +++ b/swift/plugin/loss_scale.py @@ -69,7 +69,7 @@ def __init__(self): if self.loss_scale_config is not None: path = os.path.dirname(os.path.abspath(__file__)) config_path = os.path.join(path, 'agent', self.loss_scale_config) - with open(config_path, 'r') as json_file: + with open(config_path, 'r', encoding='utf-8') as json_file: self.loss_scale_map = json.load(json_file) else: self.loss_scale_map = None diff --git a/swift/tuners/base.py b/swift/tuners/base.py index dc9d47747..654e7c5be 100644 --- a/swift/tuners/base.py +++ b/swift/tuners/base.py @@ -322,7 +322,7 @@ def from_pretrained(cls, raise ValueError(f'Please pass in a local dir or a model id, not a local file: {model_dir}') extra_state_keys = kwargs.pop('extra_state_keys', None) if extra_state_keys is None and os.path.isfile(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME)): - with open(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME), 'r') as file: + with open(os.path.join(model_dir, cls.EXTRA_STATE_DIR, CONFIG_NAME), 'r', encoding='utf-8') as file: _json = json.load(file) extra_state_keys = _json.get('extra_state_keys') if adapter_name is None: @@ -340,7 +340,7 @@ def from_pretrained(cls, logger.warning(f'{_name} is not a valid tuner') continue - with open(config_file, 'r') as file: + with open(config_file, 'r', encoding='utf-8') as file: json_object = json.load(file) if SWIFT_TYPE_KEY not in json_object: @@ -395,7 +395,7 @@ def create_or_update_model_card(self, output_dir: str): if not os.path.exists(os.path.join(output_dir, 'README.md')): lines = [] else: - with open(os.path.join(output_dir, 'README.md'), 'r') as f: + with open(os.path.join(output_dir, 'README.md'), 'r', encoding='utf-8') as f: lines = f.readlines() quantization_config = None @@ -426,7 +426,7 @@ def create_or_update_model_card(self, output_dir: str): lines.append(f'{base_model_heading}\n\n- BaseModel Class {self.base_model.__class__.__name__}\n') # write the lines back to README.md - with open(os.path.join(output_dir, 'README.md'), 'w') as f: + with open(os.path.join(output_dir, 'README.md'), 'w', encoding='utf-8') as f: f.writelines(lines) def add_weighted_adapter( @@ -587,13 +587,14 @@ def save_pretrained(self, os.makedirs(os.path.join(save_directory, self.EXTRA_STATE_DIR), exist_ok=True) self._save_state_dict(output_state_dict, os.path.join(save_directory, self.EXTRA_STATE_DIR), safe_serialization) - with open(os.path.join(save_directory, self.EXTRA_STATE_DIR, CONFIG_NAME), 'w') as file: + with open( + os.path.join(save_directory, self.EXTRA_STATE_DIR, CONFIG_NAME), 'w', encoding='utf-8') as file: json.dump({'extra_state_keys': self.extra_state_keys}, file) else: logger.error('Full parameter training, save_extra_states will be ignored') if not os.path.exists(os.path.join(save_directory, 'configuration.json')): - with open(os.path.join(save_directory, 'configuration.json'), 'w') as f: + with open(os.path.join(save_directory, 'configuration.json'), 'w', encoding='utf-8') as f: f.write('{}') @staticmethod @@ -776,7 +777,7 @@ def has_custom_content(_json): return not LoRAConfig(**_json).can_be_saved_to_peft() for adapter in adapter_names: - with open(os.path.join(ckpt_dir, adapter, CONFIG_NAME)) as f: + with open(os.path.join(ckpt_dir, adapter, CONFIG_NAME), encoding='utf-8') 
as f: _json = json.load(f) if has_custom_content(_json): raise AssertionError('Cannot transfer to peft format, ' @@ -802,7 +803,7 @@ def has_custom_content(_json): state_dict = new_state_dict SwiftModel._save_state_dict(state_dict, os.path.join(output_dir, adapter), safe_serialization) from swift import LoRAConfig - with open(os.path.join(output_dir, adapter, CONFIG_NAME)) as f: + with open(os.path.join(output_dir, adapter, CONFIG_NAME), encoding='utf-8') as f: _json = json.load(f) peft_config = LoRAConfig(**_json).to_peft_config() peft_config.save_pretrained(os.path.join(output_dir, adapter)) @@ -836,7 +837,7 @@ def from_pretrained(model: Union[nn.Module, SwiftModel, PeftModel], model_id = snapshot_download(model_id, revision=revision) is_peft_model = False if os.path.exists(os.path.join(model_id, CONFIG_NAME)): - with open(os.path.join(model_id, CONFIG_NAME), 'r') as f: + with open(os.path.join(model_id, CONFIG_NAME), 'r', encoding='utf-8') as f: _json = json.load(f) is_peft_model = SWIFT_TYPE_KEY not in _json @@ -845,7 +846,7 @@ def from_pretrained(model: Union[nn.Module, SwiftModel, PeftModel], if isinstance(adapter_name, list) else list(adapter_name.keys())[0] _name = _name or '' if os.path.exists(os.path.join(model_id, _name, CONFIG_NAME)): - with open(os.path.join(model_id, _name, CONFIG_NAME), 'r') as f: + with open(os.path.join(model_id, _name, CONFIG_NAME), 'r', encoding='utf-8') as f: _json = json.load(f) is_peft_model = SWIFT_TYPE_KEY not in _json and 'extra_state_keys' not in _json if is_peft_model: diff --git a/swift/tuners/peft.py b/swift/tuners/peft.py index a01b28d87..c8cdd0708 100644 --- a/swift/tuners/peft.py +++ b/swift/tuners/peft.py @@ -61,7 +61,7 @@ def save_pretrained(self, save_directory: str, **kwargs) -> None: 'lorap_lr_ratio': self.lorap_lr_ratio, 'lorap_emb_lr': self.lorap_emb_lr, } - with open(os.path.join(save_directory, 'additional_config.json'), 'w') as f: + with open(os.path.join(save_directory, 'additional_config.json'), 'w', encoding='utf-8') as f: json.dump(additional_args, f) @classmethod @@ -75,7 +75,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional self = LoraConfig(**self.to_dict()) if os.path.isfile(os.path.join(pretrained_model_name_or_path, 'additional_config.json')): - with open(os.path.join(pretrained_model_name_or_path, 'additional_config.json'), 'r') as f: + with open( + os.path.join(pretrained_model_name_or_path, 'additional_config.json'), 'r', encoding='utf-8') as f: _json = json.load(f) for key, value in _json.items(): setattr(self, key, value) diff --git a/swift/tuners/utils.py b/swift/tuners/utils.py index 72d5904f2..bb23221bc 100644 --- a/swift/tuners/utils.py +++ b/swift/tuners/utils.py @@ -59,7 +59,7 @@ def save_pretrained(self, save_directory, **kwargs): output_path = os.path.join(save_directory, CONFIG_NAME) # save it - with open(output_path, 'w') as writer: + with open(output_path, 'w', encoding='utf-8') as writer: writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) @classmethod @@ -103,7 +103,7 @@ def from_json_file(cls, path_json_file, **kwargs): path_json_file (`str`): The path to the json file. 
""" - with open(path_json_file, 'r') as file: + with open(path_json_file, 'r', encoding='utf-8') as file: json_object = json.load(file) return json_object diff --git a/swift/ui/base.py b/swift/ui/base.py index 5ee66c01a..ed6c389ac 100644 --- a/swift/ui/base.py +++ b/swift/ui/base.py @@ -138,7 +138,7 @@ def save_cache(cls, key, value): timestamp = str(int(time.time())) key = key.replace('/', '-') filename = os.path.join(cls.cache_dir, key + '-' + timestamp) - with open(filename, 'w') as f: + with open(filename, 'w', encoding='utf-8') as f: json.dump(value, f) @classmethod @@ -161,7 +161,7 @@ def load_cache(cls, key, timestamp) -> BaseArguments: timestamp = int(dt_object.timestamp()) key = key.replace('/', '-') filename = key + '-' + str(timestamp) - with open(os.path.join(cls.cache_dir, filename), 'r') as f: + with open(os.path.join(cls.cache_dir, filename), 'r', encoding='utf-8') as f: return json.load(f) @classmethod diff --git a/swift/ui/llm_infer/llm_infer.py b/swift/ui/llm_infer/llm_infer.py index 07dbd9a43..65ac9f202 100644 --- a/swift/ui/llm_infer/llm_infer.py +++ b/swift/ui/llm_infer/llm_infer.py @@ -230,7 +230,7 @@ def deploy(cls, *args): model = kwargs.get('model') if os.path.exists(model) and os.path.exists(os.path.join(model, 'args.json')): kwargs['ckpt_dir'] = kwargs.pop('model') - with open(os.path.join(kwargs['ckpt_dir'], 'args.json'), 'r') as f: + with open(os.path.join(kwargs['ckpt_dir'], 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) kwargs['model_type'] = _json['model_type'] kwargs['train_type'] = _json['train_type'] diff --git a/swift/ui/llm_infer/runtime.py b/swift/ui/llm_infer/runtime.py index 6dbb37812..6b086cf2b 100644 --- a/swift/ui/llm_infer/runtime.py +++ b/swift/ui/llm_infer/runtime.py @@ -131,7 +131,7 @@ def wait(cls, task): latest_data = '' lines = collections.deque(maxlen=int(os.environ.get('MAX_LOG_LINES', 50))) try: - with open(log_file, 'r') as input: + with open(log_file, 'r', encoding='utf-8') as input: input.seek(offset) fail_cnt = 0 while True: @@ -268,7 +268,7 @@ def task_changed(cls, task, base_tab): ret.append(gr.update()) train_type = None if is_custom_path: - with open(os.path.join(all_args['ckpt_dir'], 'args.json'), 'r') as f: + with open(os.path.join(all_args['ckpt_dir'], 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) train_type = _json.get('train_type') return ret + [gr.update(value=None), [all_args.get('model_type'), all_args.get('template_type'), train_type]] diff --git a/swift/ui/llm_train/runtime.py b/swift/ui/llm_train/runtime.py index 218d0d6f2..a8a02c198 100644 --- a/swift/ui/llm_train/runtime.py +++ b/swift/ui/llm_train/runtime.py @@ -319,7 +319,7 @@ def wait(cls, logging_dir, task): latest_data = '' lines = collections.deque(maxlen=int(os.environ.get('MAX_LOG_LINES', 50))) try: - with open(log_file, 'r') as input: + with open(log_file, 'r', encoding='utf-8') as input: input.seek(offset) fail_cnt = 0 while True: @@ -451,8 +451,8 @@ def parse_info_from_cmdline(task): all_args[splits[0]] = splits[1] output_dir = all_args['output_dir'] - if os.path.exists(os.path.join(output_dir, 'sft_args.json')): - with open(os.path.join(output_dir, 'sft_args.json'), 'r') as f: + if os.path.exists(os.path.join(output_dir, 'args.json')): + with open(os.path.join(output_dir, 'args.json'), 'r', encoding='utf-8') as f: _json = json.load(f) for key in all_args.keys(): all_args[key] = _json.get(key) From 00c2eaa97c6891684f34a00b556ca47d20103ab8 Mon Sep 17 00:00:00 2001 From: Jintao Date: Tue, 24 Dec 2024 10:52:01 +0800 Subject: 
[PATCH 06/13] fix docs multimodal; fix pretrain mllm (#2742) --- ...11\346\225\260\346\215\256\351\233\206.md" | 20 +++++++++++++---- .../source_en/Customization/Custom-dataset.md | 22 +++++++++++++++---- swift/llm/template/base.py | 1 + swift/llm/template/template/gemma.py | 2 +- tests/test_align/test_template/test_vision.py | 7 +++--- 5 files changed, 39 insertions(+), 13 deletions(-) diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index da454cc19..aaa018144 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -34,7 +34,7 @@ query-response格式: ## 推荐数据集格式 -以下给出ms-swift的推荐数据集格式: +以下给出ms-swift的推荐数据集格式,其中system字段是可选的,默认使用template中定义的`default_system`。 ### 预训练 @@ -69,11 +69,23 @@ query-response格式: ### 多模态 -对于多模态数据集,和上述任务的格式相同。区别在于增加了`images`, `videos`, `audios`几个key,分别代表多模态资源: +对于多模态数据集,和上述任务的格式相同。区别在于增加了`images`, `videos`, `audios`几个key,分别代表多模态资源,`` `