From a27182b7fc57d07adc853689978649b2f6b1cf5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yangshen=E2=9A=A1Deng?= Date: Fri, 16 Aug 2024 19:41:05 +0800 Subject: [PATCH] Fix AutoConfig and AutoModel support for Llava-Next-Video (#32844) * Fix: fix all model_type of Llava-Next-Video to llava_next_video * Fix doc for llava_next_video * * Fix formatting issues * Change llava-next-video.md file name into llava_next_video.md to make it compatible with implementation * Fix docs TOC for llava-next-video --- docs/source/en/_toctree.yml | 2 +- docs/source/en/index.md | 2 +- .../en/model_doc/{llava-next-video.md => llava_next_video.md} | 0 src/transformers/models/auto/configuration_auto.py | 4 ++-- src/transformers/models/auto/image_processing_auto.py | 2 +- src/transformers/models/auto/modeling_auto.py | 4 ++-- src/transformers/models/auto/processing_auto.py | 2 +- src/transformers/models/auto/tokenization_auto.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) rename docs/source/en/model_doc/{llava-next-video.md => llava_next_video.md} (100%) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 9f89c8669dc1..2160e5c73373 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -822,7 +822,7 @@ title: Llava - local: model_doc/llava_next title: LLaVA-NeXT - - local: model_doc/llava-next-video + - local: model_doc/llava_next_video title: LLaVa-NeXT-Video - local: model_doc/lxmert title: LXMERT diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 2fe725de7cd9..a65275512309 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -186,7 +186,7 @@ Flax), PyTorch, and/or TensorFlow. | [Llama3](model_doc/llama3) | ✅ | ❌ | ✅ | | [LLaVa](model_doc/llava) | ✅ | ❌ | ❌ | | [LLaVA-NeXT](model_doc/llava_next) | ✅ | ❌ | ❌ | -| [LLaVa-NeXT-Video](model_doc/llava-next-video) | ✅ | ❌ | ❌ | +| [LLaVa-NeXT-Video](model_doc/llava_next_video) | ✅ | ❌ | ❌ | | [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ | | [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ | | [LUKE](model_doc/luke) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava_next_video.md similarity index 100% rename from docs/source/en/model_doc/llava-next-video.md rename to docs/source/en/model_doc/llava_next_video.md diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 2c4f815d7d79..34d2ab0d65f2 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -145,8 +145,8 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), - ("llava-next-video", "LlavaNextVideoConfig"), ("llava_next", "LlavaNextConfig"), + ("llava_next_video", "LlavaNextVideoConfig"), ("longformer", "LongformerConfig"), ("longt5", "LongT5Config"), ("luke", "LukeConfig"), @@ -436,8 +436,8 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), - ("llava-next-video", "LLaVa-NeXT-Video"), ("llava_next", "LLaVA-NeXT"), + ("llava_next_video", "LLaVa-NeXT-Video"), ("longformer", "Longformer"), ("longt5", "LongT5"), ("luke", "LUKE"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 8bfc61b9bea3..d072a1b3deb0 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -97,8 +97,8 @@ ("layoutlmv3", ("LayoutLMv3ImageProcessor",)), ("levit", ("LevitImageProcessor",)), ("llava", ("CLIPImageProcessor",)), - ("llava-next-video", ("LlavaNextVideoImageProcessor",)), ("llava_next", ("LlavaNextImageProcessor",)), + ("llava_next_video", ("LlavaNextVideoImageProcessor",)), ("mask2former", ("Mask2FormerImageProcessor",)), ("maskformer", ("MaskFormerImageProcessor",)), ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2b49c295975d..5643246ec4b5 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -308,8 +308,8 @@ ("idefics2", "Idefics2ForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), - ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), ("lxmert", "LxmertForPreTraining"), @@ -721,8 +721,8 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("llava-next-video", "LlavaNextVideoForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), ("video_llava", "VideoLlavaForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7877343d5318..1c41b80abe33 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -71,8 +71,8 @@ ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), - ("llava-next-video", "LlavaNextVideoProcessor"), ("llava_next", "LlavaNextProcessor"), + ("llava_next_video", "LlavaNextVideoProcessor"), ("markuplm", "MarkupLMProcessor"), ("mctct", "MCTCTProcessor"), ("mgp-str", "MgpstrProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index a0f4e4f449f4..b094f50b5e97 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -257,8 +257,8 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), ( "longt5",