From a27182b7fc57d07adc853689978649b2f6b1cf5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yangshen=E2=9A=A1Deng?= <yangshen.d@outlook.com>
Date: Fri, 16 Aug 2024 19:41:05 +0800
Subject: [PATCH] Fix AutoConfig and AutoModel support for Llava-Next-Video
 (#32844)

* Fix: change every Llava-Next-Video model_type key from llava-next-video to llava_next_video

* Fix doc for llava_next_video

* Fix formatting issues
* Rename llava-next-video.md to llava_next_video.md so the doc file name matches the implementation

* Fix docs TOC for llava-next-video
---
 docs/source/en/_toctree.yml                                   | 2 +-
 docs/source/en/index.md                                       | 2 +-
 .../en/model_doc/{llava-next-video.md => llava_next_video.md} | 0
 src/transformers/models/auto/configuration_auto.py            | 4 ++--
 src/transformers/models/auto/image_processing_auto.py         | 2 +-
 src/transformers/models/auto/modeling_auto.py                 | 4 ++--
 src/transformers/models/auto/processing_auto.py               | 2 +-
 src/transformers/models/auto/tokenization_auto.py             | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)
 rename docs/source/en/model_doc/{llava-next-video.md => llava_next_video.md} (100%)

diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 9f89c8669dc1..2160e5c73373 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -822,7 +822,7 @@
         title: Llava
       - local: model_doc/llava_next
         title: LLaVA-NeXT
-      - local: model_doc/llava-next-video
+      - local: model_doc/llava_next_video
         title: LLaVa-NeXT-Video
       - local: model_doc/lxmert
         title: LXMERT
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 2fe725de7cd9..a65275512309 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -186,7 +186,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                        [Llama3](model_doc/llama3)                        |       ✅        |         ❌         |      ✅      |
 |                         [LLaVa](model_doc/llava)                         |       ✅        |         ❌         |      ❌      |
 |                    [LLaVA-NeXT](model_doc/llava_next)                    |       ✅        |         ❌         |      ❌      |
-|              [LLaVa-NeXT-Video](model_doc/llava-next-video)              |       ✅        |         ❌         |      ❌      |
+|              [LLaVa-NeXT-Video](model_doc/llava_next_video)              |       ✅        |         ❌         |      ❌      |
 |                    [Longformer](model_doc/longformer)                    |       ✅        |         ✅         |      ❌      |
 |                        [LongT5](model_doc/longt5)                        |       ✅        |         ❌         |      ✅      |
 |                          [LUKE](model_doc/luke)                          |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava_next_video.md
similarity index 100%
rename from docs/source/en/model_doc/llava-next-video.md
rename to docs/source/en/model_doc/llava_next_video.md
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 2c4f815d7d79..34d2ab0d65f2 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -145,8 +145,8 @@
         ("lilt", "LiltConfig"),
         ("llama", "LlamaConfig"),
         ("llava", "LlavaConfig"),
-        ("llava-next-video", "LlavaNextVideoConfig"),
         ("llava_next", "LlavaNextConfig"),
+        ("llava_next_video", "LlavaNextVideoConfig"),
         ("longformer", "LongformerConfig"),
         ("longt5", "LongT5Config"),
         ("luke", "LukeConfig"),
@@ -436,8 +436,8 @@
         ("llama2", "Llama2"),
         ("llama3", "Llama3"),
         ("llava", "LLaVa"),
-        ("llava-next-video", "LLaVa-NeXT-Video"),
         ("llava_next", "LLaVA-NeXT"),
+        ("llava_next_video", "LLaVa-NeXT-Video"),
         ("longformer", "Longformer"),
         ("longt5", "LongT5"),
         ("luke", "LUKE"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 8bfc61b9bea3..d072a1b3deb0 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -97,8 +97,8 @@
             ("layoutlmv3", ("LayoutLMv3ImageProcessor",)),
             ("levit", ("LevitImageProcessor",)),
             ("llava", ("CLIPImageProcessor",)),
-            ("llava-next-video", ("LlavaNextVideoImageProcessor",)),
             ("llava_next", ("LlavaNextImageProcessor",)),
+            ("llava_next_video", ("LlavaNextVideoImageProcessor",)),
             ("mask2former", ("Mask2FormerImageProcessor",)),
             ("maskformer", ("MaskFormerImageProcessor",)),
             ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 2b49c295975d..5643246ec4b5 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -308,8 +308,8 @@
         ("idefics2", "Idefics2ForConditionalGeneration"),
         ("layoutlm", "LayoutLMForMaskedLM"),
         ("llava", "LlavaForConditionalGeneration"),
-        ("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
+        ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
         ("longformer", "LongformerForMaskedLM"),
         ("luke", "LukeForMaskedLM"),
         ("lxmert", "LxmertForPreTraining"),
@@ -721,8 +721,8 @@
         ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
-        ("llava-next-video", "LlavaNextVideoForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
+        ("llava_next_video", "LlavaNextVideoForConditionalGeneration"),
         ("paligemma", "PaliGemmaForConditionalGeneration"),
         ("pix2struct", "Pix2StructForConditionalGeneration"),
         ("video_llava", "VideoLlavaForConditionalGeneration"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 7877343d5318..1c41b80abe33 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -71,8 +71,8 @@
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
         ("llava", "LlavaProcessor"),
-        ("llava-next-video", "LlavaNextVideoProcessor"),
         ("llava_next", "LlavaNextProcessor"),
+        ("llava_next_video", "LlavaNextVideoProcessor"),
         ("markuplm", "MarkupLMProcessor"),
         ("mctct", "MCTCTProcessor"),
         ("mgp-str", "MgpstrProcessor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index a0f4e4f449f4..b094f50b5e97 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -257,8 +257,8 @@
                 ),
             ),
             ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
-            ("llava-next-video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
+            ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
             (
                 "longt5",