From 050657f75119c92e6081b7f774a185a7d98e79cf Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 20 May 2024 12:54:09 +0200 Subject: [PATCH 01/37] let it be --- .../models/llava/processing_llava.py | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index ff010f74428a..61d06d3ad251 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -41,12 +41,13 @@ class LlavaProcessor(ProcessorMixin): """ attributes = ["image_processor", "tokenizer"] - image_processor_class = "AutoImageProcessor" - tokenizer_class = "AutoTokenizer" + image_processor_class = "CLIPImageProcessor" + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) + def __call__( self, text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, @@ -105,12 +106,29 @@ def __call__( pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] else: pixel_values = None + + # Replace the image token with the expanded image token sequence + image_str = self.image_token.content + num_image_tokens = self._get_number_of_features() + prompt_strings = [] + for sample in text: + sample = sample.replace(image_str, image_str * num_image_tokens) + prompt_strings.append(sample) + text_inputs = self.tokenizer( text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length ) return BatchFeature(data={**text_inputs, "pixel_values": pixel_values}) + def _get_number_of_features(self) -> int: + image_size = self.config.vision_config.image_size + patch_size = self.config.vision_config.patch_size + + num_patches = (image_size // patch_size) ** 2 + num_features = num_patches + 1 + return num_features + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): """ From a67087e41cf95fb2f272e08d9185c1745a0479af Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 22 May 2024 13:36:45 +0200 Subject: [PATCH 02/37] draft --- .../models/llava/modeling_llava.py | 224 +++------------- .../models/llava/processing_llava.py | 37 +-- .../models/llava_next/modeling_llava_next.py | 208 +++++---------- .../llava_next/processing_llava_next.py | 70 ++++- .../video_llava/modeling_video_llava.py | 245 +++--------------- .../video_llava/processing_video_llava.py | 28 +- .../models/vipllava/modeling_vipllava.py | 211 +++------------ 7 files changed, 305 insertions(+), 718 deletions(-) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 0426776beed1..e0df904896b5 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -23,7 +23,6 @@ from ... 
import PreTrainedModel from ...activations import ACT2FN -from ...cache_utils import Cache from ...modeling_outputs import ModelOutput from ...utils import ( add_start_docstrings, @@ -274,84 +273,6 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m self.vocab_size = model_embeds.num_embeddings return model_embeds - def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels): - num_images, num_image_patches, embed_dim = image_features.shape - batch_size, sequence_length = input_ids.shape - left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id)) - # 1. Create a mask to know where special image tokens are - special_image_token_mask = input_ids == self.config.image_token_index - num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1) - # Compute the maximum embed dimension - max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length - batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index) - - # 2. Compute the positions where text should be written - # Calculate new positions for text tokens in merged image-text sequence. - # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens. - # `torch.cumsum` computes how each image token shifts subsequent text token positions. - # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. - new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1 - nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1] - if left_padding: - new_token_positions += nb_image_pad[:, None] # offset for left padding - text_to_overwrite = new_token_positions[batch_indices, non_image_indices] - - # 3. Create the full embedding, already padded to the maximum position - final_embedding = torch.zeros( - batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device - ) - final_attention_mask = torch.zeros( - batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device - ) - if labels is not None: - final_labels = torch.full( - (batch_size, max_embed_dim), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device - ) - # In case the Vision model or the Language model has been offloaded to CPU, we need to manually - # set the corresponding tensors into their correct target device. - target_device = inputs_embeds.device - batch_indices, non_image_indices, text_to_overwrite = ( - batch_indices.to(target_device), - non_image_indices.to(target_device), - text_to_overwrite.to(target_device), - ) - attention_mask = attention_mask.to(target_device) - - # 4. Fill the embeddings based on the mask. If we have ["hey" "", "how", "are"] - # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features - final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] - final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices] - if labels is not None: - final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices] - - # 5. Fill the embeddings corresponding to the images. 
Anything that is not `text_positions` needs filling (#29835) - image_to_overwrite = torch.full( - (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device - ) - image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) - - if image_to_overwrite.sum() != image_features.shape[:-1].numel(): - raise ValueError( - f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while" - f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation." - ) - - final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device) - final_attention_mask |= image_to_overwrite - position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1) - - # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens. - batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id) - indices_to_mask = new_token_positions[batch_indices, pad_indices] - - final_embedding[batch_indices, indices_to_mask] = 0 - - if labels is None: - final_labels = None - - return final_embedding, final_attention_mask, final_labels, position_ids - @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( @@ -369,6 +290,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: r""" Args: @@ -406,6 +328,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache vision_feature_layer = ( vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer ) @@ -415,63 +338,28 @@ def forward( else self.config.vision_feature_select_strategy ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: - # 1. Extra the input embeddings inputs_embeds = self.get_input_embeddings()(input_ids) - # 2. Merge text and images - if pixel_values is not None and input_ids.shape[1] != 1: - image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) - # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
- selected_image_feature = image_outputs.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - else: - raise ValueError( - f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}" - ) - - image_features = self.multi_modal_projector(selected_image_feature) - inputs_embeds = inputs_embeds.to(image_features.dtype) - inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features( - image_features, inputs_embeds, input_ids, attention_mask, labels - ) - - # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of - # generation with cache - elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + if pixel_values is not None: + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True) + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
+ selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + else: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + image_features = self.multi_modal_projector(selected_image_feature) + special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds[special_image_mask] = image_features.flatten() outputs = self.language_model( attention_mask=attention_mask, @@ -482,6 +370,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) logits = outputs[0] @@ -515,56 +404,29 @@ def forward( ) def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + **kwargs, ): - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - else: - cache_length = past_length = past_key_values[0][0].shape[2] - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - elif self.config.image_token_index in input_ids: - input_ids = input_ids[:, input_ids.shape[1] - 1 :] - # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the - # older attention values, as their corresponding values are not part of the input. 
-            if cache_length < past_length and attention_mask is not None:
-                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "pixel_values": pixel_values,
-            }
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            **kwargs,
         )
+
+        # If we're in the cached decoding stage, pixel_values is None because input_ids no longer contain the special image token
+        # Otherwise we need the pixel_values passed by the user
+        if past_key_values is None:
+            model_inputs["pixel_values"] = pixel_values
+
         return model_inputs

     def _reorder_cache(self, *args, **kwargs):
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
index 61d06d3ad251..57dd0b69479f 100644
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -46,7 +46,10 @@ class LlavaProcessor(ProcessorMixin):

     def __init__(self, image_processor=None, tokenizer=None):
         super().__init__(image_processor, tokenizer)
-
+        self.image_size = self.image_processor.size["shortest_edge"]
+        self.patch_size = 14  # self.image_processor.patch_size
+        self.image_token = "<image>"
+        self.vision_feature_select_strategy = "default"  # self.image_processor.vision_feature_select_strategy

     def __call__(
         self,
@@ -106,29 +109,31 @@ def __call__(
             pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
         else:
             pixel_values = None
-
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text.
Please provide a string, or a list of strings") + # Replace the image token with the expanded image token sequence - image_str = self.image_token.content - num_image_tokens = self._get_number_of_features() + num_image_tokens = (self.image_size // self.patch_size) ** 2 + 1 + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + prompt_strings = [] for sample in text: - sample = sample.replace(image_str, image_str * num_image_tokens) + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) prompt_strings.append(sample) - + text_inputs = self.tokenizer( - text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length + prompt_strings, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + max_length=max_length, ) - return BatchFeature(data={**text_inputs, "pixel_values": pixel_values}) - def _get_number_of_features(self) -> int: - image_size = self.config.vision_config.image_size - patch_size = self.config.vision_config.patch_size - - num_patches = (image_size // patch_size) ** 2 - num_features = num_patches + 1 - return num_features - # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index c052af3b3c8a..e5e0675e962a 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -25,7 +25,6 @@ from ... import PreTrainedModel from ...activations import ACT2FN -from ...cache_utils import Cache from ...image_processing_utils import select_best_resolution from ...modeling_outputs import ModelOutput from ...utils import ( @@ -129,6 +128,7 @@ def unpad_image(tensor, original_size): original_aspect_ratio = original_width / original_height current_aspect_ratio = current_width / current_height + print(original_height, original_width, current_height, current_width) if original_aspect_ratio > current_aspect_ratio: scale_factor = current_width / original_width @@ -700,6 +700,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]: r""" Args: @@ -737,6 +738,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache vision_feature_layer = ( vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer ) @@ -746,104 +748,52 @@ def forward( else self.config.vision_feature_select_strategy ) - if inputs_embeds is None: - # 1. Extract the input embeddings - # In case image_token_index is not in the embeddings (extra token but embedding don't have it) - for_inputs_embeds_ids = input_ids.clone() - for_inputs_embeds_ids[(input_ids == self.config.image_token_index)] = 0 - inputs_embeds = self.get_input_embeddings()(for_inputs_embeds_ids) - - # 2. Merge text and images - if pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) > 0: - # ! 
infer image_num_patches from image_sizes - image_num_patches = [ - image_size_to_num_patches( - image_size=imsize, - grid_pinpoints=self.config.image_grid_pinpoints, - patch_size=self.config.vision_config.image_size, - ) - for imsize in image_sizes - ] - # figure out if pixel_values is concatenated or stacked - if pixel_values.dim() == 5: - # stacking when input is (batch_size, num_patches, num_channels, height, width) - _pixel_values_list = [ - pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches) - ] - pixel_values = torch.cat(_pixel_values_list, dim=0) - elif pixel_values.dim() != 4: - # otherwise has to be stacked from list of (num_patches, num_channels, height, width) - raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") - - image_features = self.vision_tower(pixel_values, output_hidden_states=True) - selected_image_feature = image_features.hidden_states[vision_feature_layer] - - if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] - elif vision_feature_select_strategy == "full": - selected_image_feature = selected_image_feature - - image_features = self.multi_modal_projector(selected_image_feature) - - image_features = torch.split(image_features, image_num_patches, dim=0) - - # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" - - image_features, feature_lens = self.pack_image_features( - image_features, - image_sizes, - image_newline=self.image_newline, - ) - - inputs_embeds = inputs_embeds.to(image_features.dtype) - inputs_embeds, attention_mask, position_ids, labels = self._merge_input_ids_with_image_features( - image_features, - feature_lens, - inputs_embeds, - input_ids, - attention_mask, - position_ids, - labels=labels, - ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) - # pixel_values is not None but is empty ---> text only cases - elif pixel_values is not None and input_ids.shape[1] != 1 and pixel_values.size(0) == 0: - # there are no images - pass - - # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of - # generation with cache - elif past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - # Get the target length - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None and pixel_values.size(0) > 0: + # ! 
infer image_num_patches from image_sizes + image_num_patches = [ + image_size_to_num_patches( + image_size=imsize, + grid_pinpoints=self.config.image_grid_pinpoints, + patch_size=self.config.vision_config.image_size, ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + for imsize in image_sizes + ] + # figure out if pixel_values is concatenated or stacked + if pixel_values.dim() == 5: + # stacking when input is (batch_size, num_patches, num_channels, height, width) + _pixel_values_list = [ + pix_val[:num_patch] for pix_val, num_patch in zip(pixel_values, image_num_patches) + ] + pixel_values = torch.cat(_pixel_values_list, dim=0) + elif pixel_values.dim() != 4: + # otherwise has to be stacked from list of (num_patches, num_channels, height, width) + raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions") + + image_features = self.vision_tower(pixel_values, output_hidden_states=True) + selected_image_feature = image_features.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + elif vision_feature_select_strategy == "full": + selected_image_feature = selected_image_feature + image_features = self.multi_modal_projector(selected_image_feature) + image_features = torch.split(image_features, image_num_patches, dim=0) + + # NOTE we only support multimodal_patch_merge_type == "spatial_unpad" + image_features, feature_lens = self.pack_image_features( + image_features, + image_sizes, + image_newline=self.image_newline, + ) + special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds[special_image_mask] = image_features.flatten() outputs = self.language_model( attention_mask=attention_mask, @@ -854,6 +804,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) logits = outputs[0] @@ -894,57 +845,24 @@ def prepare_inputs_for_generation( pixel_values=None, image_sizes=None, attention_mask=None, + cache_position=None, **kwargs, ): - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - else: - cache_length = past_length = past_key_values[0][0].shape[2] - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. 
We can discard
-            # input_ids based on the past_length.
-            elif past_length < input_ids.shape[1]:
-                input_ids = input_ids[:, past_length:]
-            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
-            elif self.config.image_token_index in input_ids:
-                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
-            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
-            # older attention values, as their corresponding values are not part of the input.
-            if cache_length < past_length and attention_mask is not None:
-                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
-            model_inputs = {"inputs_embeds": inputs_embeds}
-        else:
-            model_inputs = {"input_ids": input_ids}
-
-        model_inputs.update(
-            {
-                "position_ids": position_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache"),
-                "attention_mask": attention_mask,
-                "pixel_values": pixel_values,
-                "image_sizes": image_sizes,
-            }
+        model_inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            **kwargs,
         )
+
+        # If we're in the cached decoding stage, pixel_values is None because input_ids no longer contain the special image token
+        # Otherwise we need the pixel_values passed by the user
+        if past_key_values is None:
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["image_sizes"] = image_sizes
+
         return model_inputs

     # Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration._reorder_cache
diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py
index 8a4b76e9c68a..d60615fa0ed2 100644
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -19,6 +19,7 @@
 from typing import List, Optional, Union

 from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import select_best_resolution
 from ...image_utils import ImageInput
 from ...processing_utils import ProcessorMixin
 from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
@@ -46,6 +47,11 @@ class LlavaNextProcessor(ProcessorMixin):

     def __init__(self, image_processor=None, tokenizer=None):
         super().__init__(image_processor, tokenizer)
+        self.image_size = self.image_processor.size["shortest_edge"]
+        self.patch_size = 14  # self.image_processor.patch_size
+        self.image_token = "<image>"
+        self.vision_feature_select_strategy = "default"  # self.image_processor.vision_feature_select_strategy
+
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
@@ -108,12 +114,74 @@ def __call__(
             image_inputs = self.image_processor(images, do_pad=do_pad, return_tensors=return_tensors)
         else:
             image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+
elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + if not image_inputs: + prompt_strings = text + else: + image_sizes = image_inputs["image_sizes"] + prompt_strings = [] + for image_size, sample in zip(image_sizes, text): + # Replace the image token with the expanded image token sequence + height, width = image_size + num_image_tokens = self._get_number_of_features(height, width) + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + sample = sample.replace(self.image_token, self.image_token * num_image_tokens) + prompt_strings.append(sample) + text_inputs = self.tokenizer( - text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length + prompt_strings, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + max_length=max_length, ) return BatchFeature(data={**text_inputs, **image_inputs}) + def _get_number_of_features(self, height: int, width: int) -> int: + image_grid_pinpoints = self.image_processor.image_grid_pinpoints + image_size = self.image_size + patch_size = self.patch_size + + npatches = image_size // patch_size + + height_best_resolution, width_best_resolution = select_best_resolution([height, width], image_grid_pinpoints) + num_patch_height, num_patch_width = height_best_resolution // image_size, width_best_resolution // image_size + + unpadded_features, newline_features = self._get_unpadded_features( + height, width, npatches, num_patch_height, num_patch_width + ) + # The base patch covers the entire image (+1 for the CLS) + base_features = npatches**2 + 1 + num_image_tokens = unpadded_features + newline_features + base_features + return num_image_tokens + + def _get_unpadded_features(self, height, width, npatches, num_patch_height, num_patch_width): + current_width = npatches * num_patch_height + current_height = npatches * num_patch_width + + original_aspect_ratio = width / height + current_aspect_ratio = current_width / current_height + if original_aspect_ratio > current_aspect_ratio: + new_height = (height * current_width) // width + padding = (current_height - new_height) // 2 + current_height -= padding * 2 + else: + new_width = (width * current_height) // height + padding = (current_width - new_width) // 2 + current_width -= padding * 2 + + unpadded_features = current_height * current_width + newline_features = current_height + return (unpadded_features, newline_features) + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 7fbd142fbe85..f45bc814fcf2 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -22,7 +22,6 @@ from ... 
import PreTrainedModel from ...activations import ACT2FN -from ...cache_utils import Cache from ...modeling_outputs import BaseModelOutputWithPooling, ModelOutput from ...utils import ( add_start_docstrings, @@ -279,87 +278,6 @@ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_m self.vocab_size = model_embeds.num_embeddings return model_embeds - def _merge_input_ids_with_visual_features( - self, visual_features, inputs_embeds, input_ids, attention_mask, labels, num_frames=1 - ): - num_images, num_image_patches, embed_dim = visual_features.shape - batch_size, sequence_length = input_ids.shape - left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id)) - special_vision_token = self.config.video_token_index if num_frames == 8 else self.config.image_token_index - - # 1. Create a mask to know where special image tokens are - special_image_token_mask = input_ids == special_vision_token - num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1) - # Compute the maximum embed dimension - max_seq_len = (num_special_image_tokens.max() * (num_image_patches * num_frames - 1)) + sequence_length - batch_indices, non_image_indices = torch.where(input_ids != special_vision_token) - - # 2. Compute the positions where text should be written - # Calculate new positions for text tokens in merged image-text sequence. - # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens. - # `torch.cumsum` computes how each image token shifts subsequent text token positions. - # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one. - new_token_positions = ( - torch.cumsum((special_image_token_mask * (num_image_patches * num_frames - 1) + 1), dim=-1) - 1 - ) - nb_image_pad = max_seq_len - 1 - new_token_positions[:, -1] - if left_padding: - new_token_positions += nb_image_pad[:, None] # offset for left padding - text_to_overwrite = new_token_positions[batch_indices, non_image_indices] - - # 3. Create the full embedding, already padded to the maximum position - # expand input ids so that the second "merge" with videos does not fail - final_embedding = torch.zeros( - batch_size, max_seq_len, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device - ) - final_attention_mask = torch.zeros( - batch_size, max_seq_len, dtype=attention_mask.dtype, device=inputs_embeds.device - ) - final_input_ids = torch.full( - (batch_size, max_seq_len), self.pad_token_id, dtype=input_ids.dtype, device=inputs_embeds.device - ) - # In case the Vision model or the Language model has been offloaded to CPU, we need to manually - # set the corresponding tensors into their correct target device. - target_device = inputs_embeds.device - batch_indices, non_image_indices, text_to_overwrite = ( - batch_indices.to(target_device), - non_image_indices.to(target_device), - text_to_overwrite.to(target_device), - ) - attention_mask = attention_mask.to(target_device) - - # 4. Fill the embeddings based on the mask. 
If we have ["hey" "", "how", "are"] - # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features - final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices] - final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices] - final_input_ids[batch_indices, text_to_overwrite] = input_ids[batch_indices, non_image_indices] - if labels is not None: - final_labels = torch.full( - (batch_size, max_seq_len), self.config.ignore_index, dtype=input_ids.dtype, device=input_ids.device - ) - final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_image_indices] - else: - final_labels = None - - # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling - image_to_overwrite = torch.full((batch_size, max_seq_len), True, dtype=torch.bool, device=inputs_embeds.device) - image_to_overwrite[batch_indices, text_to_overwrite] = False - image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device) - - if image_to_overwrite.sum() != visual_features.shape[:-1].numel(): - visual_type = "videos" if num_frames == 8 else "images" - num_images //= num_frames - raise ValueError( - f"The input provided to the model are wrong. The number of {visual_type} tokens is {torch.sum(special_image_token_mask)} while" - f" the number of {visual_type} given to the model is {num_images}. This prevents correct indexing and breaks batch generation." - ) - - final_embedding[image_to_overwrite] = visual_features.contiguous().reshape(-1, embed_dim).to(target_device) - final_attention_mask |= image_to_overwrite - position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1) - - return final_embedding, final_attention_mask, final_labels, position_ids, final_input_ids - def _get_vision_features( self, pixel_values_images: Optional[torch.FloatTensor] = None, @@ -415,6 +333,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, ) -> Union[Tuple, VideoLlavaCausalLMOutputWithPast]: r""" Args: @@ -495,6 +414,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_cache = use_cache if use_cache is not None else self.config.use_cache vision_feature_layer = ( vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer ) @@ -504,79 +424,35 @@ def forward( else self.config.vision_feature_select_strategy ) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + if inputs_embeds is None: - # 1. Extra the input embeddings inputs_embeds = self.get_input_embeddings()(input_ids) - # 2. 
Merge text and images - if (pixel_values_images is not None or pixel_values_videos is not None) and input_ids.shape[1] != 1: - image_outputs, video_outputs = self._get_vision_features( - pixel_values_images=pixel_values_images, - pixel_values_videos=pixel_values_videos, - vision_feature_layer=vision_feature_layer, - vision_feature_select_strategy=vision_feature_select_strategy, - ) + if pixel_values_images is not None or pixel_values_videos is not None: + image_outputs, video_outputs = self._get_vision_features( + pixel_values_images=pixel_values_images, + pixel_values_videos=pixel_values_videos, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + ) - # first add image embeds where possible, then expand again and add video embeds - if image_outputs is not None: - visual_features = self.multi_modal_projector(image_outputs) - ( - inputs_embeds, - attention_mask, - labels, - position_ids, - input_ids, - ) = self._merge_input_ids_with_visual_features( - visual_features, inputs_embeds, input_ids, attention_mask, labels - ) - if video_outputs is not None: - visual_features = self.multi_modal_projector(video_outputs) - ( - inputs_embeds, - attention_mask, - labels, - position_ids, - _, - ) = self._merge_input_ids_with_visual_features( - visual_features, - inputs_embeds, - input_ids, - attention_mask, - labels, - num_frames=8, - ) - else: - # In case input_ids.shape[1] == 1 & past_key_values != None, we are in the case of - # generation with cache - if past_key_values is not None and input_ids.shape[1] == 1: - # Retrieve the first layer to inspect the logits and mask out the hidden states - # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] - - # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) - - target_length = input_ids.shape[1] - past_length = first_layer_past_key_value.shape[-1] - - extended_attention_mask = torch.ones( - (attention_mask.shape[0], past_length), - dtype=attention_mask.dtype, - device=attention_mask.device, - ) - - # Filter out only the tokens that can be un-attended, this can happen - # if one uses Llava + Fused modules where the cache on the - # first iteration is already big enough, or if one passes custom cache - valid_indices = non_attended_tokens < extended_attention_mask.size(-1) - new_batch_index = batch_index[valid_indices] - new_non_attended_tokens = non_attended_tokens[valid_indices] - - # Zero-out the places where we don't need to attend - extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - - attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) - position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 + if image_outputs is not None: + image_features = self.multi_modal_projector(image_outputs) + special_image_mask = ( + (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds) + ) + inputs_embeds[special_image_mask] = image_features.flatten() + if video_outputs is not None: + video_features = self.multi_modal_projector(video_outputs) + print(video_features.shape) + special_image_mask = ( + (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds) + ) + inputs_embeds[special_image_mask] = video_features.flatten() outputs = self.language_model( 
attention_mask=attention_mask, @@ -587,6 +463,7 @@ def forward( output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) logits = outputs[0] @@ -627,60 +504,24 @@ def prepare_inputs_for_generation( pixel_values_images=None, pixel_values_videos=None, attention_mask=None, + cache_position=None, **kwargs, ): - if past_key_values is not None: - if isinstance(past_key_values, Cache): - cache_length = past_key_values.get_seq_length() - past_length = past_key_values.seen_tokens - else: - cache_length = past_length = past_key_values[0][0].shape[2] - - # Keep only the unprocessed tokens: - # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where - # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as - # input) - if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: - input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] - # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard - # input_ids based on the past_length. - elif past_length < input_ids.shape[1]: - input_ids = input_ids[:, past_length:] - # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - else: - input_ids = input_ids[:, input_ids.shape[1] - 1 :] - # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the - # older attention values, as their corresponding values are not part of the input. - if cache_length < past_length and attention_mask is not None: - attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :] - - pixel_values_videos = None - pixel_values_images = None - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "pixel_values_videos": pixel_values_videos, - "pixel_values_images": pixel_values_images, - } + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + **kwargs, ) + + # If we're in cached decoding stage, pixel values is None because input ids do not contain special image token anymore + # Otherwise we need pixel values passed by the user + if past_key_values is None: + model_inputs["pixel_values_images"] = pixel_values_images + model_inputs["pixel_values_videos"] = pixel_values_videos + return model_inputs def _reorder_cache(self, *args, **kwargs): diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 0355d756ce27..51e2ad55a665 100644 --- 
a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -46,6 +46,11 @@ class VideoLlavaProcessor(ProcessorMixin):

     def __init__(self, image_processor=None, tokenizer=None):
         super().__init__(image_processor, tokenizer)
+        self.image_size = self.image_processor.size["shortest_edge"]
+        self.patch_size = 14  # self.image_processor.patch_size
+        self.image_token = "<image>"
+        self.video_token = "
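For reference, a minimal standalone sketch of the expansion logic the processors above converge on, assuming CLIP-ViT-L/14 at 336px (image_size=336, patch_size=14) and the "default" feature-select strategy that drops the CLS feature; the constant and helper below are illustrative and not part of the patch:

# Illustrative sketch only: mirrors the per-image token count computed in LlavaProcessor.__call__,
# assuming image_size=336 and patch_size=14.
IMAGE_TOKEN = "<image>"

def expand_image_tokens(prompt: str, image_size: int = 336, patch_size: int = 14,
                        vision_feature_select_strategy: str = "default") -> str:
    # One projected feature per vision patch, plus one for the CLS token
    num_image_tokens = (image_size // patch_size) ** 2 + 1
    # The "default" strategy slices off the CLS feature, so reserve one placeholder fewer
    if vision_feature_select_strategy == "default":
        num_image_tokens -= 1
    # Repeat the placeholder so the tokenized prompt holds one position per image feature
    return prompt.replace(IMAGE_TOKEN, IMAGE_TOKEN * num_image_tokens)

expanded = expand_image_tokens("USER: <image>\nWhat is shown? ASSISTANT:")
print(expanded.count(IMAGE_TOKEN))  # 576

With the placeholders pre-expanded on the processor side, the modeling code can drop _merge_input_ids_with_image_features and simply write the projected vision features into the positions where input_ids equals the image token index, as the new special_image_mask assignments above do.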