huggingface · zucchini-nlp · May 15, 2024 · Mar 19, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py
@@ -38,9 +38,10 @@ class VideoLlavaConfig(PretrainedConfig):
 
     Args:
         vision_config (`VideoLlavaVisionConfig`, *optional*):
-            Custom vision config or dict
+            Custom vision config or dict. Defaults to `CLIPVisionConfig` if not indicated.
         text_config (`Union[AutoConfig, dict]`, *optional*):
             The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+            Defaults to `LlamaConfig` if not indicated.
         ignore_index (`int`, *optional*, defaults to -100):
             The ignore index for the loss function.
         image_token_index (`int`, *optional*, defaults to 32000):
@@ -101,7 +102,9 @@ def __init__(
         self.vision_config = vision_config
 
         if isinstance(self.vision_config, dict):
-            vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model")
+            if "model_type" not in vision_config:
+                vision_config["model_type"] = "clip_vision_model"
+                logger.warning("Key=`model_type` not found in vision config, setting it to `clip_vision_model`")
             self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
         elif vision_config is None:
             self.vision_config = CONFIG_MAPPING["clip_vision_model"](
@@ -115,12 +118,13 @@ def __init__(
                 projection_dim=768,
             )
 
-        self.text_config = text_config
-
-        if isinstance(self.text_config, dict):
-            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        if isinstance(text_config, dict):
+            if "model_type" not in text_config:
+                text_config["model_type"] = "llama"
+                logger.warning("Key=`model_type` not found in text config, setting it to `llama`")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
         elif text_config is None:
-            self.text_config = CONFIG_MAPPING["llama"]()
+            text_config = CONFIG_MAPPING["llama"]()
 
+        self.text_config = text_config
         super().__init__(**kwargs)
diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py
@@ -139,6 +139,7 @@ def __init__(
         self.do_convert_rgb = do_convert_rgb
         self._valid_processor_keys = [
             "images",
+            "videos",
             "do_resize",
             "size",
             "resample",
@@ -206,8 +207,8 @@ def resize(
 
     def preprocess(
         self,
-        images: List[ImageInput],
-        videos: List[VideoInput],
+        images: List[ImageInput] = None,
+        videos: List[VideoInput] = None,
         do_resize: bool = None,
         size: Dict[str, int] = None,
         resample: PILImageResampling = None,
@@ -228,9 +229,12 @@ def preprocess(
         Preprocess an image or batch of images.
 
         Args:
-            visual_inputs (`ImageInput`):
-                List of images and/or videos to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+            images (`ImageInput`, *optional*):
+                List of images to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            videos (`VideoInput`, *optional*):
+                List of videos to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
+                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to `self.size`):
@@ -326,7 +330,7 @@ def preprocess(
                 ]
                 for video in videos
             ]
-            data["pixel_values_video"] = pixel_values_video
+            data["pixel_values_videos"] = pixel_values_videos
 
         if images is not None:
             pixel_values_images = [

diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py
@@ -217,6 +217,11 @@ def _supports_sdpa(self):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
             model's internal embedding lookup matrix.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
         use_cache (`bool`, *optional*):
             If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
             `past_key_values`).
@@ -372,6 +377,9 @@ def _get_vision_features(
         # videos do not need to select features and it's always "full" (as it is done in the orig implementation)
         if pixel_values_videos is not None:
             batch_size_vid, num_frames, channels, height, width = pixel_values_videos.shape
+            if num_frames != 8:
+                raise ValueError(f"Video pixel values should have exactly `8` frames but found `{num_frames}`")
+
             pixel_values = pixel_values_videos.reshape(batch_size_vid * num_frames, channels, height, width)
             video_outputs = self.video_tower(pixel_values, output_hidden_states=True)
             video_outputs = video_outputs.hidden_states[vision_feature_layer].squeeze(1)

diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
@@ -112,11 +112,10 @@ def __call__(
             encoded_images = self.image_processor(images=images, videos=videos, return_tensors=return_tensors)
             data.update(encoded_images)
 
-        if text is not None: 
-            text_inputs = self.tokenizer(
-                text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
-            )
-            data.update(text_inputs)
+        text_inputs = self.tokenizer(
+            text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
+        )
+        data.update(text_inputs)
 
         return BatchFeature(data=data)