Chat template: return vectorized output in processors #34275

Merged 42 commits on Jan 10, 2025

Commits (42)

d66a928
update chat template
zucchini-nlp Oct 17, 2024
2bff795
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 25, 2024
3c24aff
style
zucchini-nlp Oct 25, 2024
710edd1
fix tests
zucchini-nlp Oct 25, 2024
1bf58f3
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 25, 2024
76d24ae
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 29, 2024
eb588d1
Update src/transformers/image_utils.py
zucchini-nlp Oct 29, 2024
3de67e0
typehints + docs
zucchini-nlp Oct 29, 2024
bcf3dac
fix tests
zucchini-nlp Oct 29, 2024
87205d7
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 29, 2024
6282694
remove unnecessary warnings
zucchini-nlp Oct 29, 2024
690c314
forgot code style :(
zucchini-nlp Oct 29, 2024
9049d64
allow users to pass backend and num frames
zucchini-nlp Oct 29, 2024
243b4c3
Update docs/source/en/chat_templating.md
zucchini-nlp Oct 30, 2024
899d20d
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
47272f8
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
fc8ba58
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
8b0ddd7
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
d2d27fb
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
1adfbca
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
d0209e2
Update src/transformers/processing_utils.py
zucchini-nlp Oct 30, 2024
cde21be
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 30, 2024
34ee690
typo fix
zucchini-nlp Nov 4, 2024
3cd24ac
merge main
zucchini-nlp Nov 4, 2024
91057e4
style
zucchini-nlp Nov 4, 2024
5edb363
address comments
zucchini-nlp Nov 15, 2024
04080ea
Merge branch 'main' into chat-template-vlms
zucchini-nlp Nov 15, 2024
eb450f8
Merge branch 'main' into chat-template-vlms
zucchini-nlp Nov 18, 2024
9cc74a4
align with "pipeline" template
zucchini-nlp Nov 19, 2024
39724ef
update docs
zucchini-nlp Nov 19, 2024
72368f7
update docs
zucchini-nlp Nov 19, 2024
376e808
merge main
zucchini-nlp Jan 8, 2025
de58cb0
unpack for all kwargs?
zucchini-nlp Jan 8, 2025
71a82b5
wrong conflict resolution while rebasing
zucchini-nlp Jan 8, 2025
4e62720
tmp
zucchini-nlp Jan 8, 2025
45289f3
update docs
zucchini-nlp Jan 9, 2025
503b153
Merge branch 'main' into chat-template-vlms
zucchini-nlp Jan 9, 2025
2b54a52
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
3c3441e
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
4600728
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
39875be
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
db2ec0c
Merge branch 'main' into chat-template-vlms
zucchini-nlp Jan 10, 2025
171 changes: 171 additions & 0 deletions src/transformers/image_utils.py
@@ -15,6 +15,7 @@

import base64
import os
from contextlib import redirect_stdout
from io import BytesIO
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union

@@ -24,13 +25,17 @@

from .utils import (
ExplicitEnum,
is_av_available,
is_cv2_available,
is_decord_available,
is_jax_tensor,
is_numpy_array,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_vision_available,
is_yt_dlp_available,
logging,
requires_backends,
to_numpy,
@@ -55,6 +60,7 @@
PILImageResampling = PIL.Image

if is_torchvision_available():
from torchvision import io as torchvision_io
from torchvision.transforms import InterpolationMode

pil_torch_interpolation_mapping = {
@@ -66,6 +72,17 @@
PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
}

if is_decord_available():
from decord import VideoReader, cpu

if is_av_available():
import av

if is_cv2_available():
import cv2

if is_yt_dlp_available():
from yt_dlp import YoutubeDL
Comment on lines +76 to +86

@hmellor (Member), Feb 26, 2025

This block breaks lazy importing of cv2, which vLLM strictly enforces. It happens when vLLM runs `from transformers.image_utils import ImageInput`. vLLM cannot upgrade to v4.49.0 because of it (vllm-project/vllm#13905).

Would it be possible to delay this import? This would be preferable to lazily importing ImageInput everywhere it's used in vLLM.

cc @ArthurZucker
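
For illustration, a minimal sketch of the deferred-import pattern the comment asks for, assuming the module-level `import cv2` is moved inside the one function that needs it; this is a sketch of the suggestion, not the change that was eventually merged:

```python
from typing import Optional

import numpy as np


def read_video_opencv(video_path: str, num_frames: Optional[int] = None) -> np.ndarray:
    """Decode a video with OpenCV, importing cv2 only when a decode is actually requested."""
    # Deferred import: `from transformers.image_utils import ImageInput` stays cheap
    # for callers (such as vLLM) that never touch the OpenCV video path.
    import cv2

    video = cv2.VideoCapture(video_path)
    total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    step = total_num_frames / num_frames if num_frames else 1
    indices = set(np.arange(0, total_num_frames, step).astype(int).tolist())

    frames, index = [], 0
    while True:
        success, frame = video.read()
        if not success:
            break
        if index in indices:
            frames.append(frame)
        index += 1
    video.release()
    return np.stack(frames)
```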


if TYPE_CHECKING:
if is_torch_available():
@@ -385,6 +402,160 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] =
return image


def read_video_opencv(video_path, num_frames=None):
"""
    Decode the video with the OpenCV decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
if num_frames is not None:
indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(int)
else:
indices = np.arange(0, total_num_frames).astype(int)

index = 0
frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        if index in indices:
            frames.append(frame)
        index += 1
        if index >= total_num_frames:
            break

video.release()
return np.stack(frames)


def read_video_decord(video_path, num_frames=None):
"""
    Decode the video with the Decord decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
if num_frames is not None:
indices = np.arange(0, len(vr), len(vr) / num_frames).astype(int)
else:
indices = np.arange(0, len(vr)).astype(int)

frames = vr.get_batch(indices).asnumpy()
return frames


def read_video_pyav(video_path, num_frames=None):
"""
    Decode the video with the PyAV decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
container = av.open(video_path)

# sample uniformly "num_frames" frames from the video
total_frames = container.streams.video[0].frames
if num_frames is not None:
indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
else:
indices = np.arange(0, total_frames).astype(int)

frames = []
container.seek(0)
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= 0 and i in indices:
frames.append(frame)
return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def read_video_torchvision(video_path, num_frames=None):
    """
    Decode the video with the torchvision decoder.

    Args:
        video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

    Returns:
        torch.Tensor: Tensor of decoded frames of shape (num_frames, channels, height, width).
    """
video, _, info = torchvision_io.read_video(
video_path,
start_pts=0.0,
end_pts=None,
pts_unit="sec",
output_format="TCHW",
)

if num_frames is not None:
idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
return video[idx]

return video


VIDEO_DECODERS = {
"decord": read_video_decord,
"opencv": read_video_opencv,
"pyav": read_video_pyav,
"torchvision": read_video_torchvision,
}


def load_video(video: Union[str, "VideoInput"], num_frames=None, backend="opencv") -> np.array:
"""
    Loads `video` into a numpy array.

Args:
video (`str` or `VideoInput`):
            The video to convert to the numpy array format. Can be a link to a video or a local path.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not passed, the whole video is loaded.
        backend (`str`, *optional*, defaults to `"opencv"`):
            The backend used to decode the video. Can be one of `"decord"`, `"opencv"`, `"pyav"` or `"torchvision"`.

    Returns:
        `np.array`: A numpy array of decoded frames. The layout is `(num_frames, height, width, 3)` for the
        `decord`, `opencv` and `pyav` backends and `(num_frames, channels, height, width)` for `torchvision`.
"""
if video.startswith("https://www.youtube.com") or video.startswith("http://www.youtube.com"):
buffer = BytesIO()
with redirect_stdout(buffer), YoutubeDL() as f:
f.download([video])
bytes_obj = buffer.getvalue()
file_obj = BytesIO(bytes_obj)
elif video.startswith("http://") or video.startswith("https://"):
file_obj = BytesIO(requests.get(video).content)
elif os.path.isfile(video):
file_obj = video
    elif is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
        file_obj = None
    else:
        raise TypeError("Incorrect format used for video. Should be a URL linking to a video or a local path.")

    # URL videos can only be decoded with the `pyav` or `decord` backends;
    # `opencv` and `torchvision` require a local file path
video_is_url = video.startswith("http://") or video.startswith("https://")
if video_is_url and backend in ["opencv", "torchvision"]:
raise ValueError(
"If you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backend"
)

if file_obj is None:
return video

    video_decoder = VIDEO_DECODERS[backend]
    video = video_decoder(file_obj, num_frames=num_frames)
return video


def validate_preprocess_arguments(
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
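For reference, a brief usage sketch of the new `load_video` helper added above; the file path and URL are placeholders, and the example assumes `num_frames` is forwarded to the selected decoder:

```python
from transformers.image_utils import load_video

# Local file: any installed backend works; sample 8 frames uniformly.
frames = load_video("path/to/clip.mp4", num_frames=8, backend="decord")
print(frames.shape)  # (num_frames, height, width, 3) for the decord/opencv/pyav backends

# Remote URL: only the pyav (or decord) backend can decode the downloaded bytes;
# opencv and torchvision expect a local file path.
frames = load_video("https://example.com/clip.mp4", num_frames=8, backend="pyav")
```
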
60 changes: 56 additions & 4 deletions src/transformers/processing_utils.py
@@ -30,7 +30,7 @@
import typing_extensions

from .dynamic_module_utils import custom_object_save
from .image_utils import ChannelDimension, is_valid_image, is_vision_available
from .image_utils import ChannelDimension, is_valid_image, is_vision_available, load_image, load_video


if is_vision_available():
@@ -1075,12 +1075,29 @@ def apply_chat_template(
conversation: Union[List[Dict[str, str]]],
chat_template: Optional[str] = None,
tokenize: bool = False,
return_dict: bool = False,
processor_kwargs: Dict = {},
**kwargs,
) -> str:
"""
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
conversations to turn them into a single tokenizable string.

The input is expected to be in the following format, where each message content is a list consisting of text and
        optionally image or video inputs. One can also provide an image, video, URL or local path, which will be used to form
        `pixel_values` when `return_dict=True`. If no visual inputs are provided, only the formatted text is returned, optionally tokenized.

conversation = [
            {
                "role": "user",
"content": [
{"type": "text", "text": "Please describe this image in detail."},
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
],
},
]

Args:
            conversation (`List[Dict[str, str]]`):
The conversation to format.
            chat_template (`str`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
                chat template is used.
tokenize (`bool`, *optional*, defaults to `False`):
Whether to tokenize the output or not.
return_dict (`bool`, defaults to `False`):
Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
            processor_kwargs (`Dict[str, Any]`, *optional*):
Additional kwargs to pass to the processor. Used when `return_dict=True` and `tokenize=True`.
**kwargs:
Additional keyword arguments
Additional keyword arguments passed to the chat template, such as `tools` or `documents` depending
on whether it is supported by the current template.
"""

if chat_template is None:
"or provide a chat template as an argument. See "
"https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
)
return self.tokenizer.apply_chat_template(
conversation, chat_template=chat_template, tokenize=tokenize, **kwargs

prompt = self.tokenizer.apply_chat_template(
conversation, chat_template=chat_template, tokenize=False, return_dict=False, **kwargs
)

def parse_conversation(conversation: Union[List[Dict[str, str]]]):
images, videos = [], []
for message in conversation:
visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
for vision_info in visuals:
if vision_info["type"] == "image" and "image" in vision_info:
images.append(load_image(vision_info["image"]))
elif vision_info["type"] == "video" and "video" in vision_info:
videos.append(load_video(vision_info["video"]))
return images, videos

# we will have to return all processed inputs in a dict
if tokenize:
images, videos = parse_conversation(conversation)
out = self(
text=prompt,
images=images if images else None,
videos=videos if videos else None,
padding=kwargs.get("padding", False),
truncation=kwargs.get("truncation", False),
max_length=kwargs.get("max_length", None),
return_tensors=kwargs.get("return_tensors", None),
)
if return_dict:
return out
else:
return out["input_ids"]
return prompt


def _validate_images_text_input_order(images, text):
"""
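For reference, a short usage sketch of the new vectorized path in `apply_chat_template`; the checkpoint name is only an example of a chat-capable vision-language processor, and the exact output keys vary by model:

```python
from transformers import AutoProcessor

# Example checkpoint; any processor whose chat template accepts image content works the same way.
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe this image in detail."},
            {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
        ],
    },
]

# With tokenize=True and return_dict=True the processor loads the image itself and
# returns input_ids together with pixel_values, ready to be passed to the model.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(inputs.keys())  # e.g. input_ids, attention_mask, pixel_values
```
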
2 changes: 2 additions & 0 deletions src/transformers/utils/__init__.py
@@ -129,6 +129,7 @@
is_cv2_available,
is_cython_available,
is_datasets_available,
is_decord_available,
is_detectron2_available,
is_eetq_available,
is_essentia_available,
@@ -230,6 +231,7 @@
is_training_run_on_sagemaker,
is_uroman_available,
is_vision_available,
is_yt_dlp_available,
requires_backends,
torch_only_method,
)