Chat template: return vectorized output in processors #34275

Merged 42 commits on Jan 10, 2025

Commits (42)

d66a928
update chat template
zucchini-nlp Oct 17, 2024
2bff795
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 25, 2024
3c24aff
style
zucchini-nlp Oct 25, 2024
710edd1
fix tests
zucchini-nlp Oct 25, 2024
1bf58f3
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 25, 2024
76d24ae
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 29, 2024
eb588d1
Update src/transformers/image_utils.py
zucchini-nlp Oct 29, 2024
3de67e0
typehints + docs
zucchini-nlp Oct 29, 2024
bcf3dac
fix tests
zucchini-nlp Oct 29, 2024
87205d7
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 29, 2024
6282694
remove unnecessary warnings
zucchini-nlp Oct 29, 2024
690c314
forgot code style :(
zucchini-nlp Oct 29, 2024
9049d64
allow users to pass backend and num frames
zucchini-nlp Oct 29, 2024
243b4c3
Update docs/source/en/chat_templating.md
zucchini-nlp Oct 30, 2024
899d20d
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
47272f8
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
fc8ba58
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
8b0ddd7
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
d2d27fb
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
1adfbca
Update src/transformers/image_utils.py
zucchini-nlp Oct 30, 2024
d0209e2
Update src/transformers/processing_utils.py
zucchini-nlp Oct 30, 2024
cde21be
Merge branch 'main' into chat-template-vlms
zucchini-nlp Oct 30, 2024
34ee690
typo fix
zucchini-nlp Nov 4, 2024
3cd24ac
merge main
zucchini-nlp Nov 4, 2024
91057e4
style
zucchini-nlp Nov 4, 2024
5edb363
address comments
zucchini-nlp Nov 15, 2024
04080ea
Merge branch 'main' into chat-template-vlms
zucchini-nlp Nov 15, 2024
eb450f8
Merge branch 'main' into chat-template-vlms
zucchini-nlp Nov 18, 2024
9cc74a4
align with "pipeline" template
zucchini-nlp Nov 19, 2024
39724ef
update docs
zucchini-nlp Nov 19, 2024
72368f7
update docs
zucchini-nlp Nov 19, 2024
376e808
merge main
zucchini-nlp Jan 8, 2025
de58cb0
unpack for all kwargs?
zucchini-nlp Jan 8, 2025
71a82b5
wrong conflict resolution while rebasing
zucchini-nlp Jan 8, 2025
4e62720
tmp
zucchini-nlp Jan 8, 2025
45289f3
update docs
zucchini-nlp Jan 9, 2025
503b153
Merge branch 'main' into chat-template-vlms
zucchini-nlp Jan 9, 2025
2b54a52
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
3c3441e
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
4600728
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
39875be
Update docs/source/en/chat_templating.md
zucchini-nlp Jan 10, 2025
db2ec0c
Merge branch 'main' into chat-template-vlms
zucchini-nlp Jan 10, 2025
171 changes: 171 additions & 0 deletions src/transformers/image_utils.py
@@ -15,6 +15,7 @@

import base64
import os
from contextlib import redirect_stdout
from io import BytesIO
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union

@@ -24,13 +25,17 @@

from .utils import (
ExplicitEnum,
is_av_available,
is_cv2_available,
is_decord_available,
is_jax_tensor,
is_numpy_array,
is_tf_tensor,
is_torch_available,
is_torch_tensor,
is_torchvision_available,
is_vision_available,
is_yt_dlp_available,
logging,
requires_backends,
to_numpy,
@@ -55,6 +60,7 @@
PILImageResampling = PIL.Image

if is_torchvision_available():
from torchvision import io as torchvision_io
from torchvision.transforms import InterpolationMode

pil_torch_interpolation_mapping = {
@@ -66,6 +72,17 @@
PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
}

if is_decord_available():
from decord import VideoReader, cpu

if is_av_available():
import av

if is_cv2_available():
import cv2

if is_yt_dlp_available():
from yt_dlp import YoutubeDL
Comment on lines +76 to +86

@hmellor (Member), Feb 26, 2025

This block breaks lazy importing of cv2, which vLLM strictly enforces. It happens when vLLM runs `from transformers.image_utils import ImageInput`. vLLM cannot upgrade to v4.49.0 because of it (vllm-project/vllm#13905).

Would it be possible to delay this import? This would be preferable to lazily importing ImageInput everywhere it's used in vLLM.

cc @ArthurZucker
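
For illustration, a minimal sketch of the deferred-import pattern the comment asks for, assuming the module-level `import cv2` is moved inside the one function that needs it; this is a sketch of the suggestion, not the change that was eventually merged:

```python
from typing import Optional

import numpy as np


def read_video_opencv(video_path: str, num_frames: Optional[int] = None) -> np.ndarray:
    """Decode a video with OpenCV, importing cv2 only when a decode is actually requested."""
    # Deferred import: `from transformers.image_utils import ImageInput` stays cheap
    # for callers (such as vLLM) that never touch the OpenCV video path.
    import cv2

    video = cv2.VideoCapture(video_path)
    total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    step = total_num_frames / num_frames if num_frames else 1
    indices = set(np.arange(0, total_num_frames, step).astype(int).tolist())

    frames, index = [], 0
    while True:
        success, frame = video.read()
        if not success:
            break
        if index in indices:
            frames.append(frame)
        index += 1
    video.release()
    return np.stack(frames)
```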


if TYPE_CHECKING:
if is_torch_available():
@@ -385,6 +402,160 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] =
return image


def read_video_opencv(video_path, num_frames=None):
"""
    Decode the video with the OpenCV decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
if num_frames is not None:
indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(int)
else:
indices = np.arange(0, total_num_frames).astype(int)

index = 0
frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        if index in indices:
            frames.append(frame)
        index += 1
        if index >= total_num_frames:
            break

video.release()
return np.stack(frames)


def read_video_decord(video_path, num_frames=None):
"""
    Decode the video with the Decord decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
if num_frames is not None:
indices = np.arange(0, len(vr), len(vr) / num_frames).astype(int)
else:
indices = np.arange(0, len(vr)).astype(int)

frames = vr.get_batch(indices).asnumpy()
return frames


def read_video_pyav(video_path, num_frames=None):
"""
    Decode the video with the PyAV decoder.

Args:
video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
container = av.open(video_path)

# sample uniformly "num_frames" frames from the video
total_frames = container.streams.video[0].frames
if num_frames is not None:
indices = np.arange(0, total_frames, total_frames / num_frames).astype(int)
else:
indices = np.arange(0, total_frames).astype(int)

frames = []
container.seek(0)
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= 0 and i in indices:
frames.append(frame)
return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def read_video_torchvision(video_path, num_frames=None):
    """
    Decode the video with the torchvision decoder.

    Args:
        video_path (str): Path to the video file.
        num_frames (`int`, *optional*): Number of frames to sample uniformly. If `None`, all frames are returned.

    Returns:
        torch.Tensor: Tensor of decoded frames of shape (num_frames, channels, height, width).
    """
video, _, info = torchvision_io.read_video(
video_path,
start_pts=0.0,
end_pts=None,
pts_unit="sec",
output_format="TCHW",
)

if num_frames is not None:
idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
return video[idx]

return video


VIDEO_DECODERS = {
"decord": read_video_decord,
"opencv": read_video_opencv,
"pyav": read_video_pyav,
"torchvision": read_video_torchvision,
}


def load_video(video: Union[str, "VideoInput"], num_frames=None, backend="opencv") -> np.array:
"""
    Loads `video` into a numpy array.

Args:
video (`str` or `VideoInput`):
            The video to convert to the numpy array format. Can be a link to a video or a local path.
        num_frames (`int`, *optional*):
            Number of frames to sample uniformly. If not passed, the whole video is loaded.
        backend (`str`, *optional*, defaults to `"opencv"`):
            The backend used to decode the video. Can be one of `"decord"`, `"opencv"`, `"pyav"` or `"torchvision"`.

    Returns:
        `np.array`: A numpy array of decoded frames. The layout is `(num_frames, height, width, 3)` for the
        `decord`, `opencv` and `pyav` backends and `(num_frames, channels, height, width)` for `torchvision`.
"""
if video.startswith("https://www.youtube.com") or video.startswith("http://www.youtube.com"):
buffer = BytesIO()
with redirect_stdout(buffer), YoutubeDL() as f:
f.download([video])
bytes_obj = buffer.getvalue()
file_obj = BytesIO(bytes_obj)
elif video.startswith("http://") or video.startswith("https://"):
file_obj = BytesIO(requests.get(video).content)
elif os.path.isfile(video):
file_obj = video
    elif is_valid_image(video) or (isinstance(video, (list, tuple)) and is_valid_image(video[0])):
        file_obj = None
    else:
        raise TypeError("Incorrect format used for video. Should be a URL linking to a video or a local path.")

    # URL videos can only be decoded with the `pyav` or `decord` backends;
    # `opencv` and `torchvision` require a local file path
video_is_url = video.startswith("http://") or video.startswith("https://")
if video_is_url and backend in ["opencv", "torchvision"]:
raise ValueError(
"If you are trying to load a video from URL, you can decode the video only with `pyav` or `decord` as backend"
)

if file_obj is None:
return video

    video_decoder = VIDEO_DECODERS[backend]
    video = video_decoder(file_obj, num_frames=num_frames)
return video


def validate_preprocess_arguments(
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
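For reference, a brief usage sketch of the new `load_video` helper added above; the file path and URL are placeholders, and the example assumes `num_frames` is forwarded to the selected decoder:

```python
from transformers.image_utils import load_video

# Local file: any installed backend works; sample 8 frames uniformly.
frames = load_video("path/to/clip.mp4", num_frames=8, backend="decord")
print(frames.shape)  # (num_frames, height, width, 3) for the decord/opencv/pyav backends

# Remote URL: only the pyav (or decord) backend can decode the downloaded bytes;
# opencv and torchvision expect a local file path.
frames = load_video("https://example.com/clip.mp4", num_frames=8, backend="pyav")
```
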
60 changes: 56 additions & 4 deletions src/transformers/processing_utils.py
@@ -30,7 +30,7 @@
import typing_extensions

from .dynamic_module_utils import custom_object_save
from .image_utils import ChannelDimension, is_valid_image, is_vision_available
from .image_utils import ChannelDimension, is_valid_image, is_vision_available, load_image, load_video


if is_vision_available():
@@ -1075,12 +1075,29 @@ def apply_chat_template(
conversation: Union[List[Dict[str, str]]],
chat_template: Optional[str] = None,
tokenize: bool = False,
return_dict: bool = False,
processor_kwargs: Dict = {},
**kwargs,
) -> str:
"""
Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input
conversations to turn them into a single tokenizable string.

The input is expected to be in the following format, where each message content is a list consisting of text and
        optionally image or video inputs. One can also provide an image, video, URL or local path, which will be used to form
        `pixel_values` when `return_dict=True`. If no visual inputs are provided, only the formatted text is returned, optionally tokenized.

conversation = [
            {
                "role": "user",
"content": [
{"type": "text", "text": "Please describe this image in detail."},
{"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
],
},
]

Args:
            conversation (`List[Dict[str, str]]`):
The conversation to format.
            chat_template (`str`, *optional*):
                The Jinja template to use for formatting the conversation. If not provided, the tokenizer's
                chat template is used.
tokenize (`bool`, *optional*, defaults to `False`):
Whether to tokenize the output or not.
return_dict (`bool`, defaults to `False`):
Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
            processor_kwargs (`Dict[str, Any]`, *optional*):
Additional kwargs to pass to the processor. Used when `return_dict=True` and `tokenize=True`.
**kwargs:
Additional keyword arguments
Additional keyword arguments passed to the chat template, such as `tools` or `documents` depending
on whether it is supported by the current template.
"""

if chat_template is None:
"or provide a chat template as an argument. See "
"https://huggingface.co/docs/transformers/main/en/chat_templating for more information."
)
return self.tokenizer.apply_chat_template(
conversation, chat_template=chat_template, tokenize=tokenize, **kwargs

prompt = self.tokenizer.apply_chat_template(
conversation, chat_template=chat_template, tokenize=False, return_dict=False, **kwargs
)

def parse_conversation(conversation: Union[List[Dict[str, str]]]):
images, videos = [], []
for message in conversation:
visuals = [content for content in message["content"] if content["type"] in ["image", "video"]]
for vision_info in visuals:
if vision_info["type"] == "image" and "image" in vision_info:
images.append(load_image(vision_info["image"]))
elif vision_info["type"] == "video" and "video" in vision_info:
videos.append(load_video(vision_info["video"]))
return images, videos

# we will have to return all processed inputs in a dict
if tokenize:
images, videos = parse_conversation(conversation)
out = self(
text=prompt,
images=images if images else None,
videos=videos if videos else None,
padding=kwargs.get("padding", False),
truncation=kwargs.get("truncation", False),
max_length=kwargs.get("max_length", None),
return_tensors=kwargs.get("return_tensors", None),
)
if return_dict:
return out
else:
return out["input_ids"]
return prompt


def _validate_images_text_input_order(images, text):
"""
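For reference, a short usage sketch of the new vectorized path in `apply_chat_template`; the checkpoint name is only an example of a chat-capable vision-language processor, and the exact output keys vary by model:

```python
from transformers import AutoProcessor

# Example checkpoint; any processor whose chat template accepts image content works the same way.
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Please describe this image in detail."},
            {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"},
        ],
    },
]

# With tokenize=True and return_dict=True the processor loads the image itself and
# returns input_ids together with pixel_values, ready to be passed to the model.
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(inputs.keys())  # e.g. input_ids, attention_mask, pixel_values
```
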
2 changes: 2 additions & 0 deletions src/transformers/utils/__init__.py
@@ -129,6 +129,7 @@
is_cv2_available,
is_cython_available,
is_datasets_available,
is_decord_available,
is_detectron2_available,
is_eetq_available,
is_essentia_available,
@@ -230,6 +231,7 @@
is_training_run_on_sagemaker,
is_uroman_available,
is_vision_available,
is_yt_dlp_available,
requires_backends,
torch_only_method,
)