Add ImageProcessorFast to Qwen2.5-VL processor #36164

Merged · 21 commits · Feb 14, 2025
Changes from all commits
5 changes: 0 additions & 5 deletions docs/source/en/model_doc/qwen2_5_vl.md

```diff
@@ -280,11 +280,6 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 [[autodoc]] Qwen2_5_VLConfig
 
-## Qwen2_5_VLImageProcessor
-
-[[autodoc]] Qwen2_5_VLImageProcessor
-    - preprocess
-
 ## Qwen2_5_VLProcessor
 
 [[autodoc]] Qwen2_5_VLProcessor
```
2 changes: 0 additions & 2 deletions src/transformers/__init__.py

```diff
@@ -1280,7 +1280,6 @@
     _import_structure["models.pixtral"].append("PixtralImageProcessor")
     _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
     _import_structure["models.pvt"].extend(["PvtImageProcessor"])
-    _import_structure["models.qwen2_5_vl"].extend(["Qwen2_5_VLImageProcessor"])
     _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
     _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"])
     _import_structure["models.sam"].extend(["SamImageProcessor"])
@@ -6442,7 +6441,6 @@
             PoolFormerImageProcessor,
         )
         from .models.pvt import PvtImageProcessor
-        from .models.qwen2_5_vl import Qwen2_5_VLImageProcessor
         from .models.qwen2_vl import Qwen2VLImageProcessor
         from .models.rt_detr import RTDetrImageProcessor
         from .models.sam import SamImageProcessor
```

Collaborator, commenting on the removed `Qwen2_5_VLImageProcessor` import: "Not breaking because we added it in this release!"
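Since the standalone class is no longer exported from the top level, downstream imports need to target the shared Qwen2-VL class instead. A minimal sketch of the import-level effect of this change:

```python
# Still exported: the shared Qwen2-VL image processor.
from transformers import Qwen2VLImageProcessor

# No longer exported after this PR (would raise ImportError):
# from transformers import Qwen2_5_VLImageProcessor
```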
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py

```diff
@@ -127,6 +127,7 @@
         ("poolformer", ("PoolFormerImageProcessor",)),
         ("pvt", ("PvtImageProcessor",)),
         ("pvt_v2", ("PvtImageProcessor",)),
+        ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
         ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
         ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
         ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
```
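With the new `qwen2_5_vl` entry, `AutoImageProcessor` can resolve a fast processor for Qwen2.5-VL checkpoints. A minimal sketch, assuming a released checkpoint such as `Qwen/Qwen2.5-VL-7B-Instruct`:

```python
from transformers import AutoImageProcessor

# use_fast=True selects the second entry of the mapping tuple above.
image_processor = AutoImageProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True
)
print(type(image_processor).__name__)  # expected: Qwen2VLImageProcessorFast
```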
1 change: 0 additions & 1 deletion src/transformers/models/qwen2_5_vl/__init__.py

```diff
@@ -19,7 +19,6 @@
 
 if TYPE_CHECKING:
     from .configuration_qwen2_5_vl import *
-    from .image_processing_qwen2_5_vl import *
     from .modeling_qwen2_5_vl import *
     from .processing_qwen2_5_vl import *
 else:
```
426 changes: 0 additions & 426 deletions src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py

This file was deleted.

59 changes: 11 additions & 48 deletions src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py

```diff
@@ -29,7 +29,6 @@
 from torch.nn import CrossEntropyLoss
 
 from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
 from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     PatchMerger,
@@ -830,48 +829,6 @@ def prepare_inputs_for_generation(
         return model_inputs
 
 
-class Qwen2_5_VLImageProcessor(Qwen2VLImageProcessor):
-    r"""
-    Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Resampling filter to use when resizing the image.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spacial patch size of the vision encoder.
-        temporal_patch_size (`int`, *optional*, defaults to 2):
-            The temporal patch size of the vision encoder.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-    """
-
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-        "second_per_grid_ts",
-    ]
-
-
 class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
     fps: Union[List[float], float]
 
@@ -889,18 +846,25 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
 class Qwen2_5_VLProcessor(Qwen2VLProcessor):
     r"""
     Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
-    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
     Args:
-        image_processor ([`Qwen2_5_VLImageProcessor`], *optional*):
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2TokenizerFast`], *optional*):
             The tokenizer is a required input.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
     """
 
-    image_processor_class = "Qwen2_5_VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        return names_from_processor + ["second_per_grid_ts"]
 
     def __call__(
         self,
@@ -913,7 +877,7 @@ def __call__(
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
 
         Args:
             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -1016,6 +980,5 @@ def __call__(
     "Qwen2_5_VLForConditionalGeneration",
     "Qwen2_5_VLModel",
     "Qwen2_5_VLPreTrainedModel",
-    "Qwen2_5_VLImageProcessor",
     "Qwen2_5_VLProcessor",
 ]
```

Collaborator, commenting on the new `image_processor_class = "AutoImageProcessor"` line: "most needed!"
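Pointing `image_processor_class` at `"AutoImageProcessor"` means the processor no longer hard-codes a single image processor class; it carries whatever the auto mapping resolves, including the fast variant. A sketch of the intended behavior (checkpoint name illustrative):

```python
from transformers import AutoProcessor

# The processor delegates image-processor resolution to AutoImageProcessor,
# so use_fast=True can swap in the fast class.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True)
print(type(processor.image_processor).__name__)  # expected: Qwen2VLImageProcessorFast
```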
11 changes: 6 additions & 5 deletions src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py

```diff
@@ -48,10 +48,10 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
 class Qwen2_5_VLProcessor(ProcessorMixin):
     r"""
     Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
-    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
     Args:
-        image_processor ([`Qwen2_5_VLImageProcessor`], *optional*):
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2TokenizerFast`], *optional*):
             The tokenizer is a required input.
@@ -62,7 +62,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
     attributes = ["image_processor", "tokenizer"]
     valid_kwargs = ["chat_template"]
 
-    image_processor_class = "Qwen2_5_VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
     tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
 
     def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
@@ -81,7 +81,7 @@ def __call__(
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
 
         Args:
             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -212,7 +212,8 @@ def post_process_image_text_to_text(self, generated_outputs):
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        return names_from_processor + ["second_per_grid_ts"]
 
 
 __all__ = ["Qwen2_5_VLProcessor"]
```
7 changes: 0 additions & 7 deletions src/transformers/utils/dummy_vision_objects.py

```diff
@@ -590,13 +590,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
-class Qwen2_5_VLImageProcessor(metaclass=DummyObject):
-    _backends = ["vision"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["vision"])
-
-
 class Qwen2VLImageProcessor(metaclass=DummyObject):
     _backends = ["vision"]
 
```