Add ImageProcessorFast to Qwen2.5-VL processor (#36164)
* add qwen2 fast image processor to modular file

Signed-off-by: isotr0py <2037008807@qq.com>

* fix modular

Signed-off-by: isotr0py <2037008807@qq.com>

* fix circle import

Signed-off-by: isotr0py <2037008807@qq.com>

* add docs

Signed-off-by: isotr0py <2037008807@qq.com>

* fix typo

Signed-off-by: isotr0py <2037008807@qq.com>

* add modular generated files

Signed-off-by: isotr0py <2037008807@qq.com>

* revert qwen2vl fast image processor

Signed-off-by: isotr0py <2037008807@qq.com>

* remove qwen2.5-vl image processor from modular

Signed-off-by: isotr0py <2037008807@qq.com>

* re-generate qwen2.5-vl files

Signed-off-by: isotr0py <2037008807@qq.com>

* remove unnecessary test

Signed-off-by: isotr0py <2037008807@qq.com>

* fix auto map

Signed-off-by: isotr0py <2037008807@qq.com>

* cleanup

Signed-off-by: isotr0py <2037008807@qq.com>

* fix model_input_names

Signed-off-by: isotr0py <2037008807@qq.com>

* remove import

Signed-off-by: isotr0py <2037008807@qq.com>

* make fix-copies

Signed-off-by: isotr0py <2037008807@qq.com>

---------

Signed-off-by: isotr0py <2037008807@qq.com>
Isotr0py authored Feb 14, 2025
1 parent 1931a35 commit 33d1d71
Showing 10 changed files with 20 additions and 748 deletions.
5 changes: 0 additions & 5 deletions docs/source/en/model_doc/qwen2_5_vl.md
@@ -264,11 +264,6 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

 [[autodoc]] Qwen2_5_VLConfig
 
-## Qwen2_5_VLImageProcessor
-
-[[autodoc]] Qwen2_5_VLImageProcessor
-    - preprocess
-
 ## Qwen2_5_VLProcessor
 
 [[autodoc]] Qwen2_5_VLProcessor
2 changes: 0 additions & 2 deletions src/transformers/__init__.py
@@ -1281,7 +1281,6 @@
_import_structure["models.pixtral"].append("PixtralImageProcessor")
_import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"])
_import_structure["models.pvt"].extend(["PvtImageProcessor"])
_import_structure["models.qwen2_5_vl"].extend(["Qwen2_5_VLImageProcessor"])
_import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"])
_import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"])
_import_structure["models.sam"].extend(["SamImageProcessor"])
@@ -6444,7 +6443,6 @@
         PoolFormerImageProcessor,
     )
     from .models.pvt import PvtImageProcessor
-    from .models.qwen2_5_vl import Qwen2_5_VLImageProcessor
     from .models.qwen2_vl import Qwen2VLImageProcessor
     from .models.rt_detr import RTDetrImageProcessor
     from .models.sam import SamImageProcessor
1 change: 1 addition & 0 deletions src/transformers/models/auto/image_processing_auto.py
@@ -127,6 +127,7 @@
("poolformer", ("PoolFormerImageProcessor",)),
("pvt", ("PvtImageProcessor",)),
("pvt_v2", ("PvtImageProcessor",)),
("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")),
("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")),
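Note: with the `qwen2_5_vl` entry registered above, the fast variant can be selected through the auto class. A minimal sketch, assuming a released Qwen2.5-VL checkpoint (the repo name below is illustrative, not taken from this diff):

```python
from transformers import AutoImageProcessor

# Illustrative checkpoint name (an assumption, not part of this commit).
# use_fast=True resolves to Qwen2VLImageProcessorFast via the mapping above;
# use_fast=False (or omitting it) keeps the slow Qwen2VLImageProcessor.
image_processor = AutoImageProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", use_fast=True
)
print(type(image_processor).__name__)  # expected: Qwen2VLImageProcessorFast
```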
1 change: 0 additions & 1 deletion src/transformers/models/qwen2_5_vl/__init__.py
@@ -19,7 +19,6 @@

 if TYPE_CHECKING:
     from .configuration_qwen2_5_vl import *
-    from .image_processing_qwen2_5_vl import *
     from .modeling_qwen2_5_vl import *
     from .processing_qwen2_5_vl import *
 else:
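Both `__init__.py` changes above feed the same lazy-import machinery: public names are listed in `_import_structure` (or imported only under `TYPE_CHECKING`), and the defining submodule is loaded on first attribute access. A simplified sketch of that pattern; transformers' real `_LazyModule` handles more cases than this:

```python
import importlib
from types import ModuleType


class _LazyModule(ModuleType):
    """Defer submodule imports until an exported name is first accessed."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported name to the submodule that defines it.
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr):  # only called when normal lookup fails
        module_name = self._name_to_module.get(attr)
        if module_name is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        module = importlib.import_module(f".{module_name}", self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups skip the import
        return value


# Inside a package __init__.py one would register, for example:
# import sys
# _import_structure = {"processing_qwen2_5_vl": ["Qwen2_5_VLProcessor"]}
# sys.modules[__name__] = _LazyModule(__name__, _import_structure)
```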
426 changes: 0 additions & 426 deletions src/transformers/models/qwen2_5_vl/image_processing_qwen2_5_vl.py

This file was deleted.

59 changes: 11 additions & 48 deletions src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -29,7 +29,6 @@
 from torch.nn import CrossEntropyLoss
 
 from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
 from transformers.models.qwen2_vl.modeling_qwen2_vl import (
     PatchEmbed,
     PatchMerger,
@@ -854,48 +853,6 @@ def prepare_inputs_for_generation(
         return model_inputs
 
 
-class Qwen2_5_VLImageProcessor(Qwen2VLImageProcessor):
-    r"""
-    Constructs a Qwen2.5-VL image processor that dynamically resizes images based on the original images.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to resize the image's (height, width) dimensions.
-        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
-            Resampling filter to use when resizing the image.
-        do_rescale (`bool`, *optional*, defaults to `True`):
-            Whether to rescale the image by the specified scale `rescale_factor`.
-        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
-            Scale factor to use if rescaling the image.
-        do_normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the image.
-        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
-            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
-            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
-        do_convert_rgb (`bool`, *optional*, defaults to `True`):
-            Whether to convert the image to RGB.
-        min_pixels (`int`, *optional*, defaults to `56 * 56`):
-            The min pixels of the image to resize the image.
-        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
-            The max pixels of the image to resize the image.
-        patch_size (`int`, *optional*, defaults to 14):
-            The spatial patch size of the vision encoder.
-        temporal_patch_size (`int`, *optional*, defaults to 2):
-            The temporal patch size of the vision encoder.
-        merge_size (`int`, *optional*, defaults to 2):
-            The merge size of the vision encoder to llm encoder.
-    """
-
-    model_input_names = [
-        "pixel_values",
-        "image_grid_thw",
-        "pixel_values_videos",
-        "video_grid_thw",
-        "second_per_grid_ts",
-    ]
 
 
 class Qwen2_5_VLVideosProcessorKwargs(VideosKwargs, total=False):
     fps: Union[List[float], float]
 
@@ -913,18 +870,25 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
 class Qwen2_5_VLProcessor(Qwen2VLProcessor):
     r"""
     Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
-    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
     Args:
-        image_processor ([`Qwen2_5_VLImageProcessor`], *optional*):
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2TokenizerFast`], *optional*):
             The tokenizer is a required input.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
     """
 
-    image_processor_class = "Qwen2_5_VLImageProcessor"
+    image_processor_class = "AutoImageProcessor"
 
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        return names_from_processor + ["second_per_grid_ts"]
+
     def __call__(
         self,
@@ -937,7 +901,7 @@ def __call__(
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
         Args:
             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
@@ -1040,6 +1004,5 @@ def __call__(
"Qwen2_5_VLForConditionalGeneration",
"Qwen2_5_VLModel",
"Qwen2_5_VLPreTrainedModel",
"Qwen2_5_VLImageProcessor",
"Qwen2_5_VLProcessor",
]
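With the dedicated image processor gone, the modular processor computes `model_input_names` at runtime instead of hard-coding it. A quick check of the merged result; the checkpoint name is illustrative and the exact ordering depends on the loaded tokenizer and image processor:

```python
from transformers import AutoProcessor

# Illustrative checkpoint name (an assumption, not part of this commit).
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# The property merges tokenizer and image-processor input names, then appends
# the video timing key that the shared Qwen2VL image processor does not list.
print(processor.model_input_names)
# e.g. [..., 'pixel_values', 'image_grid_thw', 'pixel_values_videos',
#       'video_grid_thw', 'second_per_grid_ts']
```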
11 changes: 6 additions & 5 deletions src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
@@ -48,10 +48,10 @@ class Qwen2_5_VLProcessorKwargs(ProcessingKwargs, total=False):
 class Qwen2_5_VLProcessor(ProcessorMixin):
     r"""
     Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
-    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2_5_VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
+    [`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
     [`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
     Args:
-        image_processor ([`Qwen2_5_VLImageProcessor`], *optional*):
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
             The image processor is a required input.
         tokenizer ([`Qwen2TokenizerFast`], *optional*):
             The tokenizer is a required input.
@@ -62,7 +62,7 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
attributes = ["image_processor", "tokenizer"]
valid_kwargs = ["chat_template"]

image_processor_class = "Qwen2_5_VLImageProcessor"
image_processor_class = "AutoImageProcessor"
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")

def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
@@ -81,7 +81,7 @@ def __call__(
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
         and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
         the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Qwen2_5_VLImageProcessor's [`~Qwen2_5_VLImageProcessor.__call__`] if `vision_infos` is not `None`.
+        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
         Args:
             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
Expand Down Expand Up @@ -212,7 +212,8 @@ def post_process_image_text_to_text(self, generated_outputs):
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
-        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        return names_from_processor + ["second_per_grid_ts"]
 
 
 __all__ = ["Qwen2_5_VLProcessor"]
7 changes: 0 additions & 7 deletions src/transformers/utils/dummy_vision_objects.py
@@ -590,13 +590,6 @@ def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])


class Qwen2_5_VLImageProcessor(metaclass=DummyObject):
_backends = ["vision"]

def __init__(self, *args, **kwargs):
requires_backends(self, ["vision"])


class Qwen2VLImageProcessor(metaclass=DummyObject):
_backends = ["vision"]

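The deleted class above is a placeholder that stands in for the image processor when the vision backend is not installed, so using it fails with a readable error instead of an obscure import failure. A self-contained sketch of the pattern, with simplified stand-ins for transformers' real `DummyObject` and `requires_backends`:

```python
import importlib.util


def requires_backends(obj, backends):
    """Raise a readable error if a required backend is missing (simplified)."""
    probes = {"vision": "PIL"}  # assumption: backend name -> module to probe
    for backend in backends:
        if importlib.util.find_spec(probes.get(backend, backend)) is None:
            raise ImportError(
                f"{type(obj).__name__} requires the {backend} backend. "
                "Install it to use this class."
            )


class DummyObject(type):
    """Metaclass marker for placeholder classes (simplified)."""


class Qwen2VLImageProcessor(metaclass=DummyObject):
    _backends = ["vision"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["vision"])
```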