[VLM] Simplify post-processing of replacement info #12269

Merged 1 commit on Jan 22, 2025
tests/models/multimodal/processing/test_common.py (2 changes: 1 addition & 1 deletion)
@@ -35,7 +35,7 @@ def _test_processing_correctness(
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=True,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="float16",
revision=None,
tests/models/registry.py (3 changes: 2 additions & 1 deletion)
@@ -261,7 +261,8 @@ def check_available_online(
trust_remote_code=True),
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
trust_remote_code=True),
# [Encoder-decoder]
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
tests/multimodal/test_processing.py (42 changes: 23 additions & 19 deletions)
@@ -7,12 +7,16 @@

from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement,
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptReplacement,
find_mm_placeholders,
find_text_matches, find_token_matches,
iter_token_matches,
replace_text_matches,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -433,19 +437,19 @@ def test_find_replace_tokens(
[1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=6,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_4": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_4",
item_idx=0,
start_idx=3,
replacement=[32000],
tokens=[32000],
),
],
}
@@ -455,25 +459,25 @@
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=5,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_3": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_3",
item_idx=0,
start_idx=7,
replacement=[1550, 918, 1550],
tokens=[1550, 918, 1550],
),
],
# No match for pattern_4 as it has lower priority than pattern_1
@@ -483,33 +487,33 @@
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=3,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_4": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_4",
item_idx=0,
start_idx=5,
replacement=[32000],
tokens=[32000],
),
],
"pattern_3": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_3",
item_idx=0,
start_idx=6,
replacement=[1550, 918, 1550],
tokens=[1550, 918, 1550],
),
],
}
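For orientation, the test expectations above exercise exactly four fields of the renamed class. A minimal sketch of that shape, inferred only from these tests (the actual definition in vllm.multimodal.processing may include additional fields and helpers):

from dataclasses import dataclass

@dataclass
class PlaceholderFeaturesInfo:
    # Sketch reconstructed from the test cases above, not the canonical definition.
    modality: str      # modality key, e.g. "pattern_1"
    item_idx: int      # index of the item within that modality
    start_idx: int     # offset of the feature tokens in the prompt token IDs
    tokens: list[int]  # the feature tokens themselves (previously named `replacement`)
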
vllm/model_executor/models/aria.py (10 changes: 2 additions & 8 deletions)
@@ -342,13 +342,7 @@ def get_vision_config(self):
return self.get_hf_config().vision_config

def get_hf_processor(self):
processor = self.ctx.get_hf_processor(AriaProcessor)

# Patch for https://github.com/huggingface/transformers/issues/35768
processor.tokenizer.image_token = "<|img|>"
processor.image_token = "<|img|>"

return processor
return self.ctx.get_hf_processor(AriaProcessor)

def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
@@ -381,7 +375,7 @@ def get_dummy_processor_inputs(
}

hf_processor = self.info.get_hf_processor()
image_token: str = hf_processor.image_token # type: ignore
image_token: str = hf_processor.tokenizer.image_token # type: ignore

return ProcessorInputs(
prompt_text=image_token * num_images,
vllm/model_executor/models/blip2.py (33 changes: 8 additions & 25 deletions)
@@ -14,12 +14,12 @@
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -481,30 +481,13 @@ def _get_prompt_replacements(
PromptReplacement(
modality="image",
target="</s>",
replacement="<image>" * num_image_tokens + "</s>",
replacement=PromptReplacementDetails(
full="<image>" * num_image_tokens + "</s>",
features="<image>" * num_image_tokens,
),
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
# so we ignore the trailing bos_token
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
info=Blip2ProcessingInfo,
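The apply() override deleted above shrank each PlaceholderRange by one token so the trailing "</s>" would not count as part of the image placeholder. PromptReplacementDetails states that intent where the replacement is defined: full is what gets inserted into the prompt, while features is the sub-span reported as the placeholder. A minimal sketch, assuming a toy count of three image tokens (the real number comes from the BLIP-2 configuration):

from vllm.multimodal.processing import PromptReplacement, PromptReplacementDetails

num_image_tokens = 3  # toy value for illustration only

prompt_repl = PromptReplacement(
    modality="image",
    target="</s>",
    replacement=PromptReplacementDetails(
        # Inserted into the prompt: the image tokens followed by "</s>".
        full="<image>" * num_image_tokens + "</s>",
        # Reported as the placeholder range: the image tokens only, which is
        # why the manual PlaceholderRange trimming is no longer needed.
        features="<image>" * num_image_tokens,
    ),
)
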
vllm/model_executor/models/chameleon.py (42 changes: 13 additions & 29 deletions)
@@ -28,12 +28,12 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -141,39 +141,23 @@ def _get_prompt_replacements(
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_tokens = processor.image_token * self.info.get_num_image_tokens()

return [
PromptReplacement(
modality="image",
target="<image>",
replacement="".join([
processor.image_start_token,
processor.image_token * self.info.get_num_image_tokens(),
processor.image_end_token,
]),
replacement=PromptReplacementDetails(
full="".join([
processor.image_start_token,
image_tokens,
processor.image_end_token,
]),
features=image_tokens,
),
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
# so we ignore the image_start_token and image_end_token
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"] + 1,
length=p["length"] - 2) for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


class ChameleonLayerNorm(nn.LayerNorm):

vllm/model_executor/models/fuyu.py (38 changes: 11 additions & 27 deletions)
@@ -16,7 +16,7 @@
""" PyTorch Fuyu model."""
import math
from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
TypedDict, Union)
TypedDict)

import torch
import torch.nn as nn
@@ -30,13 +30,13 @@
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -215,9 +215,13 @@ def get_replacement_fuyu(item_idx: int):
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows

return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
[bos_token_id])
return PromptReplacementDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
)

return [
PromptReplacement(
@@ -227,26 +231,6 @@
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only |SPEAKER| (image) tokens should be considered as placeholders,
# so we ignore the trailing bos_token_id
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
info=FuyuProcessingInfo,
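As the Fuyu change shows, the full/features split also works with lists of token IDs rather than strings. A self-contained sketch with made-up IDs and grid sizes (the real values depend on the Fuyu tokenizer and the image resolution):

from vllm.multimodal.processing import PromptReplacementDetails

# Made-up values for illustration only; these are not the real Fuyu token IDs.
IMAGE_TOKEN_ID = 9001
NEWLINE_TOKEN_ID = 9002
BOS_TOKEN_ID = 1
ncols, nrows = 4, 2

image_tokens = ([IMAGE_TOKEN_ID] * ncols + [NEWLINE_TOKEN_ID]) * nrows

# Only the image/newline grid is reported as the placeholder; the trailing BOS
# token is still inserted into the prompt but excluded from the reported range.
details = PromptReplacementDetails(
    full=image_tokens + [BOS_TOKEN_ID],
    features=image_tokens,
)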