[VLM] Simplify post-processing of replacement info (vllm-project#12269)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Bowen Wang <abmfy@icloud.com>
DarkLight1337 authored and abmfy committed Jan 24, 2025
1 parent 6de12e5 commit e8f3d84
Showing 10 changed files with 175 additions and 208 deletions.
2 changes: 1 addition & 1 deletion tests/models/multimodal/processing/test_common.py
@@ -35,7 +35,7 @@ def _test_processing_correctness(
task="auto",
tokenizer=model_id,
tokenizer_mode="auto",
trust_remote_code=True,
trust_remote_code=model_info.trust_remote_code,
seed=0,
dtype="float16",
revision=None,
3 changes: 2 additions & 1 deletion tests/models/registry.py
@@ -261,7 +261,8 @@ def check_available_online(
trust_remote_code=True),
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3",
trust_remote_code=True),
# [Encoder-decoder]
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
42 changes: 23 additions & 19 deletions tests/multimodal/test_processing.py
@@ -7,12 +7,16 @@

from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement,
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptReplacement,
find_mm_placeholders,
find_text_matches, find_token_matches,
iter_token_matches,
replace_text_matches,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -433,19 +437,19 @@ def test_find_replace_tokens(
[1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=6,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_4": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_4",
item_idx=0,
start_idx=3,
replacement=[32000],
tokens=[32000],
),
],
}
@@ -455,25 +459,25 @@
[1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=5,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_3": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_3",
item_idx=0,
start_idx=7,
replacement=[1550, 918, 1550],
tokens=[1550, 918, 1550],
),
],
# No match for pattern_4 as it has lower priority than pattern_1
@@ -483,33 +487,33 @@
[1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
{
"pattern_1": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=0,
start_idx=1,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_1",
item_idx=1,
start_idx=3,
replacement=[32000, 32000],
tokens=[32000, 32000],
),
],
"pattern_4": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_4",
item_idx=0,
start_idx=5,
replacement=[32000],
tokens=[32000],
),
],
"pattern_3": [
PlaceholderInfo(
PlaceholderFeaturesInfo(
modality="pattern_3",
item_idx=0,
start_idx=6,
replacement=[1550, 918, 1550],
tokens=[1550, 918, 1550],
),
],
}
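The tests above exercise the renamed dataclass: PlaceholderFeaturesInfo records the placeholder's feature tokens directly rather than the whole replacement. A minimal sketch of constructing one, using only the fields that appear in the tests (the modality name and token values here are made up for illustration):

from vllm.multimodal.processing import PlaceholderFeaturesInfo

# Hypothetical placeholder: the first image item's feature tokens
# [32000, 32000] begin at index 1 of the tokenized prompt.
placeholder = PlaceholderFeaturesInfo(
    modality="image",
    item_idx=0,
    start_idx=1,
    tokens=[32000, 32000],
)

# The span covered by the placeholder follows from start_idx and the tokens.
print(placeholder.start_idx, len(placeholder.tokens))  # -> 1 2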
10 changes: 2 additions & 8 deletions vllm/model_executor/models/aria.py
@@ -342,13 +342,7 @@ def get_vision_config(self):
return self.get_hf_config().vision_config

def get_hf_processor(self):
processor = self.ctx.get_hf_processor(AriaProcessor)

# Patch for https://github.com/huggingface/transformers/issues/35768
processor.tokenizer.image_token = "<|img|>"
processor.image_token = "<|img|>"

return processor
return self.ctx.get_hf_processor(AriaProcessor)

def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
@@ -381,7 +375,7 @@ def get_dummy_processor_inputs(
}

hf_processor = self.info.get_hf_processor()
image_token: str = hf_processor.image_token # type: ignore
image_token: str = hf_processor.tokenizer.image_token # type: ignore

return ProcessorInputs(
prompt_text=image_token * num_images,
33 changes: 8 additions & 25 deletions vllm/model_executor/models/blip2.py
@@ -14,12 +14,12 @@
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -481,30 +481,13 @@ def _get_prompt_replacements(
PromptReplacement(
modality="image",
target="</s>",
replacement="<image>" * num_image_tokens + "</s>",
replacement=PromptReplacementDetails(
full="<image>" * num_image_tokens + "</s>",
features="<image>" * num_image_tokens,
),
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
# so we ignore the trailing bos_token
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor,
info=Blip2ProcessingInfo,
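The BLIP-2 hunk above captures the core of this commit: instead of overriding apply() to shrink each PlaceholderRange after the fact, the prompt replacement itself now declares which portion counts as feature tokens. A minimal standalone sketch of the pattern, with the token strings taken from the hunk and the token count chosen arbitrarily for illustration:

from vllm.multimodal.processing import (PromptReplacement,
                                        PromptReplacementDetails)

num_image_tokens = 32  # illustrative; the real value comes from the model config

# "full" is the text inserted into the prompt; "features" is the subset that
# should be reported as image placeholders, so the trailing "</s>" is excluded
# without any post-processing of mm_placeholders.
repl = PromptReplacement(
    modality="image",
    target="</s>",
    replacement=PromptReplacementDetails(
        full="<image>" * num_image_tokens + "</s>",
        features="<image>" * num_image_tokens,
    ),
)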
42 changes: 13 additions & 29 deletions vllm/model_executor/models/chameleon.py
@@ -28,12 +28,12 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -141,39 +141,23 @@ def _get_prompt_replacements(
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_tokens = processor.image_token * self.info.get_num_image_tokens()

return [
PromptReplacement(
modality="image",
target="<image>",
replacement="".join([
processor.image_start_token,
processor.image_token * self.info.get_num_image_tokens(),
processor.image_end_token,
]),
replacement=PromptReplacementDetails(
full="".join([
processor.image_start_token,
image_tokens,
processor.image_end_token,
]),
features=image_tokens,
),
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only <image> tokens should be considered as placeholders,
# so we ignore the image_start_token and image_end_token
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"] + 1,
length=p["length"] - 2) for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


class ChameleonLayerNorm(nn.LayerNorm):

38 changes: 11 additions & 27 deletions vllm/model_executor/models/fuyu.py
@@ -16,7 +16,7 @@
""" PyTorch Fuyu model."""
import math
from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple,
TypedDict, Union)
TypedDict)

import torch
import torch.nn as nn
@@ -30,13 +30,13 @@
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs,
NestedTensors, PlaceholderRange)
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement)
BaseProcessingInfo, PromptReplacement,
PromptReplacementDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

@@ -215,9 +215,13 @@ def get_replacement_fuyu(item_idx: int):
image_width=image_size.width,
image_height=image_size.height,
)
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows

return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows +
[bos_token_id])
return PromptReplacementDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
)

return [
PromptReplacement(
@@ -227,26 +231,6 @@ def get_replacement_fuyu(item_idx: int):
)
]

def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalInputs:
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

# Only |SPEAKER| (image) tokens should be considered as placeholders,
# so we ignore the trailing bos_token_id
result["mm_placeholders"] = {
modality: [
PlaceholderRange(offset=p["offset"], length=p["length"] - 1)
for p in ps
]
for modality, ps in result["mm_placeholders"].items()
}

return result


@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor,
info=FuyuProcessingInfo,
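Fuyu follows the same split but with token IDs rather than strings: the ncols-by-nrows grid of image and newline tokens is the feature region, while the trailing BOS token belongs only to the full replacement. A sketch with illustrative values standing in for the resolved image size and the module-level token-ID constants:

from vllm.multimodal.processing import PromptReplacementDetails

# Stand-in values; in fuyu.py these come from the image size and the
# _IMAGE_TOKEN_ID / _NEWLINE_TOKEN_ID / BOS constants.
image_token_id = 71011
newline_token_id = 71019
bos_token_id = 1
ncols, nrows = 4, 2

image_tokens = ([image_token_id] * ncols + [newline_token_id]) * nrows

details = PromptReplacementDetails(
    full=image_tokens + [bos_token_id],  # what actually replaces the target
    features=image_tokens,               # what is reported as image placeholders
)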