From 8e12a4d9a4f8779d45378b0b3d5a6b6714bffafa Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 15 Jan 2025 21:09:26 -0800
Subject: [PATCH 1/5] fix

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 78a47e64d9afc..68d58c612b9d6 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -37,6 +37,7 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
+_MAX_IMAGE_SIZE = 8686
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -101,6 +102,18 @@ def get_hf_processor(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_image_tokens(self) -> int:
+
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        # FIXME: This is in fact not accurate and we compare with a known
+        # max image size.
+        return max(
+            self.get_num_image_tokens(
+                image_width=target_width,
+                image_height=target_height,
+            ), _MAX_IMAGE_SIZE)
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),

From c53d6961f79d73da5f6d545458bd31ad6ca25b41 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 15 Jan 2025 21:24:13 -0800
Subject: [PATCH 2/5] fix

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 68d58c612b9d6..7b381cd493911 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -37,7 +37,7 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
-_MAX_IMAGE_SIZE = 8686
+_MAX_IMAGE_SIZE_PLACEHOLDER = 12288
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -106,13 +106,12 @@ def get_max_image_tokens(self) -> int:
 
         target_width, target_height = self.get_image_size_with_most_features()
 
-        # FIXME: This is in fact not accurate and we compare with a known
-        # max image size.
+        # FIXME: This is in fact not accurate and we compare with a placeholder. 
         return max(
             self.get_num_image_tokens(
                 image_width=target_width,
                 image_height=target_height,
-            ), _MAX_IMAGE_SIZE)
+            ), _MAX_IMAGE_SIZE_PLACEHOLDER)
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),

From 53f97eed1703cbf17b07bb718dbdc51d2ba44d92 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:21:54 +0000
Subject: [PATCH 3/5] Fix and add tests

Signed-off-by: DarkLight1337
---
 .../multimodal/processing/test_llava_next.py  | 61 ++++++++++++++++++
 .../processing/test_llava_onevision.py        | 62 +++++++++++++++++++
 vllm/model_executor/models/llava_onevision.py | 19 ++----
 3 files changed, 128 insertions(+), 14 deletions(-)

diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index 1eec35d9c3c72..efdaaf624ff4c 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -13,6 +13,67 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+    
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 94ea604c58b43..f5d5dd7597f06 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -13,6 +13,68 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+    
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 7b381cd493911..128c040363a4c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -37,7 +37,6 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
-_MAX_IMAGE_SIZE_PLACEHOLDER = 12288
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -102,17 +101,6 @@ def get_hf_processor(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_image_tokens(self) -> int:
-
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        # FIXME: This is in fact not accurate and we compare with a placeholder. 
-        return max(
-            self.get_num_image_tokens(
-                image_width=target_width,
-                image_height=target_height,
-            ), _MAX_IMAGE_SIZE_PLACEHOLDER)
-
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),
@@ -157,6 +145,9 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
+    def get_image_size_with_most_features(self) -> ImageSize:
+        return ImageSize(width=1153, height=944)
+
     def _get_num_frame_tokens(
         self,
         *,

From 7fbe17bda61834c9897ed0f5e46b337b92894ee1 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:23:00 +0000
Subject: [PATCH 4/5] Add note

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/llava_onevision.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 128c040363a4c..c9283e0c5ba20 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -146,6 +146,7 @@ def _get_num_unpadded_features(
         return (unpadded_features, newline_features)
 
     def get_image_size_with_most_features(self) -> ImageSize:
+        # NOTE: This hardcoded value is found via processor tests
        return ImageSize(width=1153, height=944)
 
     def _get_num_frame_tokens(

From bd712ea75a650689408fe86d6c76dbbfb755cdb5 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:27:26 +0000
Subject: [PATCH 5/5] Format

Signed-off-by: DarkLight1337
---
 tests/models/multimodal/processing/test_llava_next.py      | 4 ++--
 tests/models/multimodal/processing/test_llava_onevision.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index efdaaf624ff4c..6de649f87204d 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -27,7 +27,7 @@ def _validate_image_max_tokens_one(
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
-    
+
 
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                   "Comment this out to run it manually.")
@@ -56,7 +56,7 @@ def test_processor_max_tokens(model_id):
         if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
             image_sizes.append(ImageSize(w, h))
             seen_aspect_ratios.add(aspect_ratio)
-    
+
     failed_size_excs = list[tuple[ImageSize, Exception]]()
 
     validate_one = partial(
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index f5d5dd7597f06..806437d35ec87 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -27,7 +27,7 @@ def _validate_image_max_tokens_one(
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
-    
+
 
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                   "Comment this out to run it manually.")
@@ -57,7 +57,7 @@ def test_processor_max_tokens(model_id):
         if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
             image_sizes.append(ImageSize(w, h))
             seen_aspect_ratios.add(aspect_ratio)
-    
+
     failed_size_excs = list[tuple[ImageSize, Exception]]()
 
     validate_one = partial(
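
The hardcoded ImageSize(width=1153, height=944) introduced in PATCH 3/5 is the output of the same kind of exhaustive scan the skipped test_processor_max_tokens performs. Below is a minimal standalone sketch of that search, assuming the vLLM helpers used in the tests above (MULTIMODAL_REGISTRY, cached_get_tokenizer) and the test suite's build_model_context; the 32-pixel stride is an added assumption to keep the scan fast, so it only approximates rather than exactly reproduces the worst-case size.

# Standalone sketch (not part of the patches above): brute-force the
# feature-maximizing image size for LLaVA-OneVision.
import itertools

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.utils import cached_get_tokenizer

# Helper from the vLLM test suite; path mirrors the tests' relative import.
from tests.models.utils import build_model_context

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
ctx = build_model_context(
    model_name=model_id,
    tokenizer_name=model_id,
    mm_processor_kwargs=None,
    limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(
    ctx.model_config,
    tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)
info = processor.info

best_size, best_tokens = None, -1
# Assumption carried over from the tests' NOTE: feature size is symmetric
# under a width/height swap, so only w >= h needs to be scanned.
for w, h in itertools.product(range(32, 4096, 32), repeat=2):
    if w < h:
        continue
    num_tokens = info.get_num_image_tokens(image_width=w, image_height=h)
    if num_tokens > best_tokens:
        best_size, best_tokens = ImageSize(width=w, height=h), num_tokens

print(best_size, best_tokens)

Rerunning a scan like this is how the hardcoded value would need to be revalidated if the processor's grid logic ever changes; the skipped tests exist so that CI can do the cheaper direction of the same check (every size stays at or below the reported maximum).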