From 8e12a4d9a4f8779d45378b0b3d5a6b6714bffafa Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 15 Jan 2025 21:09:26 -0800
Subject: [PATCH 1/5] fix

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 78a47e64d9afc..68d58c612b9d6 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -37,6 +37,7 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
+_MAX_IMAGE_SIZE = 8686
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -101,6 +102,18 @@ def get_hf_processor(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_image_tokens(self) -> int:
+
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        # FIXME: This is in fact not accurate and we compare with a known
+        # max image size.
+        return max(
+            self.get_num_image_tokens(
+                image_width=target_width,
+                image_height=target_height,
+            ), _MAX_IMAGE_SIZE)
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),

From c53d6961f79d73da5f6d545458bd31ad6ca25b41 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 15 Jan 2025 21:24:13 -0800
Subject: [PATCH 2/5] fix

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 68d58c612b9d6..7b381cd493911 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -37,7 +37,7 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
-_MAX_IMAGE_SIZE = 8686
+_MAX_IMAGE_SIZE_PLACEHOLDER = 12288
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -106,13 +106,12 @@ def get_max_image_tokens(self) -> int:
 
         target_width, target_height = self.get_image_size_with_most_features()
 
-        # FIXME: This is in fact not accurate and we compare with a known
-        # max image size.
+        # FIXME: This is in fact not accurate and we compare with a placeholder. 
         return max(
             self.get_num_image_tokens(
                 image_width=target_width,
                 image_height=target_height,
-            ), _MAX_IMAGE_SIZE)
+            ), _MAX_IMAGE_SIZE_PLACEHOLDER)
 
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),

From 53f97eed1703cbf17b07bb718dbdc51d2ba44d92 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:21:54 +0000
Subject: [PATCH 3/5] Fix and add tests

Signed-off-by: DarkLight1337
---
 .../multimodal/processing/test_llava_next.py  | 61 ++++++++++++++++++
 .../processing/test_llava_onevision.py        | 62 +++++++++++++++++++
 vllm/model_executor/models/llava_onevision.py | 19 ++----
 3 files changed, 128 insertions(+), 14 deletions(-)

diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index 1eec35d9c3c72..efdaaf624ff4c 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -13,6 +13,67 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+    
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 2
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index 94ea604c58b43..f5d5dd7597f06 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -13,6 +13,68 @@
 from ...utils import build_model_context
 
 
+def _validate_image_max_tokens_one(
+    processor: BaseMultiModalProcessor,
+    max_tokens: int,
+    failed_size_excs: list[tuple[ImageSize, Exception]],
+    image_size: ImageSize,
+) -> None:
+    info = processor.info
+    feature_size = info.get_num_image_tokens(image_width=image_size.width,
+                                             image_height=image_size.height)
+
+    try:
+        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
+    except Exception as exc:
+        failed_size_excs.append((image_size, exc))
+    
+
+@pytest.mark.skip("This test takes around 5 minutes to run. "
+                  "Comment this out to run it manually.")
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_processor_max_tokens(model_id):
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
+    )
+    info = processor.info
+
+    seen_aspect_ratios = set[float]()
+    image_sizes = list[ImageSize]()
+
+    # The aspect ratio of the grid layout is between 1 and 6
+    # NOTE: Assumes that feature size calculation is the same if we
+    # swap the width and height of the image
+    for w, h in itertools.product(range(32, 4096), repeat=2):
+        aspect_ratio = w / h
+        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
+            image_sizes.append(ImageSize(w, h))
+            seen_aspect_ratios.add(aspect_ratio)
+    
+    failed_size_excs = list[tuple[ImageSize, Exception]]()
+
+    validate_one = partial(
+        _validate_image_max_tokens_one,
+        processor,
+        info.get_max_image_tokens(),  # type: ignore
+        failed_size_excs,
+    )
+    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
+
+    if failed_size_excs:
+        msg = "Found failing image sizes:" \
+            + "\n========\n".join(f"[{size}]\n{exc}"
+                                  for size, exc in failed_size_excs)
+        raise AssertionError(msg)
+
+
 def _validate_image_prompt_replacements_one(
     processor: BaseMultiModalProcessor,
     num_imgs: int,
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 7b381cd493911..128c040363a4c 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -19,8 +19,8 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
-                                   VideoProcessorItems)
+from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
+                                   VideoEmbeddingItems, VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -37,7 +37,6 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
-_MAX_IMAGE_SIZE_PLACEHOLDER = 12288
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -102,17 +101,6 @@ def get_hf_processor(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
-    def get_max_image_tokens(self) -> int:
-
-        target_width, target_height = self.get_image_size_with_most_features()
-
-        # FIXME: This is in fact not accurate and we compare with a placeholder. 
-        return max(
-            self.get_num_image_tokens(
-                image_width=target_width,
-                image_height=target_height,
-            ), _MAX_IMAGE_SIZE_PLACEHOLDER)
-
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),
@@ -157,6 +145,9 @@ def _get_num_unpadded_features(
 
         return (unpadded_features, newline_features)
 
+    def get_image_size_with_most_features(self) -> ImageSize:
+        return ImageSize(width=1153, height=944)
+
     def _get_num_frame_tokens(
         self,
         *,

From 7fbe17bda61834c9897ed0f5e46b337b92894ee1 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:23:00 +0000
Subject: [PATCH 4/5] Add note

Signed-off-by: DarkLight1337
---
 vllm/model_executor/models/llava_onevision.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 128c040363a4c..c9283e0c5ba20 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -146,6 +146,7 @@ def _get_num_unpadded_features(
         return (unpadded_features, newline_features)
 
     def get_image_size_with_most_features(self) -> ImageSize:
+        # NOTE: This hardcoded value is found via processor tests
        return ImageSize(width=1153, height=944)
 
     def _get_num_frame_tokens(

From bd712ea75a650689408fe86d6c76dbbfb755cdb5 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Thu, 16 Jan 2025 12:27:26 +0000
Subject: [PATCH 5/5] Format

Signed-off-by: DarkLight1337
---
 tests/models/multimodal/processing/test_llava_next.py      | 4 ++--
 tests/models/multimodal/processing/test_llava_onevision.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py
index efdaaf624ff4c..6de649f87204d 100644
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -27,7 +27,7 @@ def _validate_image_max_tokens_one(
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
-    
+
 
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                   "Comment this out to run it manually.")
@@ -56,7 +56,7 @@ def test_processor_max_tokens(model_id):
         if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
             image_sizes.append(ImageSize(w, h))
             seen_aspect_ratios.add(aspect_ratio)
-    
+
     failed_size_excs = list[tuple[ImageSize, Exception]]()
 
     validate_one = partial(
diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py
index f5d5dd7597f06..806437d35ec87 100644
--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -27,7 +27,7 @@ def _validate_image_max_tokens_one(
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
     except Exception as exc:
         failed_size_excs.append((image_size, exc))
-    
+
 
 @pytest.mark.skip("This test takes around 5 minutes to run. "
                   "Comment this out to run it manually.")
@@ -57,7 +57,7 @@ def test_processor_max_tokens(model_id):
         if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
             image_sizes.append(ImageSize(w, h))
             seen_aspect_ratios.add(aspect_ratio)
-    
+
     failed_size_excs = list[tuple[ImageSize, Exception]]()
 
     validate_one = partial(
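
The hardcoded ImageSize(width=1153, height=944) introduced in PATCH 3/5 is the output of the same kind of exhaustive scan the skipped test_processor_max_tokens performs. Below is a minimal standalone sketch of that search, assuming the vLLM helpers used in the tests above (MULTIMODAL_REGISTRY, cached_get_tokenizer) and the test suite's build_model_context; the 32-pixel stride is an added assumption to keep the scan fast, so it only approximates rather than exactly reproduces the worst-case size.

# Standalone sketch (not part of the patches above): brute-force the
# feature-maximizing image size for LLaVA-OneVision.
import itertools

from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.utils import cached_get_tokenizer

# Helper from the vLLM test suite; path mirrors the tests' relative import.
from tests.models.utils import build_model_context

model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
ctx = build_model_context(
    model_name=model_id,
    tokenizer_name=model_id,
    mm_processor_kwargs=None,
    limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(
    ctx.model_config,
    tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
)
info = processor.info

best_size, best_tokens = None, -1
# Assumption carried over from the tests' NOTE: feature size is symmetric
# under a width/height swap, so only w >= h needs to be scanned.
for w, h in itertools.product(range(32, 4096, 32), repeat=2):
    if w < h:
        continue
    num_tokens = info.get_num_image_tokens(image_width=w, image_height=h)
    if num_tokens > best_tokens:
        best_size, best_tokens = ImageSize(width=w, height=h), num_tokens

print(best_size, best_tokens)

Rerunning a scan like this is how the hardcoded value would need to be revalidated if the processor's grid logic ever changes; the skipped tests exist so that CI can do the cheaper direction of the same check (every size stays at or below the reported maximum).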