[Bugfix] Qwen2.5_VL fix from Qwen Team #2

Merged · 5 commits · Feb 3, 2025
59 changes: 59 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)


def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None

model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

llm = LLM(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)

placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]

processor = AutoProcessor.from_pretrained(model_name)

prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)

stop_token_ids = None

if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages,
return_video_sample_fps=False)

return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)


model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
@@ -404,6 +461,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,

}


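Not part of the diff: a minimal sketch of how the new `load_qwen2_5_vl` loader could be driven, assuming the example script's existing `ModelRequestData` fields and vLLM's prompt-dict `generate` API; the question and image URLs are placeholders.

```python
from vllm import SamplingParams

# Hypothetical inputs for illustration only.
QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://example.com/duck.jpg",
    "https://example.com/lion.jpg",
]

# load_qwen2_5_vl is the function added above in this same example script.
req_data = load_qwen2_5_vl(QUESTION, IMAGE_URLS)

sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=req_data.stop_token_ids)

# Pass the rendered chat prompt together with the (optionally resized) images.
outputs = req_data.llm.generate(
    {
        "prompt": req_data.prompt,
        "multi_modal_data": {
            "image": req_data.image_data
        },
    },
    sampling_params=sampling_params)

for o in outputs:
    print(o.outputs[0].text)
```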
42 changes: 19 additions & 23 deletions vllm/model_executor/layers/rotary_embedding.py
@@ -773,8 +773,12 @@ def __init__(
dtype: torch.dtype,
mrope_section: Optional[List[int]] = None,
) -> None:
super().__init__(head_size, rotary_dim, max_position_embeddings, base,
is_neox_style, dtype)
        # In Qwen2.5-VL, the maximum index value is related to the duration of
        # the input video. We enlarge max_position_embeddings by a factor of 4
        # to get a larger cos and sin cache.
self.cache_max_position_num = max_position_embeddings * 4
super().__init__(head_size, rotary_dim, self.cache_max_position_num,
base, is_neox_style, dtype)

self.mrope_section = mrope_section
if self.mrope_section:
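A short note on the cache sizing above (illustrative numbers only, none read from an actual Qwen2.5-VL config): under MRoPE, a video token's temporal index scales with the clip's duration in seconds rather than with its offset in the token sequence, so for a long video it can exceed max_position_embeddings even when the text itself fits.

```python
# Assumed, illustrative values only; not taken from any model config.
max_position_embeddings = 32768
video_tokens_per_second = 2     # plays the role of vision_config.tokens_per_second
video_second_per_grid_t = 2.0   # seconds of video covered by one temporal grid step
llm_grid_t = 10000              # temporal grid length of a very long video

# Largest temporal MRoPE index such a video produces (see the t_index
# computation later in this diff).
max_t_index = int((llm_grid_t - 1) * video_second_per_grid_t *
                  video_tokens_per_second)

print(max_t_index > max_position_embeddings)      # True: would overflow the old cache
print(max_t_index < max_position_embeddings * 4)  # True: fits the enlarged cache
```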
@@ -835,7 +839,7 @@ def get_input_positions(
hf_config: PretrainedConfig,
image_grid_thw: Union[List[List[int]], torch.Tensor],
video_grid_thw: Union[List[List[int]], torch.Tensor],
second_per_grid_ts: Optional[List[float]] = None,
video_second_per_grid_ts: Optional[List[float]] = None,
context_len: int = 0,
seq_len: Optional[int] = None,
) -> Tuple[List[List[int]], int]:
@@ -847,7 +851,7 @@ def get_input_positions(
hf_config=hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
video_second_per_grid_ts=video_second_per_grid_ts,
context_len=context_len,
seq_len=seq_len,
)
@@ -860,7 +864,7 @@ def get_input_positions_tensor(
hf_config: PretrainedConfig,
image_grid_thw: Union[List[List[int]], torch.Tensor],
video_grid_thw: Union[List[List[int]], torch.Tensor],
second_per_grid_ts: Optional[List[float]] = None,
video_second_per_grid_ts: Optional[List[float]] = None,
Comment on lines -863 to +867
Owner
Is it okay if we keep it as second_per_grid_ts? Generally speaking, I think it's better to use the same names as the outputs of the Processor class unless there's a strong reason to use a different one. What do you think?

Author
The term second_per_grid_ts is a video-related parameter, so using video_second_per_grid_ts would be more appropriate from that perspective. However, if you need to maintain consistency with transformers, that is also acceptable.

Owner
Sounds good! I'll merge this PR and rename it afterwards!

context_len: int = 0,
seq_len: Optional[int] = None,
) -> Tuple[torch.Tensor, int]:
@@ -870,8 +874,8 @@ def get_input_positions_tensor(
video_token_id = hf_config.video_token_id
vision_start_token_id = hf_config.vision_start_token_id
spatial_merge_size = hf_config.vision_config.spatial_merge_size
tokens_per_second = getattr(hf_config.vision_config,
"tokens_per_second", None)
video_tokens_per_second = getattr(hf_config.vision_config,
"tokens_per_second", 1.0)
Comment on lines -873 to +878
Owner
Ditto


if isinstance(image_grid_thw, torch.Tensor):
image_grid_thw = image_grid_thw.tolist()
@@ -891,6 +895,7 @@ def get_input_positions_tensor(

image_index, video_index = 0, 0
for _ in range(image_nums + video_nums):
video_second_per_grid_t = 0.0
if image_token_id in input_tokens and remain_images > 0:
ed_image = input_tokens.index(image_token_id, st)
else:
@@ -905,7 +910,6 @@ def get_input_positions_tensor(
image_grid_thw[image_index][1],
image_grid_thw[image_index][2],
)
second_per_grid_t = 0.0
image_index += 1
remain_images -= 1
ed = ed_image
@@ -915,10 +919,10 @@ def get_input_positions_tensor(
video_grid_thw[video_index][1],
video_grid_thw[video_index][2],
)
if second_per_grid_ts:
second_per_grid_t = second_per_grid_ts[video_index]
else:
second_per_grid_t = 1.0
video_second_per_grid_t = 1.0
if video_second_per_grid_ts is not None:
video_second_per_grid_t = video_second_per_grid_ts[
video_index]
video_index += 1
remain_videos -= 1
ed = ed_video
@@ -932,17 +936,9 @@ def get_input_positions_tensor(
llm_pos_ids_list.append(
torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

if tokens_per_second is not None:
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
expanded_range = range_tensor.expand(-1,
llm_grid_h * llm_grid_w)
time_tensor = expanded_range * second_per_grid_t * \
tokens_per_second
time_tensor_long = time_tensor.long()
t_index = time_tensor_long.flatten()
else:
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-1, llm_grid_h * llm_grid_w).flatten()
t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
-1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
video_tokens_per_second).long().flatten()

h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
llm_grid_t, -1, llm_grid_w).flatten()
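To make the change to the temporal index concrete, here is a standalone sketch of the now-unified t_index expression. The grid sizes and timing values are made up for illustration, and t_index_for_grid is just a helper for this sketch, not part of the diff.

```python
import torch


def t_index_for_grid(llm_grid_t: int, llm_grid_h: int, llm_grid_w: int,
                     video_second_per_grid_t: float,
                     video_tokens_per_second: float) -> torch.Tensor:
    """Same expression as the unified branch in get_input_positions_tensor."""
    return (torch.arange(llm_grid_t).view(-1, 1).expand(
        -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
            video_tokens_per_second).long().flatten()


# A hypothetical video grid: 4 temporal steps, 6x8 spatial grid after merging,
# each temporal step spanning 2 s of video, tokens_per_second = 2.
video_t = t_index_for_grid(4, 6, 8,
                           video_second_per_grid_t=2.0,
                           video_tokens_per_second=2)
print(video_t.view(4, -1)[:, 0])  # tensor([ 0,  4,  8, 12]) -- scales with real time

# An image grid (t = 1) keeps the 0.0 default for video_second_per_grid_t,
# so its temporal index stays at 0.
image_t = t_index_for_grid(1, 6, 8,
                           video_second_per_grid_t=0.0,
                           video_tokens_per_second=2)
print(image_t.unique())  # tensor([0])
```

So image grids keep a temporal index of 0, while video grids advance in proportion to real elapsed time, which is exactly what the enlarged cos/sin cache in __init__ has to accommodate.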