[Bugfix] Qwen2.5_VL fix from Qwen Team #2

Merged · 5 commits · Feb 3, 2025
59 changes: 59 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
)


def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
try:
from qwen_vl_utils import process_vision_info
except ModuleNotFoundError:
print('WARNING: `qwen-vl-utils` not installed, input images will not '
'be automatically resized. You can enable this functionality by '
'`pip install qwen-vl-utils`.')
process_vision_info = None

model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

llm = LLM(
model=model_name,
max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)

placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]

processor = AutoProcessor.from_pretrained(model_name)

prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)

stop_token_ids = None

if process_vision_info is None:
image_data = [fetch_image(url) for url in image_urls]
else:
image_data, _ = process_vision_info(messages,
return_video_sample_fps=False)

return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=image_data,
chat_template=None,
)


model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
@@ -404,6 +461,8 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
"pixtral_hf": load_pixtral_hf,
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,

}


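Not part of the diff: a minimal sketch of how the new `load_qwen2_5_vl` loader could be driven, assuming the example script's existing `ModelRequestData` fields and vLLM's prompt-dict `generate` API; the question and image URLs are placeholders.

```python
from vllm import SamplingParams

# Hypothetical inputs for illustration only.
QUESTION = "What is the content of each image?"
IMAGE_URLS = [
    "https://example.com/duck.jpg",
    "https://example.com/lion.jpg",
]

# load_qwen2_5_vl is the function added above in this same example script.
req_data = load_qwen2_5_vl(QUESTION, IMAGE_URLS)

sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=req_data.stop_token_ids)

# Pass the rendered chat prompt together with the (optionally resized) images.
outputs = req_data.llm.generate(
    {
        "prompt": req_data.prompt,
        "multi_modal_data": {
            "image": req_data.image_data
        },
    },
    sampling_params=sampling_params)

for o in outputs:
    print(o.outputs[0].text)
```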
42 changes: 19 additions & 23 deletions vllm/model_executor/layers/rotary_embedding.py
@@ -773,8 +773,12 @@ def __init__(
dtype: torch.dtype,
mrope_section: Optional[List[int]] = None,
) -> None:
super().__init__(head_size, rotary_dim, max_position_embeddings, base,
is_neox_style, dtype)
        # In Qwen2.5-VL, the maximum index value is related to the duration of
        # the input video. We enlarge max_position_embeddings by a factor of 4
        # to get a larger cos and sin cache.
self.cache_max_position_num = max_position_embeddings * 4
super().__init__(head_size, rotary_dim, self.cache_max_position_num,
base, is_neox_style, dtype)

self.mrope_section = mrope_section
if self.mrope_section:
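A short note on the cache sizing above (illustrative numbers only, none read from an actual Qwen2.5-VL config): under MRoPE, a video token's temporal index scales with the clip's duration in seconds rather than with its offset in the token sequence, so for a long video it can exceed max_position_embeddings even when the text itself fits.

```python
# Assumed, illustrative values only; not taken from any model config.
max_position_embeddings = 32768
video_tokens_per_second = 2     # plays the role of vision_config.tokens_per_second
video_second_per_grid_t = 2.0   # seconds of video covered by one temporal grid step
llm_grid_t = 10000              # temporal grid length of a very long video

# Largest temporal MRoPE index such a video produces (see the t_index
# computation later in this diff).
max_t_index = int((llm_grid_t - 1) * video_second_per_grid_t *
                  video_tokens_per_second)

print(max_t_index > max_position_embeddings)      # True: would overflow the old cache
print(max_t_index < max_position_embeddings * 4)  # True: fits the enlarged cache
```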
@@ -835,7 +839,7 @@ def get_input_positions(
hf_config: PretrainedConfig,
image_grid_thw: Union[List[List[int]], torch.Tensor],
video_grid_thw: Union[List[List[int]], torch.Tensor],
second_per_grid_ts: Optional[List[float]] = None,
video_second_per_grid_ts: Optional[List[float]] = None,
context_len: int = 0,
seq_len: Optional[int] = None,
) -> Tuple[List[List[int]], int]:
@@ -847,7 +851,7 @@ def get_input_positions(
hf_config=hf_config,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
video_second_per_grid_ts=video_second_per_grid_ts,
context_len=context_len,
seq_len=seq_len,
)
@@ -860,7 +864,7 @@ def get_input_positions_tensor(
hf_config: PretrainedConfig,
image_grid_thw: Union[List[List[int]], torch.Tensor],
video_grid_thw: Union[List[List[int]], torch.Tensor],
second_per_grid_ts: Optional[List[float]] = None,
video_second_per_grid_ts: Optional[List[float]] = None,
Comment on lines -863 to +867
Owner
Is it okay if we keep it as second_per_grid_ts? Generally speaking, I think it's better to use the same names as the outputs of the Processor class unless there's a strong reason to use a different one. What do you think?

Author
The term second_per_grid_ts is a video-related parameter, so using video_second_per_grid_ts would be more appropriate from that perspective. However, if you need to maintain consistency with transformers, that is also acceptable.

Owner
Sounds good! I'll merge this PR and rename it afterwards!

context_len: int = 0,
seq_len: Optional[int] = None,
) -> Tuple[torch.Tensor, int]:
@@ -870,8 +874,8 @@ def get_input_positions_tensor(
video_token_id = hf_config.video_token_id
vision_start_token_id = hf_config.vision_start_token_id
spatial_merge_size = hf_config.vision_config.spatial_merge_size
tokens_per_second = getattr(hf_config.vision_config,
"tokens_per_second", None)
video_tokens_per_second = getattr(hf_config.vision_config,
"tokens_per_second", 1.0)
Comment on lines -873 to +878
Owner
Ditto


if isinstance(image_grid_thw, torch.Tensor):
image_grid_thw = image_grid_thw.tolist()
@@ -891,6 +895,7 @@ def get_input_positions_tensor(

image_index, video_index = 0, 0
for _ in range(image_nums + video_nums):
video_second_per_grid_t = 0.0
if image_token_id in input_tokens and remain_images > 0:
ed_image = input_tokens.index(image_token_id, st)
else:
@@ -905,7 +910,6 @@ def get_input_positions_tensor(
image_grid_thw[image_index][1],
image_grid_thw[image_index][2],
)
second_per_grid_t = 0.0
image_index += 1
remain_images -= 1
ed = ed_image
@@ -915,10 +919,10 @@ def get_input_positions_tensor(
video_grid_thw[video_index][1],
video_grid_thw[video_index][2],
)
if second_per_grid_ts:
second_per_grid_t = second_per_grid_ts[video_index]
else:
second_per_grid_t = 1.0
video_second_per_grid_t = 1.0
if video_second_per_grid_ts is not None:
video_second_per_grid_t = video_second_per_grid_ts[
video_index]
video_index += 1
remain_videos -= 1
ed = ed_video
@@ -932,17 +936,9 @@ def get_input_positions_tensor(
llm_pos_ids_list.append(
torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

if tokens_per_second is not None:
range_tensor = torch.arange(llm_grid_t).view(-1, 1)
expanded_range = range_tensor.expand(-1,
llm_grid_h * llm_grid_w)
time_tensor = expanded_range * second_per_grid_t * \
tokens_per_second
time_tensor_long = time_tensor.long()
t_index = time_tensor_long.flatten()
else:
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(
-1, llm_grid_h * llm_grid_w).flatten()
t_index = (torch.arange(llm_grid_t).view(-1, 1).expand(
-1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
video_tokens_per_second).long().flatten()

h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(
llm_grid_t, -1, llm_grid_w).flatten()
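To make the change to the temporal index concrete, here is a standalone sketch of the now-unified t_index expression. The grid sizes and timing values are made up for illustration, and t_index_for_grid is just a helper for this sketch, not part of the diff.

```python
import torch


def t_index_for_grid(llm_grid_t: int, llm_grid_h: int, llm_grid_w: int,
                     video_second_per_grid_t: float,
                     video_tokens_per_second: float) -> torch.Tensor:
    """Same expression as the unified branch in get_input_positions_tensor."""
    return (torch.arange(llm_grid_t).view(-1, 1).expand(
        -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t *
            video_tokens_per_second).long().flatten()


# A hypothetical video grid: 4 temporal steps, 6x8 spatial grid after merging,
# each temporal step spanning 2 s of video, tokens_per_second = 2.
video_t = t_index_for_grid(4, 6, 8,
                           video_second_per_grid_t=2.0,
                           video_tokens_per_second=2)
print(video_t.view(4, -1)[:, 0])  # tensor([ 0,  4,  8, 12]) -- scales with real time

# An image grid (t = 1) keeps the 0.0 default for video_second_per_grid_t,
# so its temporal index stays at 0.
image_t = t_index_for_grid(1, 6, 8,
                           video_second_per_grid_t=0.0,
                           video_tokens_per_second=2)
print(image_t.unique())  # tensor([0])
```

So image grids keep a temporal index of 0, while video grids advance in proportion to real elapsed time, which is exactly what the enlarged cos/sin cache in __init__ has to accommodate.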