[Model][VLM] Add LLaVA-Onevision model support #8486

Merged · 20 commits · Sep 22, 2024
Changes from 13 commits
7 changes: 6 additions & 1 deletion docs/source/models/supported_models.rst
@@ -244,6 +244,11 @@ Multimodal Language Models
- Video
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
-
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- Image\ :sup:`+` / Video
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. (see note)
-
* - :code:`MiniCPMV`
- MiniCPM-V
- Image\ :sup:`+`
@@ -288,7 +293,7 @@ Multimodal Language Models
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

.. note::
For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
For :code:`LLaVA-NeXT-Video`, :code:`LLaVA-Onevision` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
This can be installed by running the following command:

.. code-block:: bash
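For reference, the newly documented model can be exercised with a short offline-inference script. Below is a minimal sketch modeled on the `run_llava_onevision` example added later in this PR; the model name, chat template, and `max_model_len` follow the PR, while the blank test image and sampling settings are illustrative assumptions.

# Minimal sketch: single-image offline inference with LLaVA-Onevision in vLLM.
# The blank PIL image and sampling settings are placeholders, not part of this
# PR; model name, prompt format, and max_model_len mirror the added example.
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
          max_model_len=32768)

question = "What is shown in this image?"
prompt = (f"<|im_start|>user <image>\n{question}<|im_end|>"
          "<|im_start|>assistant\n")

image = Image.new("RGB", (512, 512))  # stand-in for a real input image

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)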
60 changes: 47 additions & 13 deletions examples/offline_inference_vision_language.py
@@ -14,7 +14,8 @@


# LLaVA-1.5
def run_llava(question):
def run_llava(question, modality):
assert modality == "image"

prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
def run_llava_next(question, modality):
assert modality == "image"

prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@@ -34,15 +36,35 @@ def run_llava_next(question):

# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question):
def run_llava_next_video(question, modality):
assert modality == "video"

prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids


# LLaVA-OneVision
def run_llava_onevision(question, modality):

if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"

elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=32768)
stop_token_ids = None
return llm, prompt, stop_token_ids


# Fuyu
def run_fuyu(question):
def run_fuyu(question, modality):
assert modality == "image"

prompt = f"{question}\n"
llm = LLM(model="adept/fuyu-8b")
@@ -51,7 +73,8 @@ def run_fuyu(question):


# Phi-3-Vision
def run_phi3v(question):
def run_phi3v(question, modality):
assert modality == "image"

prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501
# Note: The default setting of max_num_seqs (256) and
@@ -70,7 +93,8 @@ def run_phi3v(question):


# PaliGemma
def run_paligemma(question):
def run_paligemma(question, modality):
assert modality == "image"

# PaliGemma has special prompt format for VQA
prompt = "caption en"
@@ -80,7 +104,8 @@ def run_paligemma(question):


# Chameleon
def run_chameleon(question):
def run_chameleon(question, modality):
assert modality == "image"

prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b")
@@ -89,7 +114,8 @@ def run_chameleon(question):


# MiniCPM-V
def run_minicpmv(question):
def run_minicpmv(question, modality):
assert modality == "image"

# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
@@ -129,7 +155,9 @@ def run_minicpmv(question):


# InternVL
def run_internvl(question):
def run_internvl(question, modality):
assert modality == "image"

model_name = "OpenGVLab/InternVL2-2B"

llm = LLM(
@@ -155,7 +183,8 @@ def run_internvl(question):


# BLIP-2
def run_blip2(question):
def run_blip2(question, modality):
assert modality == "image"

# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
@@ -166,7 +195,8 @@ def run_blip2(question):


# Qwen
def run_qwen_vl(question):
def run_qwen_vl(question, modality):
assert modality == "image"

llm = LLM(
model="Qwen/Qwen-VL",
@@ -180,7 +210,9 @@ def run_qwen_vl(question):


# Qwen2-VL
def run_qwen2_vl(question):
def run_qwen2_vl(question, modality):
assert modality == "image"

model_name = "Qwen/Qwen2-VL-7B-Instruct"

llm = LLM(
@@ -200,6 +232,7 @@ def run_qwen2_vl(question):
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
@@ -255,7 +288,7 @@ def main(args):
data = mm_input["data"]
question = mm_input["question"]

llm, prompt, stop_token_ids = model_example_map[model](question)
llm, prompt, stop_token_ids = model_example_map[model](question, modality)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
@@ -306,6 +339,7 @@ def main(args):
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
@@ -105,9 +105,6 @@ def run_test(
for asset in video_assets
]

for video in videos:
print(video.shape)

if size_factors is not None:
inputs_per_video = [(
[prompt for _ in size_factors],
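The video path works the same way through `multi_modal_data`. A hedged sketch follows; the zero-filled frame array and sampling settings are assumptions, while the prompt template, model name, and `max_model_len` come from the `run_llava_onevision` example above.

# Sketch: video inference with LLaVA-Onevision, passing decoded frames as a
# numpy array. The dummy frames and sampling settings are assumptions.
import numpy as np

from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
          max_model_len=32768)

question = "Why is this video funny?"
prompt = (f"<|im_start|>user <video>\n{question}<|im_end|>"
          "<|im_start|>assistant\n")

# 16 dummy RGB frames; a real clip would be decoded into the same
# (num_frames, height, width, 3) uint8 layout.
video = np.zeros((16, 384, 384, 3), dtype=np.uint8)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"video": video}},
    SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)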