From 7b12c7f40d907d5ba5942e6b6fdeb8b6db187d05 Mon Sep 17 00:00:00 2001 From: litianjian Date: Thu, 12 Sep 2024 13:15:49 +0000 Subject: [PATCH 01/15] add llava-onevision --- docs/source/models/supported_models.rst | 5 + examples/offline_inference_vision_language.py | 10 + .../vision_language/test_llava_onevision.py | 229 +++++ vllm/model_executor/models/__init__.py | 6 +- vllm/model_executor/models/llava_onevision.py | 937 ++++++++++++++++++ 5 files changed, 1185 insertions(+), 2 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_llava_onevision.py create mode 100644 vllm/model_executor/models/llava_onevision.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 6c7f7f7d5d992..82818b0e6148a 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -232,6 +232,11 @@ Multimodal Language Models - Video - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note) - + * - :code:`LlavaOnevisionForConditionalGeneration` + - LLaVA-OneVision + - Image\ :sup:`+` / Video\ :sup:`+` + - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + - * - :code:`MiniCPMV` - MiniCPM-V - Image\ :sup:`+` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 464eaf334e3de..112b4aa155715 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -41,6 +41,15 @@ def run_llava_next_video(question): return llm, prompt, stop_token_ids +# LLaVA-OneVision +def run_llava_onevision(question): + prompt = f"<|im_start|>user