From 81e26c5be735d77451bb54e2c1dccdb444533c85 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 30 Aug 2024 17:39:14 +0800 Subject: [PATCH 01/14] add self.head_dim for VisionAttention in Qwen2-VL --- src/transformers/models/qwen2_vl/modeling_qwen2_vl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 5e7919a95a7d..69bfa62fd20f 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -275,6 +275,7 @@ class VisionAttention(nn.Module): def __init__(self, dim: int, num_heads: int = 16) -> None: super().__init__() self.num_heads = num_heads + self.head_dim = dim // num_heads self.qkv = nn.Linear(dim, dim * 3, bias=True) self.proj = nn.Linear(dim, dim) From 51e601b5bef43417824cdc473a65f445e189de50 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 30 Aug 2024 17:39:14 +0800 Subject: [PATCH 02/14] add self.head_dim for VisionAttention in Qwen2-VL --- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 536e0ab54abc..a352ae57d99f 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -493,3 +493,31 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) + + def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]: + """ + Extracts vision information (image or video data) from a list of conversations. + + Args: + conversations: A list of conversations, where each conversation is either a dictionary + or a list of dictionaries representing messages. 
+ + Returns: + A list of dictionaries, each containing information about an image or video found + within the conversations. + """ + vision_infos = [] + if isinstance(conversations[0], dict): + conversations = [conversations] + for conversation in conversations: + for message in conversation: + if isinstance(message["content"], list): + for ele in message["content"]: + if ( + "image" in ele + or "image_url" in ele + or "video" in ele + or ele["type"] in ("image", "image_url", "video") + ): + vision_infos.append(ele) + return vision_infos From 7b57e907e9e0fd760c7f429e90d756c523fe5a93 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Mon, 2 Sep 2024 20:14:16 +0800 Subject: [PATCH 03/14] fix ci --- tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index a352ae57d99f..2e2a75db9985 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -493,7 +493,7 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT, ) - + def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]: """ Extracts vision information (image or video data) from a list of conversations. 
From 774e6d5452242b2c688e220383e4cd0f4f6faca5 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Mon, 2 Sep 2024 20:20:27 +0800 Subject: [PATCH 04/14] black the test_modeling_qwen2_vl.py --- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 53 ++++++++++++++----- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 2e2a75db9985..8acc2c2fc83c 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -159,18 +159,31 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs - vision_seqlen = pixel_values.shape[0] // self.batch_size // (self.vision_config["spatial_merge_size"] ** 2) - input_ids = ids_tensor([self.batch_size, self.seq_length - 1 + vision_seqlen], self.vocab_size) - attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) - input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = self.image_token_id + vision_seqlen = ( + pixel_values.shape[0] + // self.batch_size + // (self.vision_config["spatial_merge_size"] ** 2) + ) + input_ids = ids_tensor( + [self.batch_size, self.seq_length - 1 + vision_seqlen], self.vocab_size + ) + attention_mask = torch.ones( + input_ids.shape, dtype=torch.long, device=torch_device + ) + input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = ( + self.image_token_id + ) labels = torch.zeros( - (self.batch_size, self.seq_length - 1 + vision_seqlen), dtype=torch.long, device=torch_device + (self.batch_size, self.seq_length - 1 + vision_seqlen), + dtype=torch.long, + device=torch_device, ) patch_size = self.vision_config["patch_size"] inputs_dict = { "pixel_values": pixel_values, "image_grid_thw": torch.tensor( - [[1, self.image_size // patch_size, self.image_size // patch_size]] 
* self.batch_size + [[1, self.image_size // patch_size, self.image_size // patch_size]] + * self.batch_size ), "input_ids": input_ids, "attention_mask": attention_mask, @@ -218,13 +231,17 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas Model tester for `Qwen2VLForConditionalGeneration`. """ - all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else () + all_model_classes = ( + (Qwen2VLForConditionalGeneration,) if is_torch_available() else () + ) test_pruning = False test_head_masking = False def setUp(self): self.model_tester = Qwen2VLVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Qwen2VLConfig, has_text_modality=False + ) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -270,15 +287,21 @@ def test_inputs_embeds_matches_input_ids_with_generate(self): def test_cpu_offload(self): pass - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") + @unittest.skip( + reason="Some undefined behavior encountered with test versions of this model. Skip for now." + ) def test_disk_offload_bin(self): pass - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") + @unittest.skip( + reason="Some undefined behavior encountered with test versions of this model. Skip for now." + ) def test_disk_offload_safetensors(self): pass - @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") + @unittest.skip( + reason="Some undefined behavior encountered with test versions of this model. Skip for now." 
+ ) def test_model_parallelism(self): pass @@ -325,7 +348,9 @@ def test_small_model_integration_test(self): "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto" ) - text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) + text = self.processor.apply_chat_template( + self.messages, tokenize=False, add_generation_prompt=True + ) inputs = self.processor(text=[text], images=[self.image], return_tensors="pt") expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655] # fmt: skip @@ -494,7 +519,9 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): EXPECTED_DECODED_TEXT, ) - def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]: + def extract_vision_info( + self, conversations: list[dict] | list[list[dict]] + ) -> list[dict]: """ Extracts vision information (image or video data) from a list of conversations. From 93837d2e1913e2775b6e11c6ccfc7e1b9191f333 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Mon, 2 Sep 2024 20:32:42 +0800 Subject: [PATCH 05/14] use ruff to format test_modeling_qwen2_vl.py --- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 49 +++++-------------- 1 file changed, 12 insertions(+), 37 deletions(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 8acc2c2fc83c..7436cbe43251 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -159,20 +159,10 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs - vision_seqlen = ( - pixel_values.shape[0] - // self.batch_size - // (self.vision_config["spatial_merge_size"] ** 2) - ) - input_ids = ids_tensor( - [self.batch_size, self.seq_length - 1 + vision_seqlen], 
self.vocab_size - ) - attention_mask = torch.ones( - input_ids.shape, dtype=torch.long, device=torch_device - ) - input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = ( - self.image_token_id - ) + vision_seqlen = pixel_values.shape[0] // self.batch_size // (self.vision_config["spatial_merge_size"] ** 2) + input_ids = ids_tensor([self.batch_size, self.seq_length - 1 + vision_seqlen], self.vocab_size) + attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + input_ids[:, torch.arange(vision_seqlen, device=torch_device) + 1] = self.image_token_id labels = torch.zeros( (self.batch_size, self.seq_length - 1 + vision_seqlen), dtype=torch.long, @@ -182,8 +172,7 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = { "pixel_values": pixel_values, "image_grid_thw": torch.tensor( - [[1, self.image_size // patch_size, self.image_size // patch_size]] - * self.batch_size + [[1, self.image_size // patch_size, self.image_size // patch_size]] * self.batch_size ), "input_ids": input_ids, "attention_mask": attention_mask, @@ -231,17 +220,13 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas Model tester for `Qwen2VLForConditionalGeneration`. 
""" - all_model_classes = ( - (Qwen2VLForConditionalGeneration,) if is_torch_available() else () - ) + all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else () test_pruning = False test_head_masking = False def setUp(self): self.model_tester = Qwen2VLVisionText2TextModelTester(self) - self.config_tester = ConfigTester( - self, config_class=Qwen2VLConfig, has_text_modality=False - ) + self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -287,21 +272,15 @@ def test_inputs_embeds_matches_input_ids_with_generate(self): def test_cpu_offload(self): pass - @unittest.skip( - reason="Some undefined behavior encountered with test versions of this model. Skip for now." - ) + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") def test_disk_offload_bin(self): pass - @unittest.skip( - reason="Some undefined behavior encountered with test versions of this model. Skip for now." - ) + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. Skip for now.") def test_disk_offload_safetensors(self): pass - @unittest.skip( - reason="Some undefined behavior encountered with test versions of this model. Skip for now." - ) + @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. 
Skip for now.") def test_model_parallelism(self): pass @@ -348,9 +327,7 @@ def test_small_model_integration_test(self): "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto" ) - text = self.processor.apply_chat_template( - self.messages, tokenize=False, add_generation_prompt=True - ) + text = self.processor.apply_chat_template(self.messages, tokenize=False, add_generation_prompt=True) inputs = self.processor(text=[text], images=[self.image], return_tensors="pt") expected_input_ids = [151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 151652, 151655, 151655] # fmt: skip @@ -519,9 +496,7 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): EXPECTED_DECODED_TEXT, ) - def extract_vision_info( - self, conversations: list[dict] | list[list[dict]] - ) -> list[dict]: + def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]: """ Extracts vision information (image or video data) from a list of conversations. 
From 35e321b52728c0af5261bf61c3bceb78ba59d98c Mon Sep 17 00:00:00 2001 From: root Date: Wed, 4 Sep 2024 21:14:09 +0800 Subject: [PATCH 06/14] [run-slow] qwen2_vl From 14b4b2b9eecd3770a44223f5892eef1304ef78ec Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Wed, 4 Sep 2024 22:30:43 +0800 Subject: [PATCH 07/14] use typing for python3.8 --- tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 7436cbe43251..a0b0a499daa4 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -18,6 +18,7 @@ import unittest import requests +from typing import List, Dict, Union from transformers import ( AutoProcessor, @@ -496,7 +497,7 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): EXPECTED_DECODED_TEXT, ) - def extract_vision_info(self, conversations: list[dict] | list[list[dict]]) -> list[dict]: + def extract_vision_info(self, conversations: Union[List[Dict], List[List[Dict]]]) -> List[Dict]: """ Extracts vision information (image or video data) from a list of conversations. 
From 707455435444432e35fc746de4802946e5aac679 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Wed, 4 Sep 2024 22:41:21 +0800 Subject: [PATCH 08/14] fix the import format --- tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index a0b0a499daa4..707c23a12d35 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -16,9 +16,9 @@ import gc import unittest +from typing import List, Dict, Union import requests -from typing import List, Dict, Union from transformers import ( AutoProcessor, From 0000ee06b439c1a0d0f0ad7ae9d0c58b739c184a Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Wed, 4 Sep 2024 22:46:09 +0800 Subject: [PATCH 09/14] use ruff to fix the ci error I001 --- tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 707c23a12d35..7e8214b332d7 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -16,7 +16,7 @@ import gc import unittest -from typing import List, Dict, Union +from typing import Dict, List, Union import requests From 811d745107e9a5310af40caab2ed2191deec6f2d Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Thu, 5 Sep 2024 23:28:24 +0800 Subject: [PATCH 10/14] [run-slow] qwen2_vl From 81cfb00b85f32a8405775770d1bd4284279b4292 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 6 Sep 2024 00:40:35 +0800 Subject: [PATCH 11/14] remove unused import --- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 31 +------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py 
b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 7e8214b332d7..c1067e1d6030 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -16,7 +16,6 @@ import gc import unittest -from typing import Dict, List, Union import requests @@ -495,32 +494,4 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): self.assertEqual( self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT, - ) - - def extract_vision_info(self, conversations: Union[List[Dict], List[List[Dict]]]) -> List[Dict]: - """ - Extracts vision information (image or video data) from a list of conversations. - - Args: - conversations: A list of conversations, where each conversation is either a dictionary - or a list of dictionaries representing messages. - - Returns: - A list of dictionaries, each containing information about an image or video found - within the conversations. - """ - vision_infos = [] - if isinstance(conversations[0], dict): - conversations = [conversations] - for conversation in conversations: - for message in conversation: - if isinstance(message["content"], list): - for ele in message["content"]: - if ( - "image" in ele - or "image_url" in ele - or "video" in ele - or ele["type"] in ("image", "image_url", "video") - ): - vision_infos.append(ele) - return vision_infos + ) \ No newline at end of file From 48fc376ffd8ab6c5a1bd249bd9b0cb36fbc58bf3 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 6 Sep 2024 16:31:48 +0800 Subject: [PATCH 12/14] commit for rebase From 61f5be4153b0a84e5098d017262a0ce1071f12ac Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 6 Sep 2024 16:41:04 +0800 Subject: [PATCH 13/14] use ruff fix ci --- tests/models/qwen2_vl/test_modeling_qwen2_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 
c1067e1d6030..4bd85f06a4f9 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -494,4 +494,4 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self): self.assertEqual( self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT, - ) \ No newline at end of file + ) From e366f403b81400aa90d29d226369a5b67fecfcb3 Mon Sep 17 00:00:00 2001 From: GeLee <865038696@qq.com> Date: Fri, 6 Sep 2024 16:50:50 +0800 Subject: [PATCH 14/14] [run-slow] qwen2_vl