Process inputs directly in apply_chat_template in image-text-to-text pipeline #35616

Open · wants to merge 2 commits into base: main
2 changes: 1 addition & 1 deletion src/transformers/models/llava/processing_llava.py
@@ -161,7 +161,7 @@ def __call__(
                width // self.patch_size
            ) + self.num_additional_image_tokens
            if self.vision_feature_select_strategy == "default":
-                num_image_tokens -= 1
+                num_image_tokens -= self.num_additional_image_tokens
Member:
This should be 1 to work correctly with different ViT backbones. Was it causing any test failures?

Member Author:
Without this change, I'm getting errors on pipeline tests that used to work with llava-interleave. For example:

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
text = "<image> What this is? Assistant: This is"

outputs = pipe(image, text=text)
self.assertEqual(
    outputs,
    [
        {
            "input_text": "<image> What this is? Assistant: This is",
            "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
        }
    ],
)

returns:

ValueError: Image features and image tokens do not match: tokens: 728, features 729
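
For context, these numbers are consistent with the processor's token accounting above if one assumes llava-interleave's SigLIP backbone with 384x384 inputs, 14x14 patches, and no CLS token (an assumption; the exact sizes are not stated in this thread):

# Assumed vision settings for llava-interleave (SigLIP backbone); not stated in this thread
height = width = 384
patch_size = 14
num_additional_image_tokens = 0  # SigLIP adds no CLS token

# Same formula as in processing_llava.py above
num_image_tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
print(num_image_tokens)  # 729, matching "features 729"

# The old code subtracted a hard-coded 1 under the "default" strategy,
# leaving one placeholder token too few
print(num_image_tokens - 1)  # 728, matching "tokens: 728"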

Member:
In the case of llava-interleave-qwen-0.5b-hf, I see a mismatch in vision_feature_select_strategy between the model config and the processor config. Will fix that on the Hub :)
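
For anyone who wants to verify, a quick sketch of checking the two settings against each other (it assumes the processor exposes vision_feature_select_strategy, as LlavaProcessor does):

from transformers import AutoConfig, AutoProcessor

ckpt = "llava-hf/llava-interleave-qwen-0.5b-hf"
config = AutoConfig.from_pretrained(ckpt)
processor = AutoProcessor.from_pretrained(ckpt)

# These two should agree; a mismatch produces the off-by-one error above
print("config:   ", config.vision_feature_select_strategy)
print("processor:", processor.vision_feature_select_strategy)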


            prompt_strings = []
            for sample in text:
87 changes: 41 additions & 46 deletions src/transformers/pipelines/image_text_to_text.py
@@ -57,10 +57,9 @@ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image",
        for message in messages:
            if not ("role" in message and "content" in message):
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
-        images = retrieve_images_in_messages(messages, images)
+        messages = retrieve_images_in_messages(messages, images)

        self.messages = messages
-        self.images = images


def retrieve_images_in_messages(
@@ -72,43 +71,40 @@ def retrieve_images_in_messages(
    if images is None:
        images = []
    idx_images = 0
-    retrieved_images = []
    for message in messages:
        for content in message["content"]:
-            if isinstance(content, dict):
-                if content.get("type") == "image":
-                    for key in ["image", "url", "path", "base64"]:
-                        if key in content:
-                            retrieved_images.append(content[key])
-                            break
-                    else:
-                        if idx_images < len(images):
-                            retrieved_images.append(images[idx_images])
-                            idx_images += 1
-                        else:
-                            raise ValueError(
-                                "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
-                            )
-                # Add support for OpenAI/TGI chat format
-                elif content.get("type") == "image_url":
-                    if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
-                        retrieved_images.append(content["image_url"]["url"])
-                        # Rewrite content to be in the Transformers chat format
-                        content["type"] = "image"
-                        content["image"] = content["image_url"]["url"]
-                        del content["image_url"]
-                    else:
-                        raise ValueError(
-                            "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
-                        )
+            if not isinstance(content, dict):
+                continue
+            content_type = content.get("type")
+            if content_type == "image":
+                if not any(key in content for key in ["image", "url", "path", "base64"]):
+                    if idx_images < len(images):
+                        # Insert the image passed as argument in the chat message
+                        content["image"] = images[idx_images]
+                        idx_images += 1
+                    else:
+                        raise ValueError(
+                            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+                        )
+            # Add support for OpenAI/TGI chat format
+            elif content_type == "image_url":
+                if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+                    # Rewrite content to be in the Transformers chat format
+                    content["type"] = "image"
+                    content["image"] = content["image_url"]["url"]
+                    del content["image_url"]
+                else:
+                    raise ValueError(
+                        "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
                    )

    # The number of images passed should be consistent with the number of images in the chat without an image key
    if idx_images != len(images):
        raise ValueError(
            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
        )

-    return retrieved_images
+    return messages
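
To illustrate the new in-place normalization, a minimal sketch of what the function now does (the URLs are placeholders, not from this PR):

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Compare these images."},
            # OpenAI/TGI-style entry, rewritten in place to the Transformers format
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            # Bare image entry, filled from the `images` argument
            {"type": "image"},
        ],
    }
]

messages = retrieve_images_in_messages(messages, images=["https://example.com/dog.png"])

# The content entries are now:
#   {"type": "image", "image": "https://example.com/cat.png"}
#   {"type": "image", "image": "https://example.com/dog.png"}
# Before this PR, the function returned a flat list of images instead of the messages.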


@add_end_docstrings(build_pipeline_init_args(has_processor=True))
@@ -316,31 +312,30 @@ def __call__(
        return super().__call__({"images": images, "text": text}, **kwargs)

    def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None):
+        if isinstance(inputs, Chat):
+            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+            # because very few models support multiple separate, consecutive assistant messages
+            if continue_final_message is None:
+                continue_final_message = inputs.messages[-1]["role"] == "assistant"
+            model_inputs = self.processor.apply_chat_template(
+                inputs.messages,
+                add_generation_prompt=not continue_final_message,
+                continue_final_message=continue_final_message,
+                return_tensors=self.framework,
+                tokenize=True,
+                return_dict=True,
+            )
+            model_inputs["text"] = inputs
+            return model_inputs
        # In case we only have text inputs
        if isinstance(inputs, (list, tuple, str)):
            images = None
            text = inputs
            inputs_text = inputs
        else:
-            if isinstance(inputs, Chat):
-                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-                # because very few models support multiple separate, consecutive assistant messages
-                if continue_final_message is None:
-                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
-                text = self.processor.apply_chat_template(
-                    inputs.messages,
-                    add_generation_prompt=not continue_final_message,
-                    continue_final_message=continue_final_message,
-                    return_tensors=self.framework,
-                )
-                inputs_text = inputs
-                images = inputs.images
-            else:
-                text = inputs["text"]
-                inputs_text = inputs["text"]
-                images = inputs["images"]
-
-            images = load_images(images)
+            images = load_images(inputs["images"])
+            text = inputs["text"]
+            inputs_text = inputs["text"]

        # if batched text inputs, we set padding to True unless specified otherwise
        if isinstance(text, (list, tuple)) and len(text) > 1:
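The practical effect of the new Chat branch is that a single processor call now yields the complete model inputs. A minimal sketch, assuming a transformers version where processor.apply_chat_template supports tokenize and return_dict (the checkpoint is borrowed from the discussion above):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

# tokenize=True and return_dict=True make apply_chat_template load the image and
# return input_ids, attention_mask and pixel_values in one call, so preprocess()
# no longer needs a separate load_images() step for chat inputs
model_inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(model_inputs.keys())
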
22 changes: 17 additions & 5 deletions tests/pipelines/test_pipelines_image_text_to_text.py
@@ -134,8 +134,14 @@ def test_model_pt_chat_template(self):
"role": "user",
"content": [
{"type": "text", "text": "What’s the difference between these two images?"},
{"type": "image"},
{"type": "image"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
}
],
@@ -144,13 +150,19 @@
"role": "user",
"content": [
{"type": "text", "text": "What’s the difference between these two images?"},
{"type": "image"},
{"type": "image"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
},
{
"role": "assistant",
"content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
"content": "The first image shows a statue of Liberty in the foreground, while the second image shows a city skyline",
},
],
}
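For reference, the updated messages can be fed to the pipeline directly in chat form; a minimal sketch (the checkpoint and max_new_tokens are illustrative, not taken from this hunk):

from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What’s the difference between these two images?"},
            {"type": "image", "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
            {"type": "image", "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"},
        ],
    }
]
outputs = pipe(text=messages, max_new_tokens=20)
print(outputs[0]["generated_text"])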