Process inputs directly in apply_chat_template in image-text-to-text pipeline #35616

Open · wants to merge 2 commits into base: main
2 changes: 1 addition & 1 deletion src/transformers/models/llava/processing_llava.py
@@ -161,7 +161,7 @@ def __call__(
                width // self.patch_size
            ) + self.num_additional_image_tokens
            if self.vision_feature_select_strategy == "default":
-                num_image_tokens -= 1
+                num_image_tokens -= self.num_additional_image_tokens
Member:
This should be 1 to work correctly with different ViT backbones. Was it causing any test failures?

Member Author:
Without this change, I'm getting errors on pipeline tests that used to work with llava-interleave. For example:

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
text = "<image> What this is? Assistant: This is"

outputs = pipe(image, text=text)
self.assertEqual(
    outputs,
    [
        {
            "input_text": "<image> What this is? Assistant: This is",
            "generated_text": "<image> What this is? Assistant: This is a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
        }
    ],
)

returns:

ValueError: Image features and image tokens do not match: tokens: 728, features 729
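
For context, these numbers are consistent with the processor's token accounting above if one assumes llava-interleave's SigLIP backbone with 384x384 inputs, 14x14 patches, and no CLS token (an assumption; the exact sizes are not stated in this thread):

# Assumed vision settings for llava-interleave (SigLIP backbone); not stated in this thread
height = width = 384
patch_size = 14
num_additional_image_tokens = 0  # SigLIP adds no CLS token

# Same formula as in processing_llava.py above
num_image_tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
print(num_image_tokens)  # 729, matching "features 729"

# The old code subtracted a hard-coded 1 under the "default" strategy,
# leaving one placeholder token too few
print(num_image_tokens - 1)  # 728, matching "tokens: 728"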

Member:
In the case of llava-interleave-qwen-0.5b-hf, I see a mismatch in vision_feature_select_strategy between the model config and the processor config. Will fix that on the Hub :)
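
For anyone who wants to verify, a quick sketch of checking the two settings against each other (it assumes the processor exposes vision_feature_select_strategy, as LlavaProcessor does):

from transformers import AutoConfig, AutoProcessor

ckpt = "llava-hf/llava-interleave-qwen-0.5b-hf"
config = AutoConfig.from_pretrained(ckpt)
processor = AutoProcessor.from_pretrained(ckpt)

# These two should agree; a mismatch produces the off-by-one error above
print("config:   ", config.vision_feature_select_strategy)
print("processor:", processor.vision_feature_select_strategy)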


            prompt_strings = []
            for sample in text:
87 changes: 41 additions & 46 deletions src/transformers/pipelines/image_text_to_text.py
@@ -57,10 +57,9 @@ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image",
        for message in messages:
            if not ("role" in message and "content" in message):
                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
-        images = retrieve_images_in_messages(messages, images)
+        messages = retrieve_images_in_messages(messages, images)

        self.messages = messages
-        self.images = images


def retrieve_images_in_messages(
@@ -72,43 +71,40 @@ def retrieve_images_in_messages(
    if images is None:
        images = []
    idx_images = 0
-    retrieved_images = []
    for message in messages:
        for content in message["content"]:
-            if isinstance(content, dict):
-                if content.get("type") == "image":
-                    for key in ["image", "url", "path", "base64"]:
-                        if key in content:
-                            retrieved_images.append(content[key])
-                            break
-                    else:
-                        if idx_images < len(images):
-                            retrieved_images.append(images[idx_images])
-                            idx_images += 1
-                        else:
-                            raise ValueError(
-                                "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
-                            )
-                # Add support for OpenAI/TGI chat format
-                elif content.get("type") == "image_url":
-                    if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
-                        retrieved_images.append(content["image_url"]["url"])
-                        # Rewrite content to be in the Transformers chat format
-                        content["type"] = "image"
-                        content["image"] = content["image_url"]["url"]
-                        del content["image_url"]
-                    else:
-                        raise ValueError(
-                            "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
-                        )
+            if not isinstance(content, dict):
+                continue
+            content_type = content.get("type")
+            if content_type == "image":
+                if not any(key in content for key in ["image", "url", "path", "base64"]):
+                    if idx_images < len(images):
+                        # Insert the image passed as argument in the chat message
+                        content["image"] = images[idx_images]
+                        idx_images += 1
+                    else:
+                        raise ValueError(
+                            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+                        )
+            # Add support for OpenAI/TGI chat format
+            elif content_type == "image_url":
+                if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+                    # Rewrite content to be in the Transformers chat format
+                    content["type"] = "image"
+                    content["image"] = content["image_url"]["url"]
+                    del content["image_url"]
+                else:
+                    raise ValueError(
+                        "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
                    )

    # The number of images passed should be consistent with the number of images in the chat without an image key
    if idx_images != len(images):
        raise ValueError(
            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
        )

-    return retrieved_images
+    return messages
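
To illustrate the new in-place normalization, a minimal sketch of what the function now does (the URLs are placeholders, not from this PR):

messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Compare these images."},
            # OpenAI/TGI-style entry, rewritten in place to the Transformers format
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            # Bare image entry, filled from the `images` argument
            {"type": "image"},
        ],
    }
]

messages = retrieve_images_in_messages(messages, images=["https://example.com/dog.png"])

# The content entries are now:
#   {"type": "image", "image": "https://example.com/cat.png"}
#   {"type": "image", "image": "https://example.com/dog.png"}
# Before this PR, the function returned a flat list of images instead of the messages.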


@add_end_docstrings(build_pipeline_init_args(has_processor=True))
@@ -316,31 +312,30 @@ def __call__(
        return super().__call__({"images": images, "text": text}, **kwargs)

    def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None):
+        if isinstance(inputs, Chat):
+            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+            # because very few models support multiple separate, consecutive assistant messages
+            if continue_final_message is None:
+                continue_final_message = inputs.messages[-1]["role"] == "assistant"
+            model_inputs = self.processor.apply_chat_template(
+                inputs.messages,
+                add_generation_prompt=not continue_final_message,
+                continue_final_message=continue_final_message,
+                return_tensors=self.framework,
+                tokenize=True,
+                return_dict=True,
+            )
+            model_inputs["text"] = inputs
+            return model_inputs
        # In case we only have text inputs
        if isinstance(inputs, (list, tuple, str)):
            images = None
            text = inputs
            inputs_text = inputs
        else:
-            if isinstance(inputs, Chat):
-                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-                # because very few models support multiple separate, consecutive assistant messages
-                if continue_final_message is None:
-                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
-                text = self.processor.apply_chat_template(
-                    inputs.messages,
-                    add_generation_prompt=not continue_final_message,
-                    continue_final_message=continue_final_message,
-                    return_tensors=self.framework,
-                )
-                inputs_text = inputs
-                images = inputs.images
-            else:
-                text = inputs["text"]
-                inputs_text = inputs["text"]
-                images = inputs["images"]
-
-            images = load_images(images)
+            images = load_images(inputs["images"])
+            text = inputs["text"]
+            inputs_text = inputs["text"]

        # if batched text inputs, we set padding to True unless specified otherwise
        if isinstance(text, (list, tuple)) and len(text) > 1:
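The practical effect of the new Chat branch is that a single processor call now yields the complete model inputs. A minimal sketch, assuming a transformers version where processor.apply_chat_template supports tokenize and return_dict (the checkpoint is borrowed from the discussion above):

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]

# tokenize=True and return_dict=True make apply_chat_template load the image and
# return input_ids, attention_mask and pixel_values in one call, so preprocess()
# no longer needs a separate load_images() step for chat inputs
model_inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
print(model_inputs.keys())
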
22 changes: 17 additions & 5 deletions tests/pipelines/test_pipelines_image_text_to_text.py
@@ -134,8 +134,14 @@ def test_model_pt_chat_template(self):
"role": "user",
"content": [
{"type": "text", "text": "What’s the difference between these two images?"},
{"type": "image"},
{"type": "image"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
}
],
@@ -144,13 +150,19 @@
"role": "user",
"content": [
{"type": "text", "text": "What’s the difference between these two images?"},
{"type": "image"},
{"type": "image"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
},
{
"role": "assistant",
"content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
"content": "The first image shows a statue of Liberty in the foreground, while the second image shows a city skyline",
},
],
}
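For reference, the updated messages can be fed to the pipeline directly in chat form; a minimal sketch (the checkpoint and max_new_tokens are illustrative, not taken from this hunk):

from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What’s the difference between these two images?"},
            {"type": "image", "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
            {"type": "image", "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"},
        ],
    }
]
outputs = pipe(text=messages, max_new_tokens=20)
print(outputs[0]["generated_text"])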