diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md
index d2a47e7af8f1..b57c69ca6b32 100644
--- a/docs/source/en/model_doc/blip-2.md
+++ b/docs/source/en/model_doc/blip-2.md
@@ -87,4 +87,17 @@ If you're interested in submitting a resource to be included here, please feel f
 [[autodoc]] Blip2ForConditionalGeneration
     - forward
-    - generate
\ No newline at end of file
+    - generate
+
+## Blip2ForImageTextRetrieval
+
+[[autodoc]] Blip2ForImageTextRetrieval
+    - forward
+
+## Blip2TextModelWithProjection
+
+[[autodoc]] Blip2TextModelWithProjection
+
+## Blip2VisionModelWithProjection
+
+[[autodoc]] Blip2VisionModelWithProjection
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 3f28753f3a7e..9026c1eaac1e 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1571,10 +1571,13 @@
     _import_structure["models.blip_2"].extend(
         [
             "Blip2ForConditionalGeneration",
+            "Blip2ForImageTextRetrieval",
             "Blip2Model",
             "Blip2PreTrainedModel",
             "Blip2QFormerModel",
+            "Blip2TextModelWithProjection",
             "Blip2VisionModel",
+            "Blip2VisionModelWithProjection",
         ]
     )
     _import_structure["models.bloom"].extend(
@@ -6290,10 +6293,13 @@
         )
         from .models.blip_2 import (
             Blip2ForConditionalGeneration,
+            Blip2ForImageTextRetrieval,
             Blip2Model,
             Blip2PreTrainedModel,
             Blip2QFormerModel,
+            Blip2TextModelWithProjection,
             Blip2VisionModel,
+            Blip2VisionModelWithProjection,
         )
         from .models.bloom import (
             BloomForCausalLM,
diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py
index f9856ef701f9..0d344cc54b13 100755
--- a/src/transformers/models/altclip/modeling_altclip.py
+++ b/src/transformers/models/altclip/modeling_altclip.py
@@ -161,19 +161,19 @@ class AltCLIPOutput(ModelOutput):
     Args:
         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
             Contrastive loss for image-text similarity.
-        logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
             The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
             similarity scores.
-        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
             The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
             similarity scores.
-        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
             The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
-        image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
             The image embeddings obtained by applying the projection layer to the pooled output of
             [`AltCLIPVisionModel`].
-        text_model_output(`BaseModelOutputWithPooling`):
+        text_model_output (`BaseModelOutputWithPooling`):
             The output of the [`AltCLIPTextModel`].
-        vision_model_output(`BaseModelOutputWithPooling`):
+        vision_model_output (`BaseModelOutputWithPooling`):
             The output of the [`AltCLIPVisionModel`].
""" diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 2b49c295975d..6a879e4c9266 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1263,6 +1263,7 @@ ("align", "AlignModel"), ("altclip", "AltCLIPModel"), ("blip", "BlipModel"), + ("blip-2", "Blip2ForImageTextRetrieval"), ("chinese_clip", "ChineseCLIPModel"), ("clip", "CLIPModel"), ("clipseg", "CLIPSegModel"), diff --git a/src/transformers/models/blip_2/__init__.py b/src/transformers/models/blip_2/__init__.py index 6897dd35c89b..329ddfe19ac6 100644 --- a/src/transformers/models/blip_2/__init__.py +++ b/src/transformers/models/blip_2/__init__.py @@ -33,10 +33,13 @@ else: _import_structure["modeling_blip_2"] = [ "Blip2Model", + "Blip2VisionModelWithProjection", "Blip2QFormerModel", "Blip2PreTrainedModel", "Blip2ForConditionalGeneration", + "Blip2ForImageTextRetrieval", "Blip2VisionModel", + "Blip2TextModelWithProjection", ] if TYPE_CHECKING: @@ -55,10 +58,13 @@ else: from .modeling_blip_2 import ( Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, Blip2Model, Blip2PreTrainedModel, Blip2QFormerModel, + Blip2TextModelWithProjection, Blip2VisionModel, + Blip2VisionModelWithProjection, ) else: diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 86380e89b6d8..16fa4aec3849 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -15,7 +15,7 @@ """BLIP-2 model configuration""" import os -from typing import Union +from typing import Optional, Union from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -172,6 +172,8 @@ class Blip2QFormerConfig(PretrainedConfig): The frequency of adding cross-attention to the Transformer layers. encoder_hidden_size (`int`, *optional*, defaults to 1408): The hidden size of the hidden states for cross-attention. + use_qformer_text_input (`bool`, *optional*, defaults to `False`): + Whether to use BERT-style embeddings. Examples: @@ -206,6 +208,7 @@ def __init__( position_embedding_type="absolute", cross_attention_frequency=2, encoder_hidden_size=1408, + use_qformer_text_input=False, **kwargs, ): super().__init__(pad_token_id=pad_token_id, **kwargs) @@ -224,6 +227,7 @@ def __init__( self.position_embedding_type = position_embedding_type self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size + self.use_qformer_text_input = use_qformer_text_input @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": @@ -263,6 +267,8 @@ class Blip2Config(PretrainedConfig): Dictionary of configuration options used to initialize any [`PretrainedConfig`]. num_query_tokens (`int`, *optional*, defaults to 32): The number of query tokens passed through the Transformer. + image_text_hidden_size (`int`, *optional*, defaults to 256): + Dimentionality of the hidden state of the image-text fusion layer. image_token_index (`int`, *optional*): Token index of special image token. 
@@ -307,6 +313,7 @@ def __init__( qformer_config=None, text_config=None, num_query_tokens=32, + image_text_hidden_size=256, image_token_index=None, **kwargs, ): @@ -333,6 +340,7 @@ def __init__( self.is_encoder_decoder = self.text_config.is_encoder_decoder self.num_query_tokens = num_query_tokens + self.image_text_hidden_size = image_text_hidden_size self.image_token_index = image_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES @@ -344,13 +352,21 @@ def from_vision_qformer_text_configs( cls, vision_config: Blip2VisionConfig, qformer_config: Blip2QFormerConfig, - text_config: PretrainedConfig, + text_config: Optional[PretrainedConfig] = None, **kwargs, ): r""" Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model configurations. + Args: + vision_config (`dict`): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + Returns: [`Blip2Config`]: An instance of a configuration object """ @@ -358,6 +374,6 @@ def from_vision_qformer_text_configs( return cls( vision_config=vision_config.to_dict(), qformer_config=qformer_config.to_dict(), - text_config=text_config.to_dict(), + text_config=text_config.to_dict() if text_config is not None else None, **kwargs, ) diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py index c2e6eceae532..5f972353c4f4 100644 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py @@ -31,9 +31,12 @@ from transformers import ( AutoTokenizer, + BertTokenizer, Blip2Config, Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, Blip2Processor, + Blip2QFormerConfig, Blip2VisionConfig, BlipImageProcessor, OPTConfig, @@ -51,7 +54,7 @@ def load_demo_image(): # here we list all keys to be renamed (original name on the left, our name on the right) -def create_rename_keys(config): +def create_rename_keys(config, model_name): rename_keys = [] # fmt: off @@ -79,6 +82,13 @@ def create_rename_keys(config): # QFormer rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) + if "itm" in model_name: + rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) + rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) + rename_keys.append(("vision_proj.weight", "vision_projection.weight")) + rename_keys.append(("vision_proj.bias", "vision_projection.bias")) + rename_keys.append(("text_proj.weight", "text_projection.weight")) + rename_keys.append(("text_proj.bias", "text_projection.bias")) # fmt: on return rename_keys @@ -114,26 +124,47 @@ def get_blip2_config(model_name, eos_token_id): text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() elif "t5-xxl" in model_name: text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() - - 
config = Blip2Config(vision_config=vision_config, text_config=text_config) + elif "itm" in model_name: + text_config = {} + else: + raise ValueError("Model name not supported") + + if "itm" in model_name: + config = Blip2Config( + vision_config=vision_config, + qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), + ) + else: + config = Blip2Config(vision_config=vision_config, text_config=text_config) return config, image_size @torch.no_grad() -def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): +def convert_blip2_checkpoint( + model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" +): """ Copy/paste/tweak model's weights to Transformers design. """ - tokenizer = ( - AutoTokenizer.from_pretrained("facebook/opt-2.7b") - if "opt" in model_name - else AutoTokenizer.from_pretrained("google/flan-t5-xl") - ) - eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] + if "opt" in model_name: + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") + elif "itm" in model_name: + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") + tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + else: + tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") + + if "itm" in model_name: + eos_token_id = None + else: + eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) - hf_model = Blip2ForConditionalGeneration(config).eval() + if "itm" in model_name: + hf_model = Blip2ForImageTextRetrieval(config).eval() + else: + hf_model = Blip2ForConditionalGeneration(config).eval() model_name_to_original = { "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), @@ -143,16 +174,12 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), + "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), + "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), } name, type = model_name_to_original[model_name] - # note: this script is tested on 2 GPUs, as models are compared in float32, - # which requires quite some memory. 
Hence loading both on a - # separate device is the easiest to compare - hf_model_device = "cuda:0" if torch.cuda.is_available() else "cpu" - lavis_device = "cuda:1" if torch.cuda.is_available() else "cpu" - # load original model print("Loading original model...") original_model, vis_processors, _ = load_model_and_preprocess( @@ -163,7 +190,7 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ # update state dict keys state_dict = original_model.state_dict() - rename_keys = create_rename_keys(config) + rename_keys = create_rename_keys(config, model_name) for src, dest in rename_keys: rename_key(state_dict, src, dest) @@ -189,11 +216,15 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) assert len(missing_keys) == 0 - assert unexpected_keys == ["qformer.embeddings.position_ids"] + + if "itm" in model_name: + unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) + assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] + else: + assert unexpected_keys == ["qformer.embeddings.position_ids"] image = load_demo_image() original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) - input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) # create processor image_processor = BlipImageProcessor( @@ -207,50 +238,105 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ original_model.to(lavis_device) hf_model.to(hf_model_device) - with torch.no_grad(): - if "opt" in model_name: - original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits - logits = hf_model(pixel_values, input_ids).logits - else: - original_logits = original_model( - {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} - ).logits - labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) - logits = hf_model(pixel_values, input_ids, labels=labels).logits - - assert original_logits.shape == logits.shape - print("First values of original logits:", original_logits[0, :3, :3]) - print("First values of HF logits:", logits[0, :3, :3]) - # assert values - assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) - print("Looks ok!") + if "itm" in model_name: + caption = "a large fountain spewing water into the air" + input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) + attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) - print("Generating a caption...") - prompt = "Question: what object is in this image? 
Answer:" - input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) - - set_seed(42) - - original_outputs = original_model.generate( - {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True - ) - outputs = hf_model.generate( - pixel_values, - input_ids, - do_sample=True, - num_beams=5, - max_length=30, - min_length=1, - top_p=0.9, - repetition_penalty=1.0, - length_penalty=1.0, - temperature=1, - ) - output_text = processor.batch_decode(outputs, skip_special_tokens=True) - output_text = [text.strip() for text in output_text] - print("Original generation:", original_outputs) - print("HF generation:", output_text) + with torch.no_grad(): + original_logits = original_model( + {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" + ) + logits = hf_model( + pixel_values=original_pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + use_image_text_matching_head=True, + ) + + assert original_logits.shape == logits.logits_per_image.shape + print("First values of original logits:", original_logits[0, :3]) + print("First values of HF logits:", logits.logits_per_image[0, :3]) + + # assert values + # cast to same type + target_dtype = logits.logits_per_image.dtype + assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) + + original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) + itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) + assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) + print("Looks ok!") + + with torch.no_grad(): + original_logits = original_model( + {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" + ) + logits = hf_model( + pixel_values=original_pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + use_image_text_matching_head=False, + ) + + assert original_logits.shape == logits.logits_per_image.shape + print("First values of original logits:", original_logits[0, :3]) + print("First values of HF logits:", logits.logits_per_image[0, :3]) + + # assert values + # cast to same type + target_dtype = logits.logits_per_image.dtype + assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) + print("Looks ok!") + + else: + input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) + + with torch.no_grad(): + if "opt" in model_name: + original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits + logits = hf_model(pixel_values, input_ids).logits + else: + original_logits = original_model( + {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} + ).logits + labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) + logits = hf_model(pixel_values, input_ids, labels=labels).logits + + assert original_logits.shape == logits.shape + print("First values of original logits:", original_logits[0, :3, :3]) + print("First values of HF logits:", logits[0, :3, :3]) + + # assert values + assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) + print("Looks ok!") + + print("Generating a caption...") + prompt = "Question: what object is in this image? 
Answer:" + input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) + + set_seed(42) + + original_outputs = original_model.generate( + {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 + ) + outputs = hf_model.generate( + pixel_values, + input_ids, + do_sample=True, + num_beams=5, + max_length=30, + min_length=1, + top_p=0.9, + repetition_penalty=1.0, + length_penalty=1.0, + temperature=1, + ) + output_text = processor.batch_decode(outputs, skip_special_tokens=True) + output_text = [text.strip() for text in output_text] + print("Original generation:", original_outputs) + print("HF generation:", output_text) if pytorch_dump_folder_path is not None: processor.save_pretrained(pytorch_dump_folder_path) @@ -271,6 +357,8 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ "blip2-flan-t5-xl", "blip2-flan-t5-xl-coco", "blip2-flan-t5-xxl", + "blip2-itm-vit-g", + "blip2-itm-vit-g-coco", ] parser.add_argument( "--model_name", @@ -285,7 +373,18 @@ def convert_blip2_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_ action="store_true", help="Whether to push the model and processor to the hub after converting", ) + # note: this script is tested on 2 GPUs, as models are compared in float32, + # which requires quite some memory. Hence loading both on a + # separate device is the easiest to compare + parser.add_argument( + "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." + ) + parser.add_argument( + "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." + ) args = parser.parse_args() - convert_blip2_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) + convert_blip2_checkpoint( + args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device + ) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index e89576c67ecc..fba4c98696a0 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -81,6 +81,103 @@ def to_tuple(self) -> Tuple[Any]: ) +@dataclass +class Blip2ImageTextMatchingModelOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`Blip2QFormerModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`Blip2VisionModel`]. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: torch.FloatTensor = None + logits_per_text: torch.FloatTensor = None + text_embeds: torch.FloatTensor = None + image_embeds: torch.FloatTensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Blip2 +class Blip2TextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Blip2 +class Blip2VisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + # Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2 class Blip2VisionEmbeddings(nn.Module): def __init__(self, config: Blip2VisionConfig): @@ -304,7 +401,13 @@ class Blip2PreTrainedModel(PreTrainedModel): config_class = Blip2Config base_model_prefix = "blip" supports_gradient_checkpointing = True - _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] + _no_split_modules = [ + "Blip2Attention", + "Blip2QFormerMultiHeadAttention", + "Blip2TextEmbeddings", + "T5Block", + "OPTDecoderLayer", + ] _skip_keys_device_placement = "past_key_values" _keep_in_fp32_modules = ["wo"] @@ -398,6 +501,30 @@ def _init_weights(self, module): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ +BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + BLIP_2_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): @@ -444,6 +571,43 @@ def _init_weights(self, module): Whether to interpolate the pre-trained position encodings. """ +BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for + details. + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. 
Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + use_image_text_matching_head (`bool`, *optional*): + Whether to return the Image-Text Matching or Contrastive scores. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + # Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Blip2 class Blip2Encoder(nn.Module): @@ -842,6 +1006,10 @@ def __init__(self, config, layer_idx): else: self.has_cross_attention = False + if config.use_qformer_text_input: + self.intermediate = Blip2QFormerIntermediate(config) + self.output = Blip2QFormerOutput(config) + self.intermediate_query = Blip2QFormerIntermediate(config) self.output_query = Blip2QFormerOutput(config) @@ -1022,6 +1190,49 @@ def forward( ) +class Blip2TextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward( + self, + input_ids: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + query_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if input_ids is not None: + input_ids = input_ids.to(self.word_embeddings.weight.device) + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + return embeddings + + class Blip2QFormerModel(Blip2PreTrainedModel): """ Querying Transformer (Q-Former), used in BLIP-2. 
@@ -1100,6 +1311,7 @@ def get_extended_attention_mask( def forward( self, query_embeds: torch.FloatTensor, + query_length: Optional[int] = None, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, @@ -1140,7 +1352,9 @@ def forward( past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 ) - query_length = query_embeds.shape[1] if query_embeds is not None else 0 + query_length = ( + query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0 + ) embedding_output = self.layernorm(query_embeds) embedding_output = self.dropout(embedding_output) @@ -1567,6 +1781,206 @@ def forward( ) +@add_start_docstrings( + """ + BLIP-2 Text Model with a projection layer on top (a linear layer on top of the pooled output). + """, + BLIP_2_START_DOCSTRING, +) +class Blip2TextModelWithProjection(Blip2PreTrainedModel): + supports_gradient_checkpointing = False + _keep_in_fp32_modules = [] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # text projection layer + self.text_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2TextModelOutput, config_class=Blip2Config) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2TextModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, Blip2TextModelWithProjection + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> model = Blip2TextModelWithProjection.from_pretrained( + ... "Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16 + ... 
) + + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt").to(device) + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + >>> print(text_embeds.shape) + torch.Size([2, 7, 256]) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + query_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + ) + + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=0, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + text_embeds = self.text_projection(pooled_output) + text_embeds = nn.functional.normalize(text_embeds, dim=-1) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2TextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@add_start_docstrings( + """ + BLIP-2 Vision Model with a projection layer on top (a linear layer on top of the pooled output). + """, + BLIP_2_START_DOCSTRING, +) +class Blip2VisionModelWithProjection(Blip2PreTrainedModel): + main_input_name = "pixel_values" + _keep_in_fp32_modules = [] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # vision projection layer + self.vision_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2VisionModelOutput, config_class=Blip2Config) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2VisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Blip2VisionModelWithProjection + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + >>> model = Blip2VisionModelWithProjection.from_pretrained( + ... "Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16 + ... 
) + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16) + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + >>> print(image_embeds.shape) + torch.Size([1, 32, 256]) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[0] if not return_dict else vision_outputs.last_hidden_state + + image_attention_mask = torch.ones(pooled_output.size()[:-1], dtype=torch.long, device=pooled_output.device) + + query_tokens = self.query_tokens.expand(pooled_output.shape[0], -1, -1) + + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=pooled_output, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + + embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state + image_embeds = self.vision_projection(embeds) + image_embeds = nn.functional.normalize(image_embeds, dim=-1) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2VisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @add_start_docstrings( """ BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision @@ -1937,3 +2351,180 @@ def generate( else: outputs = torch.cat([bos_tokens, outputs], dim=-1) return outputs + + +@add_start_docstrings( + """ + BLIP-2 Model with a vision and text projector, and a classification head on top. The model is used in the context + of image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. 
+ """, + BLIP_2_START_DOCSTRING, +) +class Blip2ForImageTextRetrieval(Blip2PreTrainedModel): + main_input_name = "pixel_values" + _keep_in_fp32_modules = [] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # vision projection layer + self.vision_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.qformer_config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2ImageTextMatchingModelOutput, config_class=Blip2Config) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.LongTensor] = None, + use_image_text_matching_head: Optional[bool] = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ImageTextMatchingModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Blip2ForImageTextRetrieval + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> model = Blip2ForImageTextRetrieval.from_pretrained("Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16) + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "two cats laying on a pink blanket" + + >>> inputs = processor(images=image, text=text, return_tensors="pt").to(device, torch.float16) + >>> itm_out = model(**inputs, use_image_text_matching_head=True) + >>> logits_per_image = torch.nn.functional.softmax(itm_out.logits_per_image, dim=1) + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + + >>> print(f"{probs[0][0]:.1%} that image 0 is not '{text}'") + 26.9% that image 0 is not 'two cats laying on a pink blanket' + + >>> print(f"{probs[0][1]:.1%} that image 0 is '{text}'") + 73.0% that image 0 is 'two cats laying on a pink blanket' + + >>> texts = ["a photo of a cat", "a photo of a dog"] + + >>> inputs = processor(images=image, text=texts, return_tensors="pt").to(device, torch.float16) + >>> itc_out = model(**inputs, use_image_text_matching_head=False) + >>> logits_per_image = itc_out.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + + >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") + 55.3% that image 0 is 'a photo of a cat' + + >>> print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'") + 44.7% that image 0 is 'a photo of a dog' + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + 
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if use_image_text_matching_head: + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(query_tokens.device) + attention_mask = torch.cat([query_attention_mask, attention_mask], dim=1) + + query_embeds = self.embeddings( + input_ids=input_ids, + query_embeds=query_tokens, + ) + + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=query_tokens.shape[1], + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + text_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + output = self.itm_head(text_embeds[:, : query_tokens.size(1), :]) + logits_per_image = output.mean(dim=1) + logits_per_text = logits_per_image.t() + else: + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + image_embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state + + query_embeds = self.embeddings( + input_ids=input_ids, + ) + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=0, + attention_mask=attention_mask, + return_dict=return_dict, + ) + question_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + # normalized features + image_embeds = nn.functional.normalize(self.vision_projection(image_embeds), dim=-1) + text_embeds = nn.functional.normalize(self.text_projection(question_embeds[:, 0, :]), dim=-1) + + # cosine similarity as logits + logits_per_image = torch.matmul(image_embeds, text_embeds.t()) + logits_per_image, _ = logits_per_image.max(dim=1) + + logits_per_text = logits_per_image.t() + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return output + + return Blip2ImageTextMatchingModelOutput( + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 939032f2c894..d0224e3caa5b 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -195,19 +195,19 @@ class ClapOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for audio-text similarity. 
- logits_per_audio:(`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`): + logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`): The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`): + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`): The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`]. - audio_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`]. - text_model_output(`BaseModelOutputWithPooling`): + text_model_output (`BaseModelOutputWithPooling`): The output of the [`ClapTextModel`]. - audio_model_output(`BaseModelOutputWithPooling`): + audio_model_output (`BaseModelOutputWithPooling`): The output of the [`ClapAudioModel`]. """ diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ee85fe312587..ea90f5688c67 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -131,19 +131,19 @@ class CLIPOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): + text_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): + vision_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPVisionModel`]. 
""" diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 97fcf3d1f2b3..a6507e431f68 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -63,19 +63,19 @@ class CLIPSegOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`]. - text_model_output(`BaseModelOutputWithPooling`): + text_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPSegTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): + vision_model_output (`BaseModelOutputWithPooling`): The output of the [`CLIPSegVisionModel`]. """ diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 797a8fa0c0ef..3439aa49dcb0 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -215,19 +215,19 @@ class SiglipOutput(ModelOutput): Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for image-text similarity. - logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. - logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. - text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`]. - image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`]. 
- text_model_output(`BaseModelOutputWithPooling`): + text_model_output (`BaseModelOutputWithPooling`): The output of the [`SiglipTextModel`]. - vision_model_output(`BaseModelOutputWithPooling`): + vision_model_output (`BaseModelOutputWithPooling`): The output of the [`SiglipVisionModel`]. """ diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index b0ceba8cbe67..61d5ff522275 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -97,6 +97,9 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and the call may block forever. + tokenizer_kwargs (`dict`, *optional*): + Additional dictionary of keyword arguments passed along to the tokenizer. + Return: A list of dictionaries containing result, one dictionary per proposed label. The dictionaries contain the following keys: @@ -106,7 +109,7 @@ def __call__(self, images: Union[str, List[str], "Image", List["Image"]], **kwar """ return super().__call__(images, **kwargs) - def _sanitize_parameters(self, **kwargs): + def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): preprocess_params = {} if "candidate_labels" in kwargs: preprocess_params["candidate_labels"] = kwargs["candidate_labels"] @@ -114,10 +117,21 @@ def _sanitize_parameters(self, **kwargs): preprocess_params["timeout"] = kwargs["timeout"] if "hypothesis_template" in kwargs: preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + if tokenizer_kwargs is not None: + preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs return preprocess_params, {}, {} - def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None): + def preprocess( + self, + image, + candidate_labels=None, + hypothesis_template="This is a photo of {}.", + timeout=None, + tokenizer_kwargs=None, + ): + if tokenizer_kwargs is None: + tokenizer_kwargs = {} image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors=self.framework) if self.framework == "pt": @@ -125,7 +139,7 @@ def preprocess(self, image, candidate_labels=None, hypothesis_template="This is inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] padding = "max_length" if self.model.config.model_type == "siglip" else True - text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding) + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding, **tokenizer_kwargs) inputs["text_inputs"] = [text_inputs] return inputs diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 7f4208dae05b..63f7a9631dc5 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -1610,6 +1610,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2ForImageTextRetrieval(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Blip2Model(metaclass=DummyObject): _backends = ["torch"] @@ -1631,6 +1638,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2TextModelWithProjection(metaclass=DummyObject): + _backends 
= ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Blip2VisionModel(metaclass=DummyObject): _backends = ["torch"] @@ -1638,6 +1652,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2VisionModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BloomForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 362079434a93..cee5d710a85f 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -24,6 +24,8 @@ from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig from transformers.testing_utils import ( require_torch, + require_torch_fp16, + require_torch_gpu, require_torch_multi_accelerator, require_vision, slow, @@ -47,7 +49,14 @@ import torch from torch import nn - from transformers import Blip2ForConditionalGeneration, Blip2Model, Blip2VisionModel + from transformers import ( + Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, + Blip2Model, + Blip2TextModelWithProjection, + Blip2VisionModel, + Blip2VisionModelWithProjection, + ) if is_vision_available(): @@ -243,6 +252,7 @@ def __init__( initializer_range=0.02, bos_token_id=0, scope=None, + use_qformer_text_input=False, ): self.parent = parent self.batch_size = batch_size @@ -262,6 +272,7 @@ def __init__( self.initializer_range = initializer_range self.scope = scope self.bos_token_id = bos_token_id + self.use_qformer_text_input = use_qformer_text_input def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -294,6 +305,7 @@ def get_config(self): max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, bos_token_id=self.bos_token_id, + use_qformer_text_input=self.use_qformer_text_input, ) @@ -489,7 +501,7 @@ def test_forward_signature(self): self.assertListEqual(arg_names[:1], expected_arg_names) def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config, _ = self.model_tester.prepare_config_and_inputs_for_common() # Save Blip2Config and check if we can load Blip2VisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: @@ -704,6 +716,16 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi test_attention_outputs = False test_torchscript = False + # TODO: Fix the failed tests + def is_pipeline_test_to_skip( + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ): + if pipeline_test_casse_name == "VisualQuestionAnsweringPipelineTests": + # Get `RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'`. 
+ return True + + return False + def setUp(self): self.model_tester = Blip2ModelTester(self) @@ -752,7 +774,7 @@ def test_forward_signature(self): self.assertListEqual(arg_names[:1], expected_arg_names) def test_load_vision_qformer_text_config(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config, _ = self.model_tester.prepare_config_and_inputs_for_common() # Save Blip2Config and check if we can load Blip2VisionConfig from it with tempfile.TemporaryDirectory() as tmp_dir_name: @@ -840,6 +862,549 @@ def test_initialization(self): ) +class Blip2TextModelWithProjectionTester: + def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training=True): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {"use_qformer_text_input": True} + + self.parent = parent + self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) + self.is_training = is_training + self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test + + def get_config(self): + return Blip2Config.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + ) + + def prepare_config_and_inputs(self): + _, input_ids, attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_model(self, config, input_ids, attention_mask): + model = Blip2TextModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(input_ids, attention_mask=attention_mask, output_attentions=True, output_hidden_states=True) + + self.parent.assertEqual( + result.last_hidden_state.shape, + (self.vision_model_tester.batch_size, input_ids.shape[1], self.qformer_model_tester.hidden_size), + ) + self.parent.assertEqual( + result.text_embeds.shape, + ( + self.vision_model_tester.batch_size, + input_ids.shape[1], + config.image_text_hidden_size, + ), + ) + + with torch.no_grad(): + result2 = model( + input_ids, + attention_mask=attention_mask, + return_dict=not config.use_return_dict, + output_attentions=True, + output_hidden_states=True, + ) + + self.parent.assertTrue(torch.allclose(result.text_embeds, result2[0])) + self.parent.assertTrue(torch.allclose(result.last_hidden_state, result2[1])) + self.parent.assertTrue(torch.allclose(result.hidden_states[0], result2[2][0])) + self.parent.assertTrue(torch.allclose(result.hidden_states[1], result2[2][1])) + self.parent.assertTrue(torch.allclose(result.attentions[0], result2[3][0])) + self.parent.assertTrue(torch.allclose(result.attentions[1], result2[3][1])) + + +@require_torch +class Blip2TextModelWithProjectionTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (Blip2TextModelWithProjection,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + test_resize_embeddings = False + test_attention_outputs = False + test_torchscript = False + + def setUp(self): + self.model_tester = 
Blip2TextModelWithProjectionTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Training is not yet supported") + def test_training(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Blip2TextModelWithProjection does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Blip2TextModelWithProjection does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="Blip2TextModelWithProjection does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Blip2TextModelWithProjection has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="Blip2TextModelWithProjection has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["input_ids", "attention_mask", "position_ids"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + @slow + @require_torch_gpu + def test_model_from_pretrained(self): + model_name = "Salesforce/blip2-itm-vit-g" + model = Blip2TextModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "text_projection")) + + _, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs() + + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(input_ids=input_ids, attention_mask=attention_mask) + + self.assertEqual( + outputs.text_embeds.shape, + ( + self.model_tester.qformer_model_tester.batch_size, + input_ids.shape[1], + model.config.image_text_hidden_size, + ), + ) + + +class Blip2VisionModelWithProjectionTester: + def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training=True): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {"use_qformer_text_input": True} + + self.parent = parent + self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) + self.is_training = is_training + self.num_hidden_layers = self.vision_model_tester.num_hidden_layers + self.num_attention_heads = self.vision_model_tester.num_attention_heads + self.seq_length = self.vision_model_tester.seq_length + self.hidden_size = self.vision_model_tester.hidden_size + self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test + + def get_config(self): + return Blip2Config.from_vision_qformer_text_configs( + 
vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + ) + + def prepare_config_and_inputs(self): + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + def create_and_check_model(self, config, pixel_values): + model = Blip2VisionModelWithProjection(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values, output_attentions=True, output_hidden_states=True) + + self.parent.assertEqual( + result.last_hidden_state.shape, + ( + self.vision_model_tester.batch_size, + self.vision_model_tester.seq_length, + self.qformer_model_tester.hidden_size, + ), + ) + self.parent.assertEqual( + result.image_embeds.shape, + ( + self.vision_model_tester.batch_size, + config.vision_config.hidden_size, + config.image_text_hidden_size, + ), + ) + + with torch.no_grad(): + result2 = model( + pixel_values, + return_dict=not config.use_return_dict, + output_attentions=True, + output_hidden_states=True, + ) + + self.parent.assertTrue(torch.allclose(result.image_embeds, result2[0])) + self.parent.assertTrue(torch.allclose(result.last_hidden_state, result2[1])) + self.parent.assertTrue(torch.allclose(result.hidden_states[0], result2[2][0])) + self.parent.assertTrue(torch.allclose(result.hidden_states[1], result2[2][1])) + self.parent.assertTrue(torch.allclose(result.attentions[0], result2[3][0])) + self.parent.assertTrue(torch.allclose(result.attentions[1], result2[3][1])) + + +@require_torch +class Blip2VisionModelWithProjectionTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (Blip2VisionModelWithProjection,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_head_masking = False + + test_resize_embeddings = False + test_torchscript = False + + def setUp(self): + self.model_tester = Blip2VisionModelWithProjectionTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Training is not yet supported") + def test_training(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Blip2VisionModelWithProjection does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Blip2VisionModelWithProjection does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + 
self.assertTrue(x is None or isinstance(x, nn.Linear)) + + @unittest.skip(reason="Blip2VisionModelWithProjection has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip(reason="Blip2VisionModelWithProjection has no base class and is not available in MODEL_MAPPING") + def test_save_load_fast_init_to_base(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + @slow + @require_torch_gpu + def test_model_from_pretrained(self): + model_name = "Salesforce/blip2-itm-vit-g" + model = Blip2VisionModelWithProjection.from_pretrained(model_name) + self.assertIsNotNone(model) + self.assertTrue(hasattr(model, "vision_projection")) + + _, pixel_values = self.model_tester.prepare_config_and_inputs() + + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(pixel_values=pixel_values) + + self.assertEqual( + outputs.image_embeds.shape, + ( + self.model_tester.vision_model_tester.batch_size, + model.config.num_query_tokens, + model.config.image_text_hidden_size, + ), + ) + + +class Blip2TextRetrievalModelTester: + def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training=True): + if vision_kwargs is None: + vision_kwargs = {} + if qformer_kwargs is None: + qformer_kwargs = {"use_qformer_text_input": True} + + self.parent = parent + self.vision_model_tester = Blip2VisionModelTester(parent, **vision_kwargs) + self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) + self.is_training = is_training + self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test + + def get_config(self): + return Blip2Config.from_vision_qformer_text_configs( + vision_config=self.vision_model_tester.get_config(), + qformer_config=self.qformer_model_tester.get_config(), + ) + + def prepare_config_and_inputs(self): + _, input_ids, attention_mask = self.qformer_model_tester.prepare_config_and_inputs() + _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() + + config = self.get_config() + + return config, input_ids, attention_mask, pixel_values + + def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): + model = Blip2ForImageTextRetrieval(config).to(torch_device).eval() + with torch.no_grad(): + result = model(pixel_values, input_ids, attention_mask, use_image_text_matching_head=True) + + self.parent.assertEqual( + result.logits_per_image.shape, + (self.vision_model_tester.batch_size, 2), + ) + + with torch.no_grad(): + result = model(pixel_values, input_ids, attention_mask) + + self.parent.assertEqual( + result.logits_per_image.shape, + (self.vision_model_tester.batch_size, self.qformer_model_tester.batch_size), + ) + self.parent.assertEqual( + result.logits_per_text.shape, (self.qformer_model_tester.batch_size, self.vision_model_tester.batch_size) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, input_ids, attention_mask, pixel_values = config_and_inputs + inputs_dict = { + "input_ids": input_ids, + 
"attention_mask": attention_mask, + "pixel_values": pixel_values, + } + return config, inputs_dict + + +@require_torch +class Blip2TextRetrievalModelTest(ModelTesterMixin, unittest.TestCase): + all_model_classes = (Blip2ForImageTextRetrieval,) if is_torch_available() else () + fx_compatible = False + test_head_masking = False + test_pruning = False + test_resize_embeddings = False + test_attention_outputs = False + test_torchscript = False + + def setUp(self): + self.model_tester = Blip2TextRetrievalModelTester(self) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip(reason="Hidden_states is tested in individual model tests") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Inputs_embeds is tested in individual model tests") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Blip2ForImageTextRetrieval does not support input and output embeddings") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Retain_grad is tested in individual model tests") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="Blip2Model does not have input/output embeddings") + def test_model_common_attributes(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values", "input_ids", "attention_mask"] + expected_arg_names.extend( + ["use_image_text_matching_head"] if "use_image_text_matching_head" in arg_names else [] + ) + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_load_vision_qformer_text_config(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + # Save Blip2Config and check if we can load Blip2VisionConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + vision_config = Blip2VisionConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.vision_config.to_dict(), vision_config.to_dict()) + + # Save Blip2Config and check if we can load Blip2QFormerConfig from it + with tempfile.TemporaryDirectory() as tmp_dir_name: + config.save_pretrained(tmp_dir_name) + qformer_config = Blip2QFormerConfig.from_pretrained(tmp_dir_name) + self.assertDictEqual(config.qformer_config.to_dict(), qformer_config.to_dict()) + + @slow + @require_torch_gpu + def test_model_from_pretrained(self): + model_name = "Salesforce/blip2-itm-vit-g" + model = Blip2ForImageTextRetrieval.from_pretrained(model_name) + self.assertIsNotNone(model) + + _, input_ids, attention_mask, pixel_values = self.model_tester.prepare_config_and_inputs() + + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + use_image_text_matching_head=True, + ) + self.assertEqual(outputs.logits_per_image.shape, (self.model_tester.qformer_model_tester.batch_size, 2)) + + with torch.no_grad(): + outputs = model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + ) + self.assertEqual( + outputs.logits_per_image.shape, + 
(self.model_tester.vision_model_tester.batch_size, self.model_tester.qformer_model_tester.batch_size), + ) + + @unittest.skip(reason="Training is not yet supported") + def test_training(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + # check if `logit_scale` is initilized as per the original implementation + if name == "logit_scale": + self.assertAlmostEqual( + param.data.item(), + np.log(1 / 0.07), + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + elif name == "temp": + self.assertAlmostEqual( + param.data.item(), + 0.07, + delta=1e-3, + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + else: + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # We will verify our results on an image of cute cats def prepare_img(): url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg" @@ -984,7 +1549,7 @@ def test_inference_opt_multi_accelerator(self): prompt = "Question: which city is this? 
Answer:" inputs = processor(images=image, text=prompt, return_tensors="pt").to(0, dtype=torch.float16) - predictions = model.generate(**inputs) + predictions = model.generate(**inputs, max_new_tokens=11) generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() # Test output @@ -1063,3 +1628,93 @@ def test_expansion_in_processing(self): generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip() self.assertTrue(generated_text_expanded == generated_text) + + @require_torch_gpu + def test_inference_itm(self): + model_name = "Salesforce/blip2-itm-vit-g" + processor = Blip2Processor.from_pretrained(model_name) + model = Blip2ForImageTextRetrieval.from_pretrained(model_name).to(torch_device) + + image = prepare_img() + text = "A woman and her dog sitting in a beach" + inputs = processor(images=image, text=text, return_tensors="pt").to(torch_device) + + # forward pass + out_itm = model(**inputs, use_image_text_matching_head=True) + out = model(**inputs) + + # verify + expected_scores = torch.Tensor([[0.0238, 0.9762]]) + self.assertTrue(torch.allclose(torch.nn.Softmax()(out_itm[0].cpu()), expected_scores, rtol=1e-3, atol=1e-3)) + self.assertTrue(torch.allclose(out[0].cpu(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3)) + + @require_torch_gpu + @require_torch_fp16 + def test_inference_itm_fp16(self): + model_name = "Salesforce/blip2-itm-vit-g" + processor = Blip2Processor.from_pretrained(model_name) + model = Blip2ForImageTextRetrieval.from_pretrained(model_name, torch_dtype=torch.float16).to(torch_device) + + image = prepare_img() + text = "A woman and her dog sitting in a beach" + inputs = processor(images=image, text=text, return_tensors="pt").to(torch_device, dtype=torch.float16) + + # forward pass + out_itm = model(**inputs, use_image_text_matching_head=True) + out = model(**inputs) + + # verify + expected_scores = torch.Tensor([[0.0239, 0.9761]]) + self.assertTrue( + torch.allclose(torch.nn.Softmax()(out_itm[0].cpu().float()), expected_scores, rtol=1e-3, atol=1e-3) + ) + self.assertTrue(torch.allclose(out[0].cpu().float(), torch.Tensor([[0.4406]]), rtol=1e-3, atol=1e-3)) + + @require_torch_gpu + @require_torch_fp16 + def test_inference_vision_with_projection_fp16(self): + model_name = "Salesforce/blip2-itm-vit-g" + processor = Blip2Processor.from_pretrained(model_name) + model = Blip2VisionModelWithProjection.from_pretrained(model_name, torch_dtype=torch.float16).to(torch_device) + + image = prepare_img() + inputs = processor(images=image, return_tensors="pt").to(torch_device, dtype=torch.float16) + + # forward pass + out = model(**inputs) + + # verify + expected_image_embeds = [ + -0.093994140625, + -0.075927734375, + 0.031890869140625, + 0.053009033203125, + 0.0352783203125, + -0.01190185546875, + ] + self.assertTrue(np.allclose(out.image_embeds[0][0][:6].tolist(), expected_image_embeds, atol=1e-3)) + + @require_torch_gpu + @require_torch_fp16 + def test_inference_text_with_projection_fp16(self): + model_name = "Salesforce/blip2-itm-vit-g" + processor = Blip2Processor.from_pretrained(model_name) + model = Blip2TextModelWithProjection.from_pretrained(model_name, torch_dtype=torch.float16).to(torch_device) + + inputs = processor(text="a woman sitting on the beach with a dog", padding=True, return_tensors="pt").to( + torch_device + ) + + # forward pass + out = model(**inputs) + + # verify + expected_text_embeds = [ + -0.1082763671875, + 0.053192138671875, + -0.02825927734375, + 0.0169830322265625, + 
0.08648681640625, + -0.04656982421875, + ] + self.assertTrue(np.allclose(out.text_embeds[0][0][:6].tolist(), expected_text_embeds, atol=1e-3)) diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index b4501e437335..b57adf609d1e 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -279,3 +279,46 @@ def test_siglip_model_pt(self): ] * 5, ) + + @slow + @require_torch + def test_blip2_model_pt(self): + image_classifier = pipeline( + task="zero-shot-image-classification", + model="Salesforce/blip2-itm-vit-g", + ) + # This is an image of 2 cats with remotes and no planes + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + output = image_classifier( + image, + candidate_labels=["2 cats", "a plane", "a remote"], + tokenizer_kwargs={"return_token_type_ids": False}, + ) + + self.assertEqual( + nested_simplify(output), + [ + {"score": 0.369, "label": "2 cats"}, + {"score": 0.333, "label": "a remote"}, + {"score": 0.297, "label": "a plane"}, + ], + ) + + output = image_classifier( + [image] * 5, + candidate_labels=["2 cats", "a plane", "a remote"], + batch_size=2, + tokenizer_kwargs={"return_token_type_ids": False}, + ) + + self.assertEqual( + nested_simplify(output), + [ + [ + {"score": 0.369, "label": "2 cats"}, + {"score": 0.333, "label": "a remote"}, + {"score": 0.297, "label": "a plane"}, + ] + ] + * 5, + ) diff --git a/utils/check_repo.py b/utils/check_repo.py index 02570e3c60c3..167482bb08af 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -164,6 +164,8 @@ "ClapAudioModel", "ClapAudioModelWithProjection", "Blip2ForConditionalGeneration", + "Blip2TextModelWithProjection", + "Blip2VisionModelWithProjection", "Blip2QFormerModel", "Blip2VisionModel", "ErnieMForInformationExtraction",
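
For reviewers, a minimal usage sketch of the retrieval head added by this diff, closely mirroring `test_inference_itm` above. The checkpoint name, the `use_image_text_matching_head` flag, and the `logits_per_image` output field all come from the tests; the device handling, the printed fields, and the reading of column 1 as the "match" class are illustrative assumptions (the latter inferred from the expected scores in the test, not asserted by the diff).

```python
import requests
import torch
from PIL import Image

from transformers import Blip2ForImageTextRetrieval, Blip2Processor

device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint used by the new integration tests in this diff.
model_name = "Salesforce/blip2-itm-vit-g"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForImageTextRetrieval.from_pretrained(model_name).to(device)

# Same demo image the tests load in `prepare_img`.
url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "A woman and her dog sitting in a beach"

inputs = processor(images=image, text=text, return_tensors="pt").to(device)

with torch.no_grad():
    # Image-text matching (ITM) head: logits_per_image has shape (batch_size, 2);
    # judging from the expected scores in `test_inference_itm`, index 1 is the "match" class.
    itm_out = model(**inputs, use_image_text_matching_head=True)
    itm_probs = torch.softmax(itm_out.logits_per_image, dim=1)

    # Contrastive (ITC) head: image-text similarity logits of shape
    # (image_batch_size, text_batch_size).
    itc_out = model(**inputs)

print(f"match probability: {itm_probs[0, 1].item():.4f}")
print(f"image-text similarity: {itc_out.logits_per_image[0, 0].item():.4f}")
```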
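
The two projection models can likewise be used on their own to produce the embeddings that the retrieval head compares. A sketch under the same assumptions; the input handling follows the fp16 integration tests above, and the shape comments restate the assertions in the new model testers (`text_embeds` per token, `image_embeds` per query token, both projected to `image_text_hidden_size`).

```python
import requests
import torch
from PIL import Image

from transformers import (
    Blip2Processor,
    Blip2TextModelWithProjection,
    Blip2VisionModelWithProjection,
)

model_name = "Salesforce/blip2-itm-vit-g"
processor = Blip2Processor.from_pretrained(model_name)

text_model = Blip2TextModelWithProjection.from_pretrained(model_name)
vision_model = Blip2VisionModelWithProjection.from_pretrained(model_name)

url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_inputs = processor(text="a woman sitting on the beach with a dog", padding=True, return_tensors="pt")
image_inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    # text_embeds: (batch_size, sequence_length, image_text_hidden_size)
    text_out = text_model(**text_inputs)
    # image_embeds: (batch_size, num_query_tokens, image_text_hidden_size)
    image_out = vision_model(**image_inputs)

print(text_out.text_embeds.shape)
print(image_out.image_embeds.shape)
```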
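
Finally, a sketch of the zero-shot image-classification pipeline change: `tokenizer_kwargs` is now forwarded verbatim to the tokenizer call in `preprocess`, and the new BLIP-2 pipeline test disables `token_type_ids` this way (presumably because the Q-Former text path does not consume them). The labels and expected score ordering come from `test_blip2_model_pt`; the image URL is an assumption pointing at the same two-cats COCO image as the local test fixture.

```python
import requests
from PIL import Image

from transformers import pipeline

classifier = pipeline(
    task="zero-shot-image-classification",
    model="Salesforce/blip2-itm-vit-g",
)

# Same image as ./tests/fixtures/tests_samples/COCO/000000039769.png (two cats with remotes).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# `tokenizer_kwargs` is passed through to the tokenizer inside `preprocess`.
outputs = classifier(
    image,
    candidate_labels=["2 cats", "a plane", "a remote"],
    tokenizer_kwargs={"return_token_type_ids": False},
)

# A list of {"score": ..., "label": ...} dicts, highest score first.
print(outputs)
```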