Commit 4d45312

Merge branch 'huggingface:main' into pr1280_fix
2 parents: 2c3d979 + 82a1c96

16 files changed, +300 -28 lines

README.md (+1)

```diff
@@ -232,6 +232,7 @@ The following model architectures, tasks and device distributions have been validated
 | ClipSeg | | <div style="text-align:left"><li>Single card</li></div> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 | idefics2 | <div style="text-align:left"><li>LoRA</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| Paligemma | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 | Segment Anything Model | | <div style="text-align:left"><li>Single card</li></div> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | VideoMAE | | <div style="text-align:left"><li>Single card</li></div> | <li>[Video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)</li> |
 | TableTransformer | | <div style="text-align:left"><li>Single card</li></div> | <li>[table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection) </li> |
```

docs/source/index.mdx (+1)

```diff
@@ -77,6 +77,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be
 | OWLViT | | <div style="text-align:left"><li>Single card</li></div> | <li>[zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)</li> |
 | ClipSeg | | <div style="text-align:left"><li>Single card</li></div> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | Llava / Llava-next | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
+| Paligemma | | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 | idefics2 | <div style="text-align:left"><li>LoRA</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
 | SAM | | <div style="text-align:left"><li>Single card</li></div> | <li>[object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)</li> |
 | VideoMAE | | <div style="text-align:left"><li>Single card</li></div> | <li>[Video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)</li> |
```

examples/image-to-text/README.md (+10, -3)

````diff
@@ -32,6 +32,7 @@ Models that have been validated:
 - [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)
 - [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b)
 - [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
+- [google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224)
 
 ### Inference with BF16
 
@@ -77,16 +78,22 @@ python3 run_pipeline.py \
 ```
 
 To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
-
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-34b-hf \
     --use_hpu_graphs \
     --bf16
 ```
 
-To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
+To run google/paligemma-3b-mix-224 inference, use the following command:
+```bash
+python3 run_pipeline.py \
+    --model_name_or_path google/paligemma-3b-mix-224 \
+    --use_hpu_graphs \
+    --bf16
+```
 
+To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llama3-llava-next-8b-hf \
@@ -405,4 +412,4 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json PT_HPU_ENABLE_LAZY_COLLECTI
     --bf16 \
     --use_flash_attention \
     --flash_attention_recompute
-```
+```
````

examples/image-to-text/run_pipeline.py (+11, -3)

```diff
@@ -187,11 +187,16 @@ def main():
     model_type = AutoConfig.from_pretrained(args.model_name_or_path).model_type
     if args.image_path is None and model_type in ["llava", "idefics2", "mllama"]:
         args.image_path = ["https://llava-vl.github.io/static/images/view.jpg"]
+    elif args.image_path is None and model_type == "paligemma":
+        args.image_path = [
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
+        ]
     elif args.image_path is None and model_type == "llava_next":
         args.image_path = [
             "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
         ]
-    if args.prompt is None and model_type in ["llava", "idefics2", "llava_next", "mllama"]:
+
+    if args.prompt is None and model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma"]:
         processor = AutoProcessor.from_pretrained(args.model_name_or_path)
         conversation = [
             {
@@ -202,7 +207,10 @@ def main():
                 ],
             }
         ]
-        args.prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        if model_type == "paligemma":
+            args.prompt = "caption es"
+        else:
+            args.prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
 
     image_paths = args.image_path
     image_paths_len = len(image_paths)
@@ -276,7 +284,7 @@ def main():
         htcore.hpu_initialize(generator.model)
 
     # delete once pipeline integrate AutoProcessor as preprocess engine
-    if model_type in ["idefics2", "mllama"]:
+    if model_type in ["idefics2", "mllama", "paligemma"]:
         from transformers.image_utils import load_image
 
         def preprocess(self, image, prompt=None, timeout=None):
```

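PaliGemma is prompted with plain task prefixes (here `caption es`, Spanish captioning) rather than a chat template, which is why the default-prompt logic above special-cases it. The last hunk shows only the signature of the `preprocess` override that routes these models' images through `AutoProcessor`; as a rough, hypothetical sketch of what such an override does (only the signature and the `load_image` import appear in the diff):

```python
# Hypothetical body for the preprocess override; only its signature and the
# load_image import are shown in the diff above.
from transformers.image_utils import load_image


def preprocess(self, image, prompt=None, timeout=None):
    # load_image accepts a URL, a local path, or a PIL image
    image = load_image(image, timeout=timeout)
    # Run the model's AutoProcessor to get tensors the model can consume
    return self.processor(images=image, text=prompt, return_tensors="pt")
```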
examples/stable-diffusion/text_to_image_generation.py (+24, -1)

```diff
@@ -286,8 +286,12 @@ def main():
         action="store_true",
         help="Use rescale_betas_zero_snr for controlling image brightness",
     )
+    parser.add_argument("--optimize", action="store_true", help="Use optimized pipeline.")
     args = parser.parse_args()
 
+    if args.optimize and not args.use_habana:
+        raise ValueError("--optimize can only be used with --use-habana.")
+
     # Select stable diffuson pipeline based on input
     sdxl_models = ["stable-diffusion-xl", "sdxl"]
     sd3_models = ["stable-diffusion-3"]
@@ -302,6 +306,8 @@ def main():
         scheduler = GaudiEulerDiscreteScheduler.from_pretrained(
             args.model_name_or_path, subfolder="scheduler", **kwargs
         )
+        if args.optimize:
+            scheduler.hpu_opt = True
     elif args.scheduler == "euler_ancestral_discrete":
         scheduler = GaudiEulerAncestralDiscreteScheduler.from_pretrained(
             args.model_name_or_path, subfolder="scheduler", **kwargs
@@ -417,14 +423,31 @@ def main():
 
             pipeline = AutoPipelineForInpainting.from_pretrained(args.model_name_or_path, **kwargs)
 
-        else:
+        elif args.optimize:
             # Import SDXL pipeline
+            import habana_frameworks.torch.hpu as torch_hpu
+
+            from optimum.habana.diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_mlperf import (
+                StableDiffusionXLPipeline_HPU,
+            )
+
+            pipeline = StableDiffusionXLPipeline_HPU.from_pretrained(
+                args.model_name_or_path,
+                **kwargs,
+            )
+
+            pipeline.to(torch.device("hpu"))
+            pipeline.unet.set_default_attn_processor(pipeline.unet)
+            if args.use_hpu_graphs:
+                pipeline.unet = torch_hpu.wrap_in_hpu_graph(pipeline.unet)
+        else:
             from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline
 
             pipeline = GaudiStableDiffusionXLPipeline.from_pretrained(
                 args.model_name_or_path,
                 **kwargs,
             )
+
         if args.lora_id:
             pipeline.load_lora_weights(args.lora_id)
 
```

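Pulled out of the argument-parsing scaffolding, the `--optimize` branch boils down to the following. This is a condensed sketch assuming a Gaudi machine with `habana_frameworks` installed; the checkpoint name is only an example:

```python
# Condensed sketch of the --optimize path; requires Gaudi hardware with
# habana_frameworks available.
import torch
import habana_frameworks.torch.hpu as torch_hpu

from optimum.habana.diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_mlperf import (
    StableDiffusionXLPipeline_HPU,
)

# Example model id; any SDXL checkpoint should work here
pipeline = StableDiffusionXLPipeline_HPU.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
pipeline.to(torch.device("hpu"))
pipeline.unet.set_default_attn_processor(pipeline.unet)
# Wrapping the UNet in an HPU graph trades memory for lower per-step launch overhead
pipeline.unet = torch_hpu.wrap_in_hpu_graph(pipeline.unet)
```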
optimum/habana/diffusers/models/attention_processor.py (+11, -7)

```diff
@@ -19,7 +19,7 @@
 import torch
 import torch.nn.functional as F
 from diffusers.models.attention_processor import Attention
-from diffusers.utils import USE_PEFT_BACKEND, logging
+from diffusers.utils import deprecate, logging
 from diffusers.utils.import_utils import is_xformers_available
 from torch import nn
 
@@ -107,8 +107,13 @@ def __call__(
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
+        *args,
+        **kwargs,
     ) -> torch.FloatTensor:
+        if len(args) > 0 or kwargs.get("scale", None) is not None:
+            deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
+            deprecate("scale", "1.0.0", deprecation_message)
+
         residual = hidden_states
         if attn.spatial_norm is not None:
             hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -132,16 +137,15 @@ def __call__(
         if attn.group_norm is not None:
             hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
 
-        args = () if USE_PEFT_BACKEND else (scale,)
-        query = attn.to_q(hidden_states, *args)
+        query = attn.to_q(hidden_states)
 
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
         elif attn.norm_cross:
             encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
 
-        key = attn.to_k(encoder_hidden_states, *args)
-        value = attn.to_v(encoder_hidden_states, *args)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
 
         inner_dim = key.shape[-1]
         head_dim = inner_dim // attn.heads
@@ -171,7 +175,7 @@ def __call__(
         hidden_states = hidden_states.to(query.dtype)
 
         # linear proj
-        hidden_states = attn.to_out[0](hidden_states, *args)
+        hidden_states = attn.to_out[0](hidden_states)
         # dropout
         hidden_states = attn.to_out[1](hidden_states)
 
```

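This tracks the corresponding upstream diffusers change: with the PEFT backend, LoRA scaling happens inside the projection layers themselves, so `to_q`/`to_k`/`to_v`/`to_out` no longer take a positional `scale`. The signature keeps `*args, **kwargs` only to warn old callers; in isolation the guard pattern looks like this (a sketch using diffusers' `deprecate` helper):

```python
# Isolated sketch of the deprecation guard used in the diff above.
from diffusers.utils import deprecate


def attention_call(*args, **kwargs):
    # Old callers may still pass `scale` positionally or as a keyword;
    # warn once and otherwise ignore it.
    if len(args) > 0 or kwargs.get("scale", None) is not None:
        deprecate("scale", "1.0.0", "The `scale` argument is deprecated and will be ignored.")
```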
optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_mlperf.py (+4, -2)

```diff
@@ -170,6 +170,7 @@ def __init__(
         )
         self.unet.set_default_attn_processor = set_default_attn_processor_hpu
         self.unet.forward = gaudi_unet_2d_condition_model_forward
+        self.quantized = False
 
     def run_unet(
         self,
@@ -609,7 +610,6 @@ def __call__(
 
         self._num_timesteps = len(timesteps)
         with self.progress_bar(total=num_inference_steps) as progress_bar:
-            timesteps = [t.item() for t in timesteps]
             if self.quantized:
                 for i, t in enumerate(timesteps[0:-2]):
                     if self.interrupt:
@@ -666,7 +666,9 @@ def __call__(
                         )
                     hb_profiler.step()
             else:
-                for i, t in enumerate(timesteps):
+                for i in range(num_inference_steps):
+                    t = timesteps[0]
+                    timesteps = torch.roll(timesteps, shifts=-1, dims=0)
                     if self.interrupt:
                         continue
                     latents = self.run_unet(
```

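The rewritten loop stops materializing timesteps as Python scalars (the removed `t.item()` comprehension forced a host sync) and instead keeps the tensor on device, rotating it with `torch.roll` so that index 0 always holds the current step; presumably this keeps shapes static across iterations for HPU graph replay. A minimal, self-contained illustration of the rotation pattern:

```python
# Minimal illustration of the timestep-rotation loop from the diff above.
import torch

timesteps = torch.tensor([999, 749, 499, 249])  # example schedule
num_inference_steps = timesteps.shape[0]
for i in range(num_inference_steps):
    t = timesteps[0]                                      # current step, still a tensor
    timesteps = torch.roll(timesteps, shifts=-1, dims=0)  # rotate for the next iteration
    print(i, int(t))  # -> 0 999, 1 749, 2 499, 3 249
```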
optimum/habana/transformers/generation/utils.py (+1)

```diff
@@ -109,6 +109,7 @@
     "qwen2_moe",
     "xglm",
     "whisper",
+    "paligemma",
     "idefics2",
     "mllama",
 ]
```

optimum/habana/transformers/modeling_utils.py (+6)

```diff
@@ -108,6 +108,7 @@
     GaudiMptModel,
     GaudiOPTForCausalLM,
     GaudiOPTLearnedPositionalEmbedding,
+    GaudiPaliGemmaForConditionalGeneration,
     GaudiPersimmonAttention,
     GaudiPersimmonDecoderLayer,
     GaudiPersimmonForCausalLM,
@@ -436,6 +437,11 @@ def adapt_transformers_to_gaudi():
         GaudiLlavaNextForConditionalGeneration
     )
 
+    # Optimization for paligemma on Gaudi
+    transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration = (
+        GaudiPaliGemmaForConditionalGeneration
+    )
+
     # Optimization for idefics2 on Gaudi
     transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration = (
         GaudiIdefics2ForConditionalGeneration
```

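`adapt_transformers_to_gaudi()` works by rebinding classes on the `transformers` module tree, so later `from_pretrained()` calls resolve to the Gaudi subclass without any change to user code. In miniature, the PaliGemma registration above amounts to:

```python
# What the registration does at import time: after this assignment,
# transformers' Auto* factories construct the Gaudi-optimized class.
import transformers
from optimum.habana.transformers.models import GaudiPaliGemmaForConditionalGeneration

transformers.models.paligemma.modeling_paligemma.PaliGemmaForConditionalGeneration = (
    GaudiPaliGemmaForConditionalGeneration
)
```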
optimum/habana/transformers/models/__init__.py (+1)

```diff
@@ -189,6 +189,7 @@
     gaudi_opt_model_forward,
 )
 from .owlvit import gaudi_owlvitclasspredictionhead_forward
+from .paligemma import GaudiPaliGemmaForConditionalGeneration
 from .persimmon import (
     GaudiPersimmonAttention,
     GaudiPersimmonDecoderLayer,
```

optimum/habana/transformers/models/gemma/modeling_gemma.py (+8, -7)

```diff
@@ -135,7 +135,7 @@ def forward(self, cur, dim, idx):
 class GaudiGemmaAttention(GemmaAttention):
     def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None):
         super().__init__(config, layer_idx)
-
+        config.rope_scaling = config.rope_scaling if hasattr(config, "rope_scaling") else None
         self.matmul_qk = Matmul()
         self.matmul_av = Matmul()
         self.k_cache = KVCache()
@@ -605,12 +605,13 @@ def forward(
             position_ids = position_ids.unsqueeze(0)
 
         # HPU specific mask generation
-        attention_mask = _gaudi_prepare_4d_causal_attention_mask(
-            attention_mask,
-            input_ids.shape if input_ids is not None else (batch_size, seq_length),
-            inputs_embeds,
-            past_seen_tokens,
-        )
+        if attention_mask.dim() != 4:
+            attention_mask = _gaudi_prepare_4d_causal_attention_mask(
+                attention_mask,
+                input_ids.shape if input_ids is not None else (batch_size, seq_length),
+                inputs_embeds,
+                past_seen_tokens,
+            )
         # embed positions
         hidden_states = inputs_embeds
 
```

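The new guard skips Gemma's own causal-mask construction when the caller already supplies an expanded 4D mask, presumably because PaliGemma builds its own mask over the image-prefix tokens before handing off to the language model; plain 2D padding masks are expanded as before. An illustrative sketch of the shape distinction (values are examples, not from the diff):

```python
# Illustrative shapes for the dim() guard above; example values only.
import torch

batch, q_len, kv_len = 2, 16, 16
mask_2d = torch.ones(batch, q_len)              # tokenizer padding mask: needs 4D expansion
mask_4d = torch.zeros(batch, 1, q_len, kv_len)  # caller-built additive mask: used as-is

for mask in (mask_2d, mask_4d):
    if mask.dim() != 4:
        print("expand via _gaudi_prepare_4d_causal_attention_mask")
    else:
        print("use the provided 4D mask directly")
```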
optimum/habana/transformers/models/paligemma/__init__.py (new file, +1)

```diff
@@ -0,0 +1 @@
+from .modeling_paligemma import GaudiPaliGemmaForConditionalGeneration
```
