
Commit ed61be8

Merge remote-tracking branch 'upstream/main' into lcalabri/add_some_gemma_test_cases
2 parents: f853323 + 03b50d2

17 files changed: +192 -84 lines changed

examples/contrastive-image-text/README.md

+2 -1

@@ -235,7 +235,8 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
 --dataloader_num_workers 1 \
 --mediapipe_dataloader \
 --distribution_strategy fast_ddp \
---trust_remote_code
+--trust_remote_code \
+--sdp_on_bf16
 ```

 > `--mediapipe_dataloader` only works on Gaudi2.

examples/image-to-text/README.md

+3 -2

@@ -112,13 +112,14 @@ python3 run_pipeline.py \
 --bf16
 ```

-To run mllama inference, use the following command:
+To run mllama inference using reduced precision in the SDPA, use the following command:

 ```bash
 python3 run_pipeline.py \
 --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct \
 --use_hpu_graphs \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 ### Inference with FP8

examples/image-to-text/run_pipeline.py

+9

@@ -174,6 +174,11 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend",
+    )

     args = parser.parse_args()


@@ -304,6 +309,10 @@ def main():
         "flash_attention_recompute": args.flash_attention_recompute,
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
+
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.use_kv_cache:
         generate_kwargs["use_cache"] = args.use_kv_cache

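For context, here is a minimal standalone sketch (not part of the commit) of how the new `--sdp_on_bf16` switch is wired up: the flag simply flips PyTorch's reduced-precision reduction setting for the math SDPA backend before any attention runs. Only the flag name and the `torch._C._set_math_sdp_allow_fp16_bf16_reduction` call come from the diff above; the toy tensors and CPU execution are illustrative assumptions.

```python
# Sketch only, assuming a PyTorch build (>= 2.5) that exposes this private hook,
# which is the call used in run_pipeline.py above.
import argparse

import torch
import torch.nn.functional as F

parser = argparse.ArgumentParser()
parser.add_argument(
    "--sdp_on_bf16",
    action="store_true",
    help="Allow PyTorch to use reduced precision in the SDPA math backend",
)
args = parser.parse_args()

if args.sdp_on_bf16:
    # Let the math (non-fused) SDPA backend keep bf16/fp16 accumulations instead
    # of upcasting to fp32, trading a little accuracy for throughput.
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Toy attention call that the setting would affect when the math backend is picked.
q = k = v = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)
print(F.scaled_dot_product_attention(q, k, v).shape)
```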

examples/question-answering/README.md

+59 -8

@@ -44,7 +44,30 @@ For the following cases, an example of a Gaudi configuration file is given
 This example code fine-tunes BERT on the SQuAD1.1 dataset.

 ```bash
-PT_HPU_LAZY_MODE=0 python run_qa.py \
+python run_qa.py \
+--model_name_or_path bert-large-uncased-whole-word-masking \
+--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 32 \
+--per_device_eval_batch_size 8 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/squad/ \
+--use_habana \
+--use_lazy_mode \
+--use_hpu_graphs_for_inference \
+--throughput_warmup_steps 3 \
+--bf16 \
+--sdp_on_bf16
+```
+
+For torch.compile mode,
+```bash
+PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \
 --model_name_or_path bert-large-uncased-whole-word-masking \
 --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
 --dataset_name squad \

@@ -62,16 +85,40 @@ PT_HPU_LAZY_MODE=0 python run_qa.py \
 --torch_compile \
 --use_lazy_mode false \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

-
 ### Multi-card Training

 Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs:

 ```bash
-PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
+python ../gaudi_spawn.py \
+--world_size 8 --use_mpi run_qa.py \
+--model_name_or_path bert-large-uncased-whole-word-masking \
+--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 32 \
+--per_device_eval_batch_size 8 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/squad_output/ \
+--use_habana \
+--use_lazy_mode \
+--use_hpu_graphs_for_inference \
+--throughput_warmup_steps 3 \
+--bf16 \
+--sdp_on_bf16
+```
+
+For torch.compile mode,
+```bash
+PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \
 --world_size 8 --use_mpi run_qa.py \
 --model_name_or_path bert-large-uncased-whole-word-masking \
 --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \

@@ -90,7 +137,8 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
 --torch_compile \
 --use_lazy_mode false \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```


@@ -117,7 +165,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed path_to_my_deepspeed_config
+--deepspeed path_to_my_deepspeed_config \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -175,7 +224,8 @@ python ../gaudi_spawn.py \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
 --max_train_samples 45080 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```


@@ -197,7 +247,8 @@ python run_qa.py \
 --use_habana \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```


examples/speech-recognition/README.md

+8 -4

@@ -87,7 +87,8 @@ python run_speech_recognition_ctc.py \
 --throughput_warmup_steps="3" \
 --bf16 \
 --use_hpu_graphs_for_training \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```

 On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.

@@ -128,7 +129,8 @@ python ../gaudi_spawn.py \
 --throughput_warmup_steps 3 \
 --bf16 \
 --use_hpu_graphs_for_training \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```

 On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.

@@ -176,7 +178,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --gaudi_config_name Habana/wav2vec2 \
 --throughput_warmup_steps 3 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```

 [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.

@@ -208,7 +211,8 @@ python run_speech_recognition_ctc.py \
 --use_lazy_mode \
 --gaudi_config_name="Habana/wav2vec2" \
 --bf16 \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```
 ## Sequence to Sequence


examples/stable-diffusion/text_to_image_generation.py

+1

@@ -570,6 +570,7 @@ def main():
         args.model_name_or_path,
         **kwargs,
     )
+    pipeline.unet.set_default_attn_processor(pipeline.unet)

     if args.unet_adapter_name_or_path is not None:
         from peft import PeftModel

examples/text-classification/README.md

+12 -6

@@ -60,7 +60,8 @@ python run_glue.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 > If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.

@@ -88,7 +89,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 > If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.

@@ -116,7 +118,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed path_to_my_deepspeed_config
+--deepspeed path_to_my_deepspeed_config \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -160,7 +163,8 @@ python run_glue.py \
 --use_habana \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 ## Llama Guard on MRPC

@@ -190,7 +194,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -216,5 +221,6 @@ python run_glue.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 2 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

examples/text-generation/run_lm_eval.py

-1

@@ -217,7 +217,6 @@ def main():
         for k, v in mem.items():
             print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
     json.dump(results, open(args.output_file, "w"), indent=2)
-    print(json.dumps(results, indent=2))
     if args.quant_config:
         finalize_quantization(model)

examples/text-generation/utils.py

+5 -6

@@ -605,13 +605,12 @@ def setup_tokenizer(args, model, assistant_model, logger):
         tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
         tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)

-    # HACK: MiniCPM3 has multiple eos_tokens and does not specify padding token. Set both to second one.
-    if model.config.model_type == "minicpm3":
-        tokenizer.pad_token = tokenizer.eos_token
-        model.generation_config.pad_token_id = model.generation_config.eos_token_id[-1]
+    # HACK: MiniCPM3 does not support list EOS token ID generation config.
+    if model.config.model_type == "minicpm3" and isinstance(model.generation_config.eos_token_id, list):
+        logger.warning(
+            f"Model type {model.config.model_type} does not support list style EOS token ID in generation config. Only last eos token id will be used."
+        )
         model.generation_config.eos_token_id = model.generation_config.eos_token_id[-1]
-        if len(model.generation_config.eos_token_id) > 1:
-            logger.warning("Multiple EOS token IDs found. Only last eos token id will be used.")

     # Some models like GPT2 do not have a PAD token so we have to set it if necessary
     if tokenizer.pad_token is None:
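The rewritten MiniCPM3 hack above only triggers when `generation_config.eos_token_id` is actually a list. A minimal sketch of the same normalization, with a stand-in config object instead of transformers' `GenerationConfig` (the class name and example token IDs are illustrative, not from the commit):

```python
# Sketch only: normalize a list-valued eos_token_id the way the diff above does.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FakeGenerationConfig:
    """Stand-in for transformers' GenerationConfig, holding only eos_token_id."""

    def __init__(self, eos_token_id):
        self.eos_token_id = eos_token_id


def normalize_eos(generation_config, model_type="minicpm3"):
    # Mirror the diff: only touch models that cannot handle a list of EOS ids.
    if model_type == "minicpm3" and isinstance(generation_config.eos_token_id, list):
        logger.warning(
            f"Model type {model_type} does not support list style EOS token ID in generation config. "
            "Only last eos token id will be used."
        )
        generation_config.eos_token_id = generation_config.eos_token_id[-1]
    return generation_config


cfg = normalize_eos(FakeGenerationConfig([2, 73440]))
print(cfg.eos_token_id)  # -> 73440
```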

optimum/habana/accelerate/utils/transformer_engine.py

+14 -1

@@ -91,7 +91,20 @@ def __init__(self):
                     enable_recompute=module.enable_recompute,
                 )

-            def forward(self, query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode):
+            def forward(
+                self,
+                query,
+                key,
+                value,
+                attn_mask,
+                dropout_p,
+                is_causal,
+                scale,
+                softmax_mode,
+                recompute_mode,
+                valid_sequence_lengths,
+                padding_side="left",
+            ):
                 return self._hpu_kernel_fsdpa(query, key, value, attn_mask, is_causal, softmax_mode)

         setattr(model, name, TE_ModuleFusedSDPA())
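The widened `forward` signature above keeps the TE wrapper call-compatible with callers that now pass `recompute_mode`, `valid_sequence_lengths`, and `padding_side`, while the wrapped kernel still receives only the subset of arguments it uses. A rough sketch of that pattern, using `torch.nn.functional.scaled_dot_product_attention` as a stand-in for the real HPU fused kernel (the toy class and tensor shapes are assumptions, not the library's code):

```python
# Sketch only: accept the full FusedSDPA-style argument list, forward a subset.
import torch
import torch.nn.functional as F


class ToyFusedSDPA(torch.nn.Module):
    def forward(
        self,
        query,
        key,
        value,
        attn_mask,
        dropout_p,
        is_causal,
        scale,
        softmax_mode,
        recompute_mode,
        valid_sequence_lengths,
        padding_side="left",
    ):
        # Extra arguments (dropout_p, scale, softmax_mode, recompute_mode,
        # valid_sequence_lengths, padding_side) are accepted so call sites do not
        # break, but this toy kernel ignores them, like the TE wrapper above.
        return F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, is_causal=is_causal)


q = k = v = torch.randn(1, 4, 8, 16)
out = ToyFusedSDPA()(q, k, v, None, 0.0, True, None, "fast", False, None)
print(out.shape)
```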
