diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 51f4a5dda2..e4dbb05472 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -44,7 +44,8 @@ python3 run_pipeline.py \ --model_name_or_path Salesforce/blip-image-captioning-large \ --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-1.5-7b inference, use the following command: @@ -52,7 +53,8 @@ To run Llava-1.5-7b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-7b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-1.5-13b inference, use the following command: @@ -60,7 +62,8 @@ To run Llava-1.5-13b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-13b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-v1.6-mistral-7b inference, use the following command: @@ -68,7 +71,8 @@ To run Llava-v1.6-mistral-7b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-v1.6-vicuna-13b inference, use the following command: @@ -76,7 +80,8 @@ To run Llava-v1.6-vicuna-13b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: @@ -84,7 +89,8 @@ To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-34b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run google/paligemma-3b-mix-224 inference, use the following command: @@ -92,7 +98,8 @@ To run google/paligemma-3b-mix-224 inference, use the following command: python3 run_pipeline.py \ --model_name_or_path google/paligemma-3b-mix-224 \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: @@ -100,7 +107,8 @@ To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llama3-llava-next-8b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run idefics2 inference, use the following command: @@ -109,7 +117,8 @@ To run idefics2 inference, use the following command: python3 run_pipeline.py \ --model_name_or_path HuggingFaceM4/idefics2-8b \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run mllama inference using reduced precision in the SDPA, use the following command: @@ -134,7 +143,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-1.5-7b: @@ -143,7 +153,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-1.5-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` @@ -153,7 +164,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json 
python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b: @@ -162,7 +174,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b: @@ -171,7 +184,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b: @@ -180,7 +194,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` ### Inference with FusedSDPA diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index bf6cd04aec..4c9b42f7c0 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -190,14 +190,6 @@ Here is a DeepSpeed configuration you can use to train your models on Gaudi: } ``` - -### Training in torch.compile mode - -Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \ -a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \ -b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags. - - ## Fine-tuning Llama on SQuAD1.1 > [!NOTE] diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md index 4d5eb69b91..998d6433bb 100644 --- a/examples/speech-recognition/README.md +++ b/examples/speech-recognition/README.md @@ -259,7 +259,8 @@ python run_speech_recognition_seq2seq.py \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ --dataloader_num_workers 8 \ - --throughput_warmup_steps 3 + --throughput_warmup_steps 3 \ + --sdp_on_bf16 ``` If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition. 
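Every `--sdp_on_bf16` flag added in this patch maps to a single PyTorch switch (wired up below in `run_generation.py`): it lets the SDPA math backend keep its reductions in reduced precision instead of fp32. A minimal sketch of what the flag turns on, assuming a recent PyTorch build that exposes the private hook this patch relies on:

```python
# Minimal sketch, assuming a recent PyTorch that exposes the private hook used by this patch.
import torch
import torch.nn.functional as F


def enable_sdp_on_bf16(enabled: bool = True) -> None:
    # Same call run_generation.py makes when --sdp_on_bf16 is passed:
    # allow the SDPA math backend to accumulate in fp16/bf16 instead of fp32.
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(enabled)


if __name__ == "__main__":
    enable_sdp_on_bf16(True)
    q, k, v = (torch.randn(1, 8, 128, 64, dtype=torch.bfloat16) for _ in range(3))
    # With the switch on, the math backend may keep the attention reductions in bf16.
    out = F.scaled_dot_product_attention(q, k, v)
    print(out.shape)
```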
@@ -329,5 +330,6 @@ python run_speech_recognition_seq2seq.py \ --use_habana \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ - --dataloader_num_workers 8 + --dataloader_num_workers 8 \ + --sdp_on_bf16 ``` diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 6da9bc8470..e2e3605fc8 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -79,7 +79,8 @@ python run_generation.py \ --use_kv_cache \ --max_new_tokens 100 \ --do_sample \ ---prompt "Here is my prompt" +--prompt "Here is my prompt" \ +--sdp_on_bf16 ``` If you want to provide several prompts as inputs, here is how to do it: @@ -91,7 +92,8 @@ python run_generation.py \ --max_new_tokens 100 \ --do_sample \ --batch_size 2 \ ---prompt "Hello world" "How are you?" +--prompt "Hello world" "How are you?" \ +--sdp_on_bf16 ``` > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size. @@ -110,7 +112,8 @@ python run_generation.py \ --use_kv_cache \ --num_return_sequences 1 \ --temperature 0 \ ---prompt "Alice and Bob" +--prompt "Alice and Bob" \ +--sdp_on_bf16 ``` ### Benchmark @@ -137,7 +140,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --batch_size 1 \ --use_hpu_graphs \ --use_kv_cache \ ---max_new_tokens 100 +--max_new_tokens 100 \ +--sdp_on_bf16 ``` You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command: @@ -152,7 +156,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --attn_softmax_bf16 \ --limit_hpu_graphs \ --reuse_cache \ ---trim_logits +--trim_logits \ +--sdp_on_bf16 ``` To run Falcon-7B inference, use the following command: @@ -164,7 +169,8 @@ python run_generation.py \ --use_kv_cache \ --batch_size 1 \ --max_new_tokens 128 \ - --do_sample + --do_sample \ + --sdp_on_bf16 ``` To run Falcon-40B inference on 8 Gaudi2 cards, use the following command: @@ -195,7 +201,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ > --use_hpu_graphs \ > --use_kv_cache \ > --max_new_tokens 100 \ -> --bf16 +> --bf16 \ +> --sdp_on_bf16 > ``` ### Use any dataset from the Hugging Face Hub @@ -214,7 +221,8 @@ python run_generation.py \ --use_kv_cache \ --dataset_name JulesBelveze/tldr_news \ --column_name content \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated. @@ -233,7 +241,8 @@ python run_generation.py \ --bf16 \ --max_new_tokens 100 \ --prompt "Here is my prompt" \ ---peft_model yard1/llama-2-7b-sql-lora-test +--peft_model yard1/llama-2-7b-sql-lora-test \ +--sdp_on_bf16 ``` ### Using growing bucket optimization @@ -490,7 +499,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for gemma with 1 card: @@ -502,7 +512,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` @@ -512,14 +523,13 @@ Some bf16 models don't fit on one card due to hpu memory limitation, but in fp8 As measurement is being calculated in bf16 precision, to be able to run fp8 model on single card you should use `unify_measurements` script. Here are the steps: 1. 
Measure the model on a number of cards that are enough for the model to fit in BF16. -2. Quantize the model on the same amount of cards for scales to be saved. -3. Run unify_measurements.py script using the measurement files created after running steps 1 and 2. A unified measurement is then calculated. +2. Run unify_measurements.py script using the measurement files created in step 1. A unified measurement is then calculated. ```bash python quantization_tools/unify_measurements.py -g 01234567 -m *path_to_8x_measurements* -o *path_to_output_1x_measurement* ``` In the above example, the measurements of cards 0-7 will be unified to a single measurement. For example, if you specify `-g 0123 4567`, cards 0-3 and cards 4-7 will be unified in two different measurement files. All different group combinations are supported. -4. Run quantization using the unified measurement file/s. +3. Run quantization using the unified measurement file/s. More information on usage of the unifier script can be found in fp8 Habana docs: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py index 4282e4ac49..de2b086c2a 100644 --- a/examples/text-generation/quantization_tools/unify_measurements.py +++ b/examples/text-generation/quantization_tools/unify_measurements.py @@ -6,49 +6,45 @@ import numpy as np -def find_measurement_path(measurement, measurements_dir_path, scales, group_size): +def find_measurement_path(measurement, measurements_dir_path, group_size): measurment_card = measurement + "_" + str(group_size) for measurment_file in os.listdir(measurements_dir_path): filename = os.fsdecode(measurment_file) if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename: continue - if scales: - if "MAXABS" in filename: - return os.path.join(measurements_dir_path, measurment_file) - else: - if "MAXABS" not in filename: - return os.path.join(measurements_dir_path, measurment_file) + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) -def unify_measurements( - measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index, scales=False -): + +def unify_measurements(measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index): measurements_paths = [] group_name = "" # save all the jsons paths in the given measurement group for measurement in measurement_group: - measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) - measurements_paths.append(measurement_path) + measurement_path = find_measurement_path(measurement, measurements_dir_path, groups_size) + if measurement_path is not None: + measurements_paths.append(measurement_path) group_name += measurement - # save all the jsons content in the given measurement group measurements_jsons = [] for measurement_path in measurements_paths: - with open(measurement_path, "r") as f: - js = json.load(f) - measurements_jsons.append(js["Nodes"]) + if measurement_path is not None: + with open(measurement_path, "r") as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) # create a name for the unified json that will be created for this measurement group if groups_num == 1: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + 
find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace("_" + measurement_group[0] + "_" + str(groups_size), "") ) else: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace( "_" + measurement_group[0] + "_" + str(groups_size), "_" + str(group_index) + "_" + str(groups_num) @@ -74,70 +70,27 @@ def unify_measurements( max_weight = node_values["params"]["weight"] # iterate over all the measurment group and take the maximum for each tensor and its channel - if scales: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) - if max_outputs is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - max_outputs[i][j] = max( - measurement_json[node_name]["outputs"][i][j], max_outputs[i][j] - ) - else: - for i in range(0, len(max_outputs)): - max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i]) - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - max_weight[key][i] = max( - measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i] - ) - else: - max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) - else: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - for j in range(0, len(max_inputs[i])): - max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) - if max_outputs is not None: - for i in range(0, len(max_outputs)): - max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) - if max_weight is not None: - for i in range(0, len(max_weight)): - max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) - - # update the maximum in the unified json - if scales: - for i in range(0, len(max_inputs)): - unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] - if max_outputs is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j] - else: - for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i] - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i] - else: - unified_json["Nodes"][node_name]["params"]["weight"] = max_weight - else: + for measurement_json in measurements_jsons: for i in range(0, len(max_inputs)): for j in range(0, len(max_inputs[i])): - unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) if max_outputs is not None: for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) if max_weight is not None: for i in range(0, len(max_weight)): - 
unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] global_rank = None local_rank = group_index if groups_num != 1 else -1 mode = "" @@ -153,10 +106,10 @@ def unify_measurements( layers[layer] = {} layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] if dlayer.get("outputs") is not None: - layers[layer]["outputs"] = np.array(dlayer["outputs"]) + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: layers[layer]["params"] = {} - layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + layers[layer]["params"]["weight"] = [np.array(x) for x in dlayer["params"]["weight"]] df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} with open(unified_npz_path, "w"): np.savez(unified_npz_path, df) @@ -196,26 +149,14 @@ def main(args): groups = args.groups num_jsons_drange = 0 - num_jsons_scales = 0 for path in os.listdir(measurements_path): - if path.endswith(".json"): - if "MAXABS" in path: - num_jsons_scales += 1 - elif "mod_list" not in path: - num_jsons_drange += 1 - assert ( - os.path.isdir(measurements_path) - and (num_jsons_drange % len(groups)) == 0 - and (num_jsons_scales % len(groups)) == 0 - ) + if path.endswith(".json") and "MAXABS" not in path and "mod_list" not in path: + num_jsons_drange += 1 + + assert os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0 for group_index, group in enumerate(groups): - unify_measurements( - group, measurements_path, output_path, num_jsons_drange, len(groups), group_index, scales=False - ) - unify_measurements( - group, measurements_path, output_path, num_jsons_scales, len(groups), group_index, scales=True - ) + unify_measurements(group, measurements_path, output_path, num_jsons_drange, len(groups), group_index) print("finished measurement unifier script") diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 4b2ab96842..a057261b32 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -320,6 +320,9 @@ def setup_parser(parser): action="store_true", help="Run the inference with dataset for specified --n_iterations(default:5)", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) quant_parser_group = parser.add_mutually_exclusive_group() quant_parser_group.add_argument( @@ -389,6 +392,9 @@ def main(): import habana_frameworks.torch.hpu as torch_hpu + if args.sdp_on_bf16: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) + if args.dataset_name is None: # Benchmark over the prompts below if args.prompt: diff --git a/optimum/habana/diffusers/models/unet_2d_condition.py b/optimum/habana/diffusers/models/unet_2d_condition.py index 7bb641bbf1..1218c0fc65 100644 --- 
a/optimum/habana/diffusers/models/unet_2d_condition.py +++ b/optimum/habana/diffusers/models/unet_2d_condition.py @@ -1,3 +1,4 @@ +import os from typing import Any, Dict, Optional, Tuple, Union import torch @@ -5,7 +6,12 @@ from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, torch_utils, unscale_lora_layers -from optimum.habana.diffusers.utils.torch_utils import gaudi_fourier_filter +from ..utils.torch_utils import gaudi_fourier_filter +from .attention_processor import ( + AttentionProcessor, + AttnProcessor2_0, + ScaledDotProductAttention, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -357,3 +363,50 @@ def gaudi_unet_2d_condition_model_forward( return (sample,) return UNet2DConditionOutput(sample=sample) + + +def set_attn_processor_hpu(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + """ + Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + Added env PATCH_SDPA for HPU specific handle to use ScaledDotProductAttention. + Sets the attention processor to use to compute attention. + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + """ + count = len(self.attn_processors.keys()) + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if os.environ.get("PATCH_SDPA") is not None: + setattr(module, "attention_module", ScaledDotProductAttention()) + module.set_processor(processor(module.attention_module)) + else: + if isinstance(processor, dict): + attention_processor = processor.pop(f"{name}.processor", None) + if attention_processor is not None: + module.set_processor(attention_processor) + else: + module.set_processor(processor) + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + +def set_default_attn_processor_hpu(self): + """ + Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + Disables custom attention processors and sets the default attention implementation from HPU. 
+ """ + processor = AttnProcessor2_0() + set_attn_processor_hpu(self, processor) diff --git a/optimum/habana/diffusers/pipelines/pipeline_utils.py b/optimum/habana/diffusers/pipelines/pipeline_utils.py index 5051764887..6dda26f796 100644 --- a/optimum/habana/diffusers/pipelines/pipeline_utils.py +++ b/optimum/habana/diffusers/pipelines/pipeline_utils.py @@ -28,10 +28,10 @@ from diffusers.utils.torch_utils import is_compiled_module from huggingface_hub import create_repo -from optimum.habana.utils import to_device_dtype from optimum.utils import logging from ...transformers.gaudi_configuration import GaudiConfig +from ...utils import to_device_dtype logger = logging.get_logger(__name__) diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 630bc9c18b..f9c9907d39 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -39,6 +39,7 @@ from ....transformers.gaudi_configuration import GaudiConfig from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment +from ...models.unet_2d_condition import set_default_attn_processor_hpu from ..pipeline_utils import GaudiDiffusionPipeline diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index f4a0dbd244..6a1b74d129 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import time from dataclasses import dataclass from math import ceil @@ -38,6 +37,7 @@ from ....transformers.gaudi_configuration import GaudiConfig from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment +from ...models.unet_2d_condition import set_default_attn_processor_hpu from ..pipeline_utils import GaudiDiffusionPipeline from ..stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps @@ -138,6 +138,8 @@ def __init__( force_zeros_for_empty_prompt, ) + self.unet.set_default_attn_processor = set_default_attn_processor_hpu + self.to(self._device) def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py index 0e1378ee57..4608a56d3f 100644 --- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -15,6 +15,7 @@ # limitations under the License. 
import os +import random from typing import Optional, Tuple, Union import torch @@ -68,7 +69,7 @@ def _gaudi_wav2vec2_compute_mask_indices( ) # epsilon is used for probabilistic rounding - epsilon = torch.rand([], device="hpu") + epsilon = torch.rand(1).item() def compute_num_masked_span(input_length): """Given input length, compute how many spans should be masked""" @@ -106,19 +107,9 @@ def compute_num_masked_span(input_length): num_masked_span = compute_num_masked_span(input_length) # get random indices to mask - """ - Original code: - spec_aug_mask_idx = np.random.choice( - np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + spec_aug_mask_idx = torch.tensor( + random.sample(range(input_length - (mask_length - 1)), num_masked_span), dtype=torch.int32 ) - When (input_length - (mask_length - 1) < 0), then num_masked_span=0 - and we get: spec_aug_mask_idx=array([], dtype=int64) - However torch rewrite fails, because torch.randperm expects positive number - This causes a unit test to fail: - RUN_SLOW=true GAUDI2_CI=1 python -m pytest tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py -v -s -k test_compute_mask_indices_short_audio - """ - spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span] - # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. @@ -133,13 +124,12 @@ def compute_num_masked_span(input_length): spec_aug_mask_idx = torch.cat( [ spec_aug_mask_idx, - torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32, device="hpu") * dummy_mask_idx, + torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32) * dummy_mask_idx, ] ) spec_aug_mask_idxs.append(spec_aug_mask_idx.to(dtype=torch.long)) - spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs) - + spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs).to("hpu") # expand masked indices to masked spans spec_aug_mask_idxs = torch.broadcast_to( spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) @@ -248,7 +238,7 @@ def gaudi_wav2vec2_encoder_forward( all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([], device="hpu") + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..d022b1f3e1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,113 @@ +absl-py==2.1.0 +accelerate==0.33.0 +aiohappyeyeballs==2.4.3 +aiohttp==3.11.7 +aiosignal==1.3.1 +alembic==1.14.0 +async-timeout==5.0.1 +attrs==24.2.0 +av==12.1.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.15.1 +charset-normalizer==3.4.0 +coloredlogs==15.0.1 +colorlog==6.9.0 +datasets==3.1.0 +diffusers==0.31.0 +dill==0.3.8 +exceptiongroup==1.2.2 +expecttest==0.2.1 +filelock==3.16.1 +frozenlist==1.5.0 +fsspec==2024.9.0 +gitdb==4.0.11 +GitPython==3.1.43 +google-auth==2.36.0 +google-auth-oauthlib==0.4.6 +greenlet==3.1.1 +grpcio==1.68.0 +huggingface-hub==0.26.3 +humanfriendly==10.0 +idna==3.10 +importlib_metadata==8.5.0 +iniconfig==2.0.0 +Jinja2==3.1.4 +joblib==1.4.2 +lightning==2.3.3 +lightning-habana==1.6.0 +lightning-utilities==0.11.9 +Mako==1.3.7 +Markdown==3.7 +MarkupSafe==3.0.2 
+mpi4py==3.1.6 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +numpy==1.23.5 +oauthlib==3.2.2 +optimum==1.23.3 +optuna==4.1.0 +packaging==24.2 +pandas==2.0.1 +parameterized==0.9.0 +pathspec==0.12.1 +peft==0.14.0 +perfetto==0.11.0 +pillow==11.0.0 +Pillow-SIMD==9.5.0.post1 +pluggy==1.5.0 +prettytable==3.9.0 +propcache==0.2.0 +protobuf==3.20.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyarrow==18.1.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pybind11==2.10.4 +pycparser==2.22 +pydantic==1.10.13 +pynvml==8.0.4 +pytest==7.4.4 +python-dateutil==2.9.0.post0 +pytorch-lightning==2.4.0 +pytz==2024.2 +PyYAML==6.0 +regex==2023.5.5 +requests==2.32.3 +requests-oauthlib==2.0.0 +rsa==4.9 +safetensors==0.4.5 +scikit-learn==1.5.2 +scipy==1.14.1 +sentence-transformers==3.2.1 +sentencepiece==0.2.0 +six==1.16.0 +smmap==5.0.1 +SQLAlchemy==2.0.36 +symengine==0.11.0 +sympy==1.12.1 +tdqm==0.0.1 +tensorboard==2.11.2 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +threadpoolctl==3.5.0 +timm==1.0.12 +tokenizers==0.20.3 +tomli==2.2.1 +torchmetrics==1.6.0 +torchsde==0.2.6 +tqdm==4.67.0 +trampoline==0.1.2 +transformers==4.45.2 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==1.26.20 +wcwidth==0.2.13 +Werkzeug==3.1.3 +xxhash==3.5.0 +yamllint==1.35.1 +yarl==1.18.0 +zipp==3.21.0 diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json index d1a563c9ff..055d321152 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/baselines/whisper_small.json @@ -41,10 +41,10 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "eval_wer": 0.3806988352745424, - "train_runtime": 312.5894, - "train_samples_per_second": 280.111, - "eval_samples_per_second": 19.073, + "eval_wer": 0.4693843594009983, + "train_runtime": 380.00, + "train_samples_per_second": 218.0, + "eval_samples_per_second": 31.0, "extra_arguments": [ "--dataset_config_name hi", "--language hindi", diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 49117dde62..76ee8658da 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -660,6 +660,7 @@ def test_no_throughput_regression_bf16(self): gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"), torch_dtype=torch.bfloat16, ) + pipeline.unet.set_default_attn_processor(pipeline.unet) set_seed(27) outputs = pipeline( prompt=prompts, @@ -1388,6 +1389,7 @@ def _sdxl_generation(self, scheduler: str, batch_size: int, num_images_per_promp "stabilityai/stable-diffusion-xl-base-1.0", **kwargs, ) + pipeline.unet.set_default_attn_processor(pipeline.unet) num_images_per_prompt = num_images_per_prompt res = {} outputs = pipeline( diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py index 27dd1b75c2..20d808b69f 100644 --- a/tests/test_encoder_decoder.py +++ b/tests/test_encoder_decoder.py @@ -206,6 +206,9 @@ def _test_text_translation( if "opus-mt-zh-en" in model_name: command_args.append("--max_source_length 512") + if "Babelscape/mrebel-large" in model_name or "nllb-200-distilled-600M" in model_name: + command_args.append("--sdp_on_bf16") + command = self._build_command( task=task, deepspeed=deepspeed, diff --git a/tests/test_examples.py b/tests/test_examples.py index 20f26f9012..6c9e234bd2 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -528,6 +528,26 @@ def test(self): env_variables["PT_HPU_LAZY_MODE"] = "0" env_variables["PT_ENABLE_INT64_SUPPORT"] = "1" + if self.EXAMPLE_NAME == "run_glue": + if model_name == "bert-large-uncased-whole-word-masking": + 
extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_qa": + if model_name == "bert-large-uncased-whole-word-masking" or \ + model_name == "albert-large-v2": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_bridgetower": + if model_name == "BridgeTower/bridgetower-large-itm-mlm-itc": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_speech_recognition_seq2seq": + if model_name == "openai/whisper-small": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_clip": + extra_command_line_arguments.append("--sdp_on_bf16") + with TemporaryDirectory() as tmp_dir: cmd_line = self._create_command_line( multi_card, diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py index 6335f28ebf..180a2bb3f9 100644 --- a/tests/test_fsdp_examples.py +++ b/tests/test_fsdp_examples.py @@ -97,6 +97,7 @@ def _test_fsdp( f"--gaudi_config_name {gaudi_config}", "--throughput_warmup_steps 100", "--do_eval", + "--sdp_on_bf16", ] else: command += [ diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py index c73d4d0565..36143e8f91 100644 --- a/tests/test_image_to_text_example.py +++ b/tests/test_image_to_text_example.py @@ -67,6 +67,11 @@ def _test_image_to_text( "--use_hpu_graphs", ] + if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model_name or "tiiuae/falcon-11B-vlm" in model_name: + command += [ + "--sdp_on_bf16", + ] + command.append("--bf16") command.append("--sdp_on_bf16") diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 8a43ee81f8..8f27a2d34f 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -218,6 +218,12 @@ def _test_text_generation( if "gemma" in model_name.lower(): command += ["--use_flash_attention"] + if "decilm" in model_name.lower(): + command += ["--sdp_on_bf16"] + + if "mamba-130m-hf" in model_name.lower(): + command += ["--sdp_on_bf16"] + if (reuse_cache or torch_compile) and not parallel_strategy == "tp" and not is_starcoder_first_gen_model: command += ["--reuse_cache"]
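For context on the `unify_measurements.py` simplification earlier in this patch: with the scales path removed, the script only reduces the per-card dynamic-range JSONs with an element-wise maximum. A toy sketch of that reduction, assuming a simplified layout with only `inputs` (the real measurement files also carry `outputs` and `params`/`weight`, which are folded in the same way):

```python
# Toy sketch of the element-wise max-unification across cards.
# Assumption: simplified measurement layout with only "inputs" per node.
import copy


def unify(card_jsons):
    # Start from a deep copy of the first card, then fold in the others.
    unified = copy.deepcopy(card_jsons[0])
    for name, node in unified["Nodes"].items():
        for other in card_jsons[1:]:
            other_node = other["Nodes"][name]
            for i, channel_list in enumerate(node["inputs"]):
                for j in range(len(channel_list)):
                    node["inputs"][i][j][0] = max(
                        node["inputs"][i][j][0], other_node["inputs"][i][j][0]
                    )
    return unified


card0 = {"Nodes": {"lm_head": {"inputs": [[[0.7], [1.2]]]}}}
card1 = {"Nodes": {"lm_head": {"inputs": [[[0.9], [0.4]]]}}}
print(unify([card0, card1])["Nodes"]["lm_head"]["inputs"])  # [[[0.9], [1.2]]]
```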