From 463daecfc60b0c6a160f14c6c3a774c3b4a6ad0e Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Wed, 4 Dec 2024 19:12:59 +0000
Subject: [PATCH 1/2] apply --sdp_on_bf16 to image-to-text example

---
 examples/image-to-text/run_pipeline.py | 9 +++++++++
 tests/test_image_to_text_example.py    | 1 +
 2 files changed, 10 insertions(+)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index ae76fdbe55..44eb8d575a 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -174,6 +174,11 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend",
+    )
 
     args = parser.parse_args()
 
@@ -304,6 +309,10 @@ def main():
         "flash_attention_recompute": args.flash_attention_recompute,
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
+
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.use_kv_cache:
         generate_kwargs["use_cache"] = args.use_kv_cache
 
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index c6e153445a..10f1c3c8b9 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -68,6 +68,7 @@ def _test_image_to_text(
     ]
 
     command.append("--bf16")
+    command.append("--sdp_on_bf16")
 
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")

From d738a4700883ad1e5c6deb0db1a06e86a42f3dc2 Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Wed, 4 Dec 2024 22:34:32 +0000
Subject: [PATCH 2/2] update README and target performance

---
 examples/image-to-text/README.md    | 5 +++--
 tests/test_image_to_text_example.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 0203b3630b..51f4a5dda2 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -112,13 +112,14 @@ python3 run_pipeline.py \
     --bf16
 ```
 
-To run mllama inference, use the following command:
+To run mllama inference using reduced precision in the SDPA, use the following command:
 
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 ### Inference with FP8
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index 10f1c3c8b9..c73d4d0565 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -21,8 +21,8 @@
         ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 23.527610042925),
         ("google/paligemma-3b-mix-224", 1, 132.8949150246155),
         ("HuggingFaceM4/idefics2-8b", 1, 21.89944593215077),
-        ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 20.407843538649303),
-        ("tiiuae/falcon-11B-vlm", 1, 27.0566558689559327),
+        ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 18.974541922240313),
+        ("tiiuae/falcon-11B-vlm", 1, 23.69260849957278),
     ],
     "fp8": [
         ("llava-hf/llava-1.5-7b-hf", 1, 98.72578382705062),
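
For context, a minimal standalone sketch of what the new `--sdp_on_bf16` flag turns on: it lets PyTorch's math SDPA backend keep its reductions in reduced precision (bf16/fp16) instead of upcasting intermediates to fp32. This sketch is illustrative only and is not part of the patch; it assumes a recent PyTorch release (roughly 2.5 or later) where the private `torch._C._set_math_sdp_allow_fp16_bf16_reduction` hook and `torch.nn.attention.sdpa_kernel` are available, and it runs on CPU purely for demonstration.

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

# Allow the math SDPA backend to accumulate in reduced precision (bf16/fp16)
# instead of upcasting intermediates to fp32; this is what --sdp_on_bf16 toggles.
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Small bf16 attention inputs with shape (batch, heads, seq_len, head_dim).
q = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)

# Restrict SDPA to the math backend so the flag above is actually exercised.
with sdpa_kernel(SDPBackend.MATH):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)

print(out.dtype)  # torch.bfloat16
```

The second commit then updates the README example to pass `--sdp_on_bf16` and adjusts the recorded performance targets for `meta-llama/Llama-3.2-11B-Vision-Instruct` and `tiiuae/falcon-11B-vlm` to match the new bf16 test command.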