huggingface · regisss · Dec 5, 2024 · Dec 4, 2024 · Dec 4, 2024 · mengker33
@@ -112,13 +112,14 @@ python3 run_pipeline.py \
     --bf16
 ```
 
-To run mllama inference, use the following command:
+To run mllama inference with reduced precision in the scaled dot-product attention (SDPA) math backend, use the following command:
 
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 ### Inference with FP8

@@ -174,6 +174,11 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend",
+    )
 
     args = parser.parse_args()
 
@@ -304,6 +309,10 @@ def main():
         "flash_attention_recompute": args.flash_attention_recompute,
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
+
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.use_kv_cache:
         generate_kwargs["use_cache"] = args.use_kv_cache
 

@@ -21,8 +21,8 @@
             ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 23.527610042925),
             ("google/paligemma-3b-mix-224", 1, 132.8949150246155),
             ("HuggingFaceM4/idefics2-8b", 1, 21.89944593215077),
-            ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 20.407843538649303),
-            ("tiiuae/falcon-11B-vlm", 1, 27.0566558689559327),
+            ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 18.974541922240313),
+            ("tiiuae/falcon-11B-vlm", 1, 23.69260849957278),
         ],
         "fp8": [
             ("llava-hf/llava-1.5-7b-hf", 1, 98.72578382705062),
@@ -68,6 +68,7 @@ def _test_image_to_text(
     ]
 
     command.append("--bf16")
+    command.append("--sdp_on_bf16")
 
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")