From 463daecfc60b0c6a160f14c6c3a774c3b4a6ad0e Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Wed, 4 Dec 2024 19:12:59 +0000
Subject: [PATCH 1/2] apply --sdp_on_bf16 to image-to-text example

---
 examples/image-to-text/run_pipeline.py | 9 +++++++++
 tests/test_image_to_text_example.py    | 1 +
 2 files changed, 10 insertions(+)

diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py
index ae76fdbe55..44eb8d575a 100644
--- a/examples/image-to-text/run_pipeline.py
+++ b/examples/image-to-text/run_pipeline.py
@@ -174,6 +174,11 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend",
+    )
 
     args = parser.parse_args()
 
@@ -304,6 +309,10 @@ def main():
         "flash_attention_recompute": args.flash_attention_recompute,
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
+
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.use_kv_cache:
         generate_kwargs["use_cache"] = args.use_kv_cache
 
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index c6e153445a..10f1c3c8b9 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -68,6 +68,7 @@ def _test_image_to_text(
     ]
 
     command.append("--bf16")
+    command.append("--sdp_on_bf16")
 
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")

From d738a4700883ad1e5c6deb0db1a06e86a42f3dc2 Mon Sep 17 00:00:00 2001
From: Sun Choi
Date: Wed, 4 Dec 2024 22:34:32 +0000
Subject: [PATCH 2/2] update README and target performance

---
 examples/image-to-text/README.md    | 5 +++--
 tests/test_image_to_text_example.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 0203b3630b..51f4a5dda2 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -112,13 +112,14 @@ python3 run_pipeline.py \
     --bf16
 ```
 
-To run mllama inference, use the following command:
+To run mllama inference using reduced precision in the SDPA, use the following command:
 
 ```bash
 python3 run_pipeline.py \
     --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 ### Inference with FP8
diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py
index 10f1c3c8b9..c73d4d0565 100644
--- a/tests/test_image_to_text_example.py
+++ b/tests/test_image_to_text_example.py
@@ -21,8 +21,8 @@
         ("llava-hf/llava-v1.6-vicuna-13b-hf", 1, 23.527610042925),
         ("google/paligemma-3b-mix-224", 1, 132.8949150246155),
         ("HuggingFaceM4/idefics2-8b", 1, 21.89944593215077),
-        ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 20.407843538649303),
-        ("tiiuae/falcon-11B-vlm", 1, 27.0566558689559327),
+        ("meta-llama/Llama-3.2-11B-Vision-Instruct", 1, 18.974541922240313),
+        ("tiiuae/falcon-11B-vlm", 1, 23.69260849957278),
     ],
     "fp8": [
         ("llava-hf/llava-1.5-7b-hf", 1, 98.72578382705062),
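
For context, a minimal standalone sketch of what the new `--sdp_on_bf16` flag turns on: it lets PyTorch's math SDPA backend keep its reductions in reduced precision (bf16/fp16) instead of upcasting intermediates to fp32. This sketch is illustrative only and is not part of the patch; it assumes a recent PyTorch release (roughly 2.5 or later) where the private `torch._C._set_math_sdp_allow_fp16_bf16_reduction` hook and `torch.nn.attention.sdpa_kernel` are available, and it runs on CPU purely for demonstration.

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

# Allow the math SDPA backend to accumulate in reduced precision (bf16/fp16)
# instead of upcasting intermediates to fp32; this is what --sdp_on_bf16 toggles.
torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Small bf16 attention inputs with shape (batch, heads, seq_len, head_dim).
q = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)

# Restrict SDPA to the math backend so the flag above is actually exercised.
with sdpa_kernel(SDPBackend.MATH):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)

print(out.dtype)  # torch.bfloat16
```

The second commit then updates the README example to pass `--sdp_on_bf16` and adjusts the recorded performance targets for `meta-llama/Llama-3.2-11B-Vision-Instruct` and `tiiuae/falcon-11B-vlm` to match the new bf16 test command.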