From 9498c7c1a2ebc5fae13b64cac8d006dae287449e Mon Sep 17 00:00:00 2001
From: Yeonsil Yoon
Date: Sun, 8 Dec 2024 02:23:21 -0800
Subject: [PATCH] Update README commands for more models to use --sdp_on_bf16
 (#1575)

Co-authored-by: Libin Tang
---
 examples/image-to-text/README.md       | 45 +++++++++++++++++---
 examples/question-answering/README.md  | 11 +------
 examples/speech-recognition/README.md  | 18 +++++------
 examples/text-classification/README.md |  6 ++--
 examples/text-generation/README.md     | 33 ++++++++++++-------
 tests/test_text_generation_example.py  |  1 -
 6 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 51f4a5dda2..e4dbb05472 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -44,7 +44,8 @@ python3 run_pipeline.py \
     --model_name_or_path Salesforce/blip-image-captioning-large \
     --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-1.5-7b inference, use the following command:
@@ -52,7 +53,8 @@ To run Llava-1.5-7b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-1.5-13b inference, use the following command:
@@ -60,7 +62,8 @@ To run Llava-1.5-13b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-13b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-v1.6-mistral-7b inference, use the following command:
@@ -68,7 +71,8 @@ To run Llava-v1.6-mistral-7b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-v1.6-vicuna-13b inference, use the following command:
@@ -76,7 +80,8 @@ To run Llava-v1.6-vicuna-13b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
@@ -84,7 +89,8 @@ To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-34b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run google/paligemma-3b-mix-224 inference, use the following command:
@@ -92,7 +98,8 @@ To run google/paligemma-3b-mix-224 inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path google/paligemma-3b-mix-224 \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
@@ -100,7 +107,8 @@ To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llama3-llava-next-8b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run idefics2 inference, use the following command:
@@ -109,7 +117,8 @@
 python3 run_pipeline.py \
     --model_name_or_path HuggingFaceM4/idefics2-8b \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run mllama inference using reduced precision in the SDPA, use the following command:
@@ -134,7 +143,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-1.5-7b:
@@ -143,7 +153,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 
@@ -153,7 +164,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b:
@@ -162,7 +174,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b:
@@ -171,7 +184,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b:
@@ -180,7 +194,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 ### Inference with FusedSDPA
diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md
index bf6cd04aec..654a9e02ad 100755
--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -190,14 +190,6 @@ Here is a DeepSpeed configuration you can use to train your models on Gaudi:
 }
 ```
 
-
-### Training in torch.compile mode
-
-Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \
-a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \
-b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags.
-
-
 ## Fine-tuning Llama on SQuAD1.1
 
 > [!NOTE]
@@ -224,8 +216,7 @@ python ../gaudi_spawn.py \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 3 \
     --max_train_samples 45080 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 
diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md
index 4d5eb69b91..02e4b53d66 100644
--- a/examples/speech-recognition/README.md
+++ b/examples/speech-recognition/README.md
@@ -87,8 +87,7 @@ python run_speech_recognition_ctc.py \
     --throughput_warmup_steps="3" \
     --bf16 \
     --use_hpu_graphs_for_training \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.
@@ -129,8 +128,7 @@ python ../gaudi_spawn.py \
     --throughput_warmup_steps 3 \
     --bf16 \
     --use_hpu_graphs_for_training \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.
@@ -178,8 +176,7 @@ python ../gaudi_spawn.py \
     --use_lazy_mode \
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.
@@ -211,8 +208,7 @@ python run_speech_recognition_ctc.py \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
     --bf16 \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 ## Sequence to Sequence
@@ -259,7 +255,8 @@ python run_speech_recognition_seq2seq.py \
     --use_hpu_graphs_for_inference \
     --label_features_max_length 128 \
     --dataloader_num_workers 8 \
-    --throughput_warmup_steps 3
+    --throughput_warmup_steps 3 \
+    --sdp_on_bf16
 ```
 
 If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition.
@@ -329,5 +326,6 @@ python run_speech_recognition_seq2seq.py \
     --use_habana \
     --use_hpu_graphs_for_inference \
     --label_features_max_length 128 \
-    --dataloader_num_workers 8
+    --dataloader_num_workers 8 \
+    --sdp_on_bf16
 ```
diff --git a/examples/text-classification/README.md b/examples/text-classification/README.md
index 9ffc78ae43..32bc3fd5f8 100644
--- a/examples/text-classification/README.md
+++ b/examples/text-classification/README.md
@@ -194,8 +194,7 @@ python ../gaudi_spawn.py \
     --use_lazy_mode \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 3 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
@@ -221,6 +220,5 @@ python run_glue.py \
     --use_lazy_mode \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 2 \
-    --bf16 \
-    --sdp_on_bf16
+    --bf16
 ```
diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 5e42af689c..e2e3605fc8 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -79,7 +79,8 @@ python run_generation.py \
 --use_kv_cache \
 --max_new_tokens 100 \
 --do_sample \
---prompt "Here is my prompt"
+--prompt "Here is my prompt" \
+--sdp_on_bf16
 ```
 
 If you want to provide several prompts as inputs, here is how to do it:
@@ -91,7 +92,8 @@ python run_generation.py \
 --max_new_tokens 100 \
 --do_sample \
 --batch_size 2 \
---prompt "Hello world" "How are you?"
+--prompt "Hello world" "How are you?" \
+--sdp_on_bf16
 ```
 
 > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size.
@@ -110,7 +112,8 @@ python run_generation.py \
 --use_kv_cache \
 --num_return_sequences 1 \
 --temperature 0 \
---prompt "Alice and Bob"
+--prompt "Alice and Bob" \
+--sdp_on_bf16
 ```
 
 ### Benchmark
@@ -137,7 +140,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --batch_size 1 \
 --use_hpu_graphs \
 --use_kv_cache \
---max_new_tokens 100
+--max_new_tokens 100 \
+--sdp_on_bf16
 ```
 
 You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command:
@@ -152,7 +156,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --attn_softmax_bf16 \
 --limit_hpu_graphs \
 --reuse_cache \
---trim_logits
+--trim_logits \
+--sdp_on_bf16
 ```
 
 To run Falcon-7B inference, use the following command:
@@ -164,7 +169,8 @@ python run_generation.py \
  --use_kv_cache \
  --batch_size 1 \
  --max_new_tokens 128 \
- --do_sample
+ --do_sample \
+ --sdp_on_bf16
 ```
 
 To run Falcon-40B inference on 8 Gaudi2 cards, use the following command:
@@ -195,7 +201,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 > --use_hpu_graphs \
 > --use_kv_cache \
 > --max_new_tokens 100 \
-> --bf16
+> --bf16 \
+> --sdp_on_bf16
 > ```
 
 ### Use any dataset from the Hugging Face Hub
@@ -214,7 +221,8 @@ python run_generation.py \
 --use_kv_cache \
 --dataset_name JulesBelveze/tldr_news \
 --column_name content \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated.
@@ -233,7 +241,8 @@ python run_generation.py \
 --bf16 \
 --max_new_tokens 100 \
 --prompt "Here is my prompt" \
---peft_model yard1/llama-2-7b-sql-lora-test
+--peft_model yard1/llama-2-7b-sql-lora-test \
+--sdp_on_bf16
 ```
 
 ### Using growing bucket optimization
@@ -490,7 +499,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py
 --max_new_tokens 100 \
 --batch_size 1 \
 --reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for gemma with 1 card:
@@ -502,7 +512,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation
 --max_new_tokens 100 \
 --batch_size 1 \
 --reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 0768e4a746..98f9a692c3 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -221,7 +221,6 @@ def _test_text_generation(
 
     if "gemma" in model_name.lower():
         command += ["--use_flash_attention"]
-        command += ["--sdp_on_bf16"]
 
     if "decilm" in model_name.lower():
         command += ["--sdp_on_bf16"]
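
For readers who want to try the change locally, the sketch below applies the patch and runs one of the updated commands. The command itself is taken verbatim from the image-to-text hunk above; the patch filename passed to `git am` is hypothetical (use whatever name the file was saved under), and an optimum-habana checkout with the example's requirements installed is assumed.

```bash
# Apply the patch on top of an optimum-habana checkout.
# NOTE: the .patch filename below is hypothetical; use the actual file name.
git am 0001-Update-README-commands-for-more-models-to-use-sdp_on_bf16.patch

# Run the updated BLIP captioning example from examples/image-to-text/README.md.
# --sdp_on_bf16 enables reduced (bf16) precision in the scaled dot-product
# attention, complementing the --bf16 flag used for the rest of the model.
cd examples/image-to-text
python3 run_pipeline.py \
    --model_name_or_path Salesforce/blip-image-captioning-large \
    --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
    --use_hpu_graphs \
    --bf16 \
    --sdp_on_bf16
```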