Update README commands for the models to use --sdp_on_bf16 (huggingfa…
yeonsily authored and Liangyx2 committed Jan 20, 2025
1 parent d9779b9 commit 30beb3e
Showing 4 changed files with 81 additions and 19 deletions.
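Across all four READMEs the change is the same: append `--sdp_on_bf16` to the example commands. The flag lets scaled dot-product attention (SDPA) keep its internal reductions in bf16 instead of upcasting to fp32, trading a little numerical accuracy for throughput. As a minimal sketch of what the flag is understood to toggle, assuming it maps to the private PyTorch hook shown below (illustrative only, not necessarily the exact optimum-habana code path):

```python
# Hedged sketch: what --sdp_on_bf16 plausibly enables under the hood.
# The hook below is a private PyTorch API in recent builds; guard for it
# rather than assuming it, since the exact wiring is an assumption here.
import torch
import torch.nn.functional as F

if hasattr(torch._C, "_set_math_sdp_allow_fp16_bf16_reduction"):
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# With reduced-precision reductions allowed, bf16 SDPA can skip fp32
# accumulation in the math backend:
q = k = v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape, out.dtype)  # torch.Size([1, 8, 128, 64]) torch.bfloat16
```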
examples/contrastive-image-text/README.md (3 changes: 2 additions & 1 deletion)
@@ -235,7 +235,8 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
--dataloader_num_workers 1 \
--mediapipe_dataloader \
--distribution_strategy fast_ddp \
- --trust_remote_code
+ --trust_remote_code \
+ --sdp_on_bf16
```

> `--mediapipe_dataloader` only works on Gaudi2.
examples/question-answering/README.md (67 changes: 59 additions & 8 deletions)
@@ -44,7 +44,30 @@ For the following cases, an example of a Gaudi configuration file is given
This example code fine-tunes BERT on the SQuAD1.1 dataset.

```bash
- PT_HPU_LAZY_MODE=0 python run_qa.py \
+ python run_qa.py \
+ --model_name_or_path bert-large-uncased-whole-word-masking \
+ --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+ --dataset_name squad \
+ --do_train \
+ --do_eval \
+ --per_device_train_batch_size 32 \
+ --per_device_eval_batch_size 8 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 2 \
+ --max_seq_length 384 \
+ --doc_stride 128 \
+ --output_dir /tmp/squad/ \
+ --use_habana \
+ --use_lazy_mode \
+ --use_hpu_graphs_for_inference \
+ --throughput_warmup_steps 3 \
+ --bf16 \
+ --sdp_on_bf16
+ ```
+
+ For torch.compile mode,
+ ```bash
+ PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \
--model_name_or_path bert-large-uncased-whole-word-masking \
--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
--dataset_name squad \
@@ -62,16 +85,40 @@ PT_HPU_LAZY_MODE=0 python run_qa.py \
--torch_compile \
--use_lazy_mode false \
--throughput_warmup_steps 3 \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```


### Multi-card Training

Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs:

```bash
- PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
+ python ../gaudi_spawn.py \
+ --world_size 8 --use_mpi run_qa.py \
+ --model_name_or_path bert-large-uncased-whole-word-masking \
+ --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+ --dataset_name squad \
+ --do_train \
+ --do_eval \
+ --per_device_train_batch_size 32 \
+ --per_device_eval_batch_size 8 \
+ --learning_rate 3e-5 \
+ --num_train_epochs 2 \
+ --max_seq_length 384 \
+ --doc_stride 128 \
+ --output_dir /tmp/squad_output/ \
+ --use_habana \
+ --use_lazy_mode \
+ --use_hpu_graphs_for_inference \
+ --throughput_warmup_steps 3 \
+ --bf16 \
+ --sdp_on_bf16
+ ```
+
+ For torch.compile mode,
+ ```bash
+ PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \
--world_size 8 --use_mpi run_qa.py \
--model_name_or_path bert-large-uncased-whole-word-masking \
--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
@@ -90,7 +137,8 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
--torch_compile \
--use_lazy_mode false \
--throughput_warmup_steps 3 \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```


@@ -117,7 +165,8 @@ python ../gaudi_spawn.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
- --deepspeed path_to_my_deepspeed_config
+ --deepspeed path_to_my_deepspeed_config \
+ --sdp_on_bf16
```

You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
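The `--deepspeed` argument above points at a DeepSpeed JSON config; `path_to_my_deepspeed_config` is a placeholder. As a hedged sketch of what a minimal ZeRO stage-2 config might contain, written from Python to keep the added examples in one language (the exact keys in the repo's own test configs may differ):

```python
# Hypothetical minimal DeepSpeed ZeRO-2 config. "auto" lets the
# HuggingFace Trainer integration fill in the batch-size fields; the
# output file name is the placeholder used in the README command above.
import json

ds_config = {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": True},
    "zero_optimization": {"stage": 2},
}

with open("path_to_my_deepspeed_config", "w") as f:
    json.dump(ds_config, f, indent=2)
```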
@@ -175,7 +224,8 @@ python ../gaudi_spawn.py \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
--max_train_samples 45080 \
- --deepspeed ../../tests/configs/deepspeed_zero_2.json
+ --deepspeed ../../tests/configs/deepspeed_zero_2.json \
+ --sdp_on_bf16
```


@@ -197,7 +247,8 @@ python run_qa.py \
--use_habana \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```


examples/speech-recognition/README.md (12 changes: 8 additions & 4 deletions)
@@ -87,7 +87,8 @@ python run_speech_recognition_ctc.py \
--throughput_warmup_steps="3" \
--bf16 \
--use_hpu_graphs_for_training \
- --use_hpu_graphs_for_inference
+ --use_hpu_graphs_for_inference \
+ --sdp_on_bf16
```

On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.
@@ -128,7 +129,8 @@ python ../gaudi_spawn.py \
--throughput_warmup_steps 3 \
--bf16 \
--use_hpu_graphs_for_training \
- --use_hpu_graphs_for_inference
+ --use_hpu_graphs_for_inference \
+ --sdp_on_bf16
```

On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.
@@ -176,7 +178,8 @@ python ../gaudi_spawn.py \
--use_lazy_mode \
--gaudi_config_name Habana/wav2vec2 \
--throughput_warmup_steps 3 \
- --deepspeed ../../tests/configs/deepspeed_zero_2.json
+ --deepspeed ../../tests/configs/deepspeed_zero_2.json \
+ --sdp_on_bf16
```
[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.
@@ -208,7 +211,8 @@ python run_speech_recognition_ctc.py \
--use_lazy_mode \
--gaudi_config_name="Habana/wav2vec2" \
--bf16 \
- --use_hpu_graphs_for_inference
+ --use_hpu_graphs_for_inference \
+ --sdp_on_bf16
```
## Sequence to Sequence

examples/text-classification/README.md (18 changes: 12 additions & 6 deletions)
@@ -60,7 +60,8 @@ python run_glue.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```

> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
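The note above refers to standard `transformers` loading behavior. A short hedged sketch of what that flag does at load time (the checkpoint path and label count are made-up examples):

```python
# Sketch: load a checkpoint whose classification head was trained with a
# different number of labels. ignore_mismatched_sizes=True drops the
# mismatched head weights and re-initializes them for the new label count.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "path/to/finetuned-checkpoint",  # hypothetical fine-tuned model
    num_labels=5,                    # your dataset's label count
    ignore_mismatched_sizes=True,    # re-init the head instead of erroring
)
```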
@@ -88,7 +89,8 @@ python ../gaudi_spawn.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```

> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
@@ -116,7 +118,8 @@ python ../gaudi_spawn.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
- --deepspeed path_to_my_deepspeed_config
+ --deepspeed path_to_my_deepspeed_config \
+ --sdp_on_bf16
```

You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
@@ -160,7 +163,8 @@ python run_glue.py \
--use_habana \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```

## Llama Guard on MRPC
@@ -190,7 +194,8 @@ python ../gaudi_spawn.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 3 \
- --deepspeed ../../tests/configs/deepspeed_zero_2.json
+ --deepspeed ../../tests/configs/deepspeed_zero_2.json \
+ --sdp_on_bf16
```

You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
@@ -216,5 +221,6 @@ python run_glue.py \
--use_lazy_mode \
--use_hpu_graphs_for_inference \
--throughput_warmup_steps 2 \
- --bf16
+ --bf16 \
+ --sdp_on_bf16
```
