
Commit ed61be8

Merge remote-tracking branch 'upstream/main' into lcalabri/add_some_gemma_test_cases
2 parents: f853323 + 03b50d2

17 files changed: +192 -84 lines changed

examples/contrastive-image-text/README.md

+2 -1

@@ -235,7 +235,8 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \
 --dataloader_num_workers 1 \
 --mediapipe_dataloader \
 --distribution_strategy fast_ddp \
---trust_remote_code
+--trust_remote_code \
+--sdp_on_bf16
 ```

 > `--mediapipe_dataloader` only works on Gaudi2.

examples/image-to-text/README.md

+3 -2

@@ -112,13 +112,14 @@ python3 run_pipeline.py \
 --bf16
 ```

-To run mllama inference, use the following command:
+To run mllama inference using reduced precision in the SDPA, use the following command:

 ```bash
 python3 run_pipeline.py \
 --model_name_or_path meta-llama/Llama-3.2-11B-Vision-Instruct \
 --use_hpu_graphs \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 ### Inference with FP8

examples/image-to-text/run_pipeline.py

+9

@@ -174,6 +174,11 @@ def main():
         action="store_true",
         help="Whether to use the key/value cache for decoding. It should speed up generation.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend",
+    )

     args = parser.parse_args()


@@ -304,6 +309,10 @@ def main():
         "flash_attention_recompute": args.flash_attention_recompute,
         "limit_hpu_graphs": args.limit_hpu_graphs,
     }
+
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
+
     if args.use_kv_cache:
         generate_kwargs["use_cache"] = args.use_kv_cache

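For context, here is a minimal standalone sketch (not part of the commit) of how the new `--sdp_on_bf16` switch is wired up: the flag simply flips PyTorch's reduced-precision reduction setting for the math SDPA backend before any attention runs. Only the flag name and the `torch._C._set_math_sdp_allow_fp16_bf16_reduction` call come from the diff above; the toy tensors and CPU execution are illustrative assumptions.

```python
# Sketch only, assuming a PyTorch build (>= 2.5) that exposes this private hook,
# which is the call used in run_pipeline.py above.
import argparse

import torch
import torch.nn.functional as F

parser = argparse.ArgumentParser()
parser.add_argument(
    "--sdp_on_bf16",
    action="store_true",
    help="Allow PyTorch to use reduced precision in the SDPA math backend",
)
args = parser.parse_args()

if args.sdp_on_bf16:
    # Let the math (non-fused) SDPA backend keep bf16/fp16 accumulations instead
    # of upcasting to fp32, trading a little accuracy for throughput.
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Toy attention call that the setting would affect when the math backend is picked.
q = k = v = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)
print(F.scaled_dot_product_attention(q, k, v).shape)
```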

examples/question-answering/README.md

+59 -8

@@ -44,7 +44,30 @@ For the following cases, an example of a Gaudi configuration file is given
 This example code fine-tunes BERT on the SQuAD1.1 dataset.

 ```bash
-PT_HPU_LAZY_MODE=0 python run_qa.py \
+python run_qa.py \
+--model_name_or_path bert-large-uncased-whole-word-masking \
+--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 32 \
+--per_device_eval_batch_size 8 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/squad/ \
+--use_habana \
+--use_lazy_mode \
+--use_hpu_graphs_for_inference \
+--throughput_warmup_steps 3 \
+--bf16 \
+--sdp_on_bf16
+```
+
+For torch.compile mode,
+```bash
+PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \
 --model_name_or_path bert-large-uncased-whole-word-masking \
 --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
 --dataset_name squad \

@@ -62,16 +85,40 @@ PT_HPU_LAZY_MODE=0 python run_qa.py \
 --torch_compile \
 --use_lazy_mode false \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

-
 ### Multi-card Training

 Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs:

 ```bash
-PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
+python ../gaudi_spawn.py \
+--world_size 8 --use_mpi run_qa.py \
+--model_name_or_path bert-large-uncased-whole-word-masking \
+--gaudi_config_name Habana/bert-large-uncased-whole-word-masking \
+--dataset_name squad \
+--do_train \
+--do_eval \
+--per_device_train_batch_size 32 \
+--per_device_eval_batch_size 8 \
+--learning_rate 3e-5 \
+--num_train_epochs 2 \
+--max_seq_length 384 \
+--doc_stride 128 \
+--output_dir /tmp/squad_output/ \
+--use_habana \
+--use_lazy_mode \
+--use_hpu_graphs_for_inference \
+--throughput_warmup_steps 3 \
+--bf16 \
+--sdp_on_bf16
+```
+
+For torch.compile mode,
+```bash
+PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \
 --world_size 8 --use_mpi run_qa.py \
 --model_name_or_path bert-large-uncased-whole-word-masking \
 --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \

@@ -90,7 +137,8 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \
 --torch_compile \
 --use_lazy_mode false \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```


@@ -117,7 +165,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed path_to_my_deepspeed_config
+--deepspeed path_to_my_deepspeed_config \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -175,7 +224,8 @@ python ../gaudi_spawn.py \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
 --max_train_samples 45080 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```


@@ -197,7 +247,8 @@ python run_qa.py \
 --use_habana \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```


examples/speech-recognition/README.md

+8 -4

@@ -87,7 +87,8 @@ python run_speech_recognition_ctc.py \
 --throughput_warmup_steps="3" \
 --bf16 \
 --use_hpu_graphs_for_training \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```

 On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.

@@ -128,7 +129,8 @@ python ../gaudi_spawn.py \
 --throughput_warmup_steps 3 \
 --bf16 \
 --use_hpu_graphs_for_training \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```

 On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.

@@ -176,7 +178,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --gaudi_config_name Habana/wav2vec2 \
 --throughput_warmup_steps 3 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```

 [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.

@@ -208,7 +211,8 @@ python run_speech_recognition_ctc.py \
 --use_lazy_mode \
 --gaudi_config_name="Habana/wav2vec2" \
 --bf16 \
---use_hpu_graphs_for_inference
+--use_hpu_graphs_for_inference \
+--sdp_on_bf16
 ```
 ## Sequence to Sequence


examples/stable-diffusion/text_to_image_generation.py

+1

@@ -570,6 +570,7 @@ def main():
         args.model_name_or_path,
         **kwargs,
     )
+    pipeline.unet.set_default_attn_processor(pipeline.unet)

     if args.unet_adapter_name_or_path is not None:
         from peft import PeftModel

examples/text-classification/README.md

+12 -6

@@ -60,7 +60,8 @@ python run_glue.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 > If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.

@@ -88,7 +89,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 > If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.

@@ -116,7 +118,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed path_to_my_deepspeed_config
+--deepspeed path_to_my_deepspeed_config \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -160,7 +163,8 @@ python run_glue.py \
 --use_habana \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

 ## Llama Guard on MRPC

@@ -190,7 +194,8 @@ python ../gaudi_spawn.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 3 \
---deepspeed ../../tests/configs/deepspeed_zero_2.json
+--deepspeed ../../tests/configs/deepspeed_zero_2.json \
+--sdp_on_bf16
 ```

 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.

@@ -216,5 +221,6 @@ python run_glue.py \
 --use_lazy_mode \
 --use_hpu_graphs_for_inference \
 --throughput_warmup_steps 2 \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```

examples/text-generation/run_lm_eval.py

-1

@@ -217,7 +217,6 @@ def main():
         for k, v in mem.items():
             print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v))
     json.dump(results, open(args.output_file, "w"), indent=2)
-    print(json.dumps(results, indent=2))
     if args.quant_config:
         finalize_quantization(model)

examples/text-generation/utils.py

+5 -6

@@ -605,13 +605,12 @@ def setup_tokenizer(args, model, assistant_model, logger):
         tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id)
         tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id)

-    # HACK: MiniCPM3 has multiple eos_tokens and does not specify padding token. Set both to second one.
-    if model.config.model_type == "minicpm3":
-        tokenizer.pad_token = tokenizer.eos_token
-        model.generation_config.pad_token_id = model.generation_config.eos_token_id[-1]
+    # HACK: MiniCPM3 does not support list EOS token ID generation config.
+    if model.config.model_type == "minicpm3" and isinstance(model.generation_config.eos_token_id, list):
+        logger.warning(
+            f"Model type {model.config.model_type} does not support list style EOS token ID in generation config. Only last eos token id will be used."
+        )
         model.generation_config.eos_token_id = model.generation_config.eos_token_id[-1]
-        if len(model.generation_config.eos_token_id) > 1:
-            logger.warning("Multiple EOS token IDs found. Only last eos token id will be used.")

     # Some models like GPT2 do not have a PAD token so we have to set it if necessary
     if tokenizer.pad_token is None:
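The rewritten MiniCPM3 hack above only triggers when `generation_config.eos_token_id` is actually a list. A minimal sketch of the same normalization, with a stand-in config object instead of transformers' `GenerationConfig` (the class name and example token IDs are illustrative, not from the commit):

```python
# Sketch only: normalize a list-valued eos_token_id the way the diff above does.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class FakeGenerationConfig:
    """Stand-in for transformers' GenerationConfig, holding only eos_token_id."""

    def __init__(self, eos_token_id):
        self.eos_token_id = eos_token_id


def normalize_eos(generation_config, model_type="minicpm3"):
    # Mirror the diff: only touch models that cannot handle a list of EOS ids.
    if model_type == "minicpm3" and isinstance(generation_config.eos_token_id, list):
        logger.warning(
            f"Model type {model_type} does not support list style EOS token ID in generation config. "
            "Only last eos token id will be used."
        )
        generation_config.eos_token_id = generation_config.eos_token_id[-1]
    return generation_config


cfg = normalize_eos(FakeGenerationConfig([2, 73440]))
print(cfg.eos_token_id)  # -> 73440
```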

optimum/habana/accelerate/utils/transformer_engine.py

+14 -1

@@ -91,7 +91,20 @@ def __init__(self):
                     enable_recompute=module.enable_recompute,
                 )

-            def forward(self, query, key, value, attn_mask, dropout_p, is_causal, scale, softmax_mode):
+            def forward(
+                self,
+                query,
+                key,
+                value,
+                attn_mask,
+                dropout_p,
+                is_causal,
+                scale,
+                softmax_mode,
+                recompute_mode,
+                valid_sequence_lengths,
+                padding_side="left",
+            ):
                 return self._hpu_kernel_fsdpa(query, key, value, attn_mask, is_causal, softmax_mode)

         setattr(model, name, TE_ModuleFusedSDPA())
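The widened `forward` signature above keeps the TE wrapper call-compatible with callers that now pass `recompute_mode`, `valid_sequence_lengths`, and `padding_side`, while the wrapped kernel still receives only the subset of arguments it uses. A rough sketch of that pattern, using `torch.nn.functional.scaled_dot_product_attention` as a stand-in for the real HPU fused kernel (the toy class and tensor shapes are assumptions, not the library's code):

```python
# Sketch only: accept the full FusedSDPA-style argument list, forward a subset.
import torch
import torch.nn.functional as F


class ToyFusedSDPA(torch.nn.Module):
    def forward(
        self,
        query,
        key,
        value,
        attn_mask,
        dropout_p,
        is_causal,
        scale,
        softmax_mode,
        recompute_mode,
        valid_sequence_lengths,
        padding_side="left",
    ):
        # Extra arguments (dropout_p, scale, softmax_mode, recompute_mode,
        # valid_sequence_lengths, padding_side) are accepted so call sites do not
        # break, but this toy kernel ignores them, like the TE wrapper above.
        return F.scaled_dot_product_attention(query, key, value, attn_mask=attn_mask, is_causal=is_causal)


q = k = v = torch.randn(1, 4, 8, 16)
out = ToyFusedSDPA()(q, k, v, None, 0.0, True, None, "fast", False, None)
print(out.shape)
```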
