From 9498c7c1a2ebc5fae13b64cac8d006dae287449e Mon Sep 17 00:00:00 2001
From: Yeonsil Yoon
Date: Sun, 8 Dec 2024 02:23:21 -0800
Subject: [PATCH] Update README commands for more models to use --sdp_on_bf16
 (#1575)

Co-authored-by: Libin Tang
---
 examples/image-to-text/README.md       | 45 +++++++++++++++++---
 examples/question-answering/README.md  | 11 +------
 examples/speech-recognition/README.md  | 18 +++++------
 examples/text-classification/README.md |  6 ++--
 examples/text-generation/README.md     | 33 ++++++++++++-------
 tests/test_text_generation_example.py  |  1 -
 6 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md
index 51f4a5dda2..e4dbb05472 100644
--- a/examples/image-to-text/README.md
+++ b/examples/image-to-text/README.md
@@ -44,7 +44,8 @@ python3 run_pipeline.py \
     --model_name_or_path Salesforce/blip-image-captioning-large \
     --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-1.5-7b inference, use the following command:
@@ -52,7 +53,8 @@ To run Llava-1.5-7b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-1.5-13b inference, use the following command:
@@ -60,7 +62,8 @@ To run Llava-1.5-13b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-13b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-v1.6-mistral-7b inference, use the following command:
@@ -68,7 +71,8 @@ To run Llava-v1.6-mistral-7b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-v1.6-vicuna-13b inference, use the following command:
@@ -76,7 +80,8 @@ To run Llava-v1.6-vicuna-13b inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
@@ -84,7 +89,8 @@ To run Llava-hf/llava-v1.6-34b-hf inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-34b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run google/paligemma-3b-mix-224 inference, use the following command:
@@ -92,7 +98,8 @@ To run google/paligemma-3b-mix-224 inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path google/paligemma-3b-mix-224 \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
@@ -100,7 +107,8 @@ To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command:
 python3 run_pipeline.py \
     --model_name_or_path llava-hf/llama3-llava-next-8b-hf \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run idefics2 inference, use the following command:
@@ -109,7 +117,8 @@
 python3 run_pipeline.py \
     --model_name_or_path HuggingFaceM4/idefics2-8b \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 To run mllama inference using reduced precision in the SDPA, use the following command:
@@ -134,7 +143,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-1.5-7b:
@@ -143,7 +153,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-1.5-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 
@@ -153,7 +164,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b:
@@ -162,7 +174,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b:
@@ -171,7 +184,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b:
@@ -180,7 +194,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r
     --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \
     --image_path "https://llava-vl.github.io/static/images/view.jpg" \
     --use_hpu_graphs \
-    --bf16
+    --bf16 \
+    --sdp_on_bf16
 ```
 
 ### Inference with FusedSDPA
diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md
index bf6cd04aec..654a9e02ad 100755
--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -190,14 +190,6 @@ Here is a DeepSpeed configuration you can use to train your models on Gaudi:
 }
 ```
 
-
-### Training in torch.compile mode
-
-Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \
-a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \
-b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags.
-
-
 ## Fine-tuning Llama on SQuAD1.1
 
 > [!NOTE]
@@ -224,8 +216,7 @@ python ../gaudi_spawn.py \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 3 \
     --max_train_samples 45080 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 
diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md
index 4d5eb69b91..02e4b53d66 100644
--- a/examples/speech-recognition/README.md
+++ b/examples/speech-recognition/README.md
@@ -87,8 +87,7 @@ python run_speech_recognition_ctc.py \
     --throughput_warmup_steps="3" \
     --bf16 \
     --use_hpu_graphs_for_training \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.
@@ -129,8 +128,7 @@ python ../gaudi_spawn.py \
     --throughput_warmup_steps 3 \
     --bf16 \
     --use_hpu_graphs_for_training \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.
@@ -178,8 +176,7 @@ python ../gaudi_spawn.py \
     --use_lazy_mode \
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana.
@@ -211,8 +208,7 @@ python run_speech_recognition_ctc.py \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
     --bf16 \
-    --use_hpu_graphs_for_inference \
-    --sdp_on_bf16
+    --use_hpu_graphs_for_inference
 ```
 
 ## Sequence to Sequence
@@ -259,7 +255,8 @@ python run_speech_recognition_seq2seq.py \
     --use_hpu_graphs_for_inference \
     --label_features_max_length 128 \
     --dataloader_num_workers 8 \
-    --throughput_warmup_steps 3
+    --throughput_warmup_steps 3 \
+    --sdp_on_bf16
 ```
 
 If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition.
@@ -329,5 +326,6 @@ python run_speech_recognition_seq2seq.py \
     --use_habana \
     --use_hpu_graphs_for_inference \
     --label_features_max_length 128 \
-    --dataloader_num_workers 8
+    --dataloader_num_workers 8 \
+    --sdp_on_bf16
 ```
diff --git a/examples/text-classification/README.md b/examples/text-classification/README.md
index 9ffc78ae43..32bc3fd5f8 100644
--- a/examples/text-classification/README.md
+++ b/examples/text-classification/README.md
@@ -194,8 +194,7 @@ python ../gaudi_spawn.py \
     --use_lazy_mode \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 3 \
-    --deepspeed ../../tests/configs/deepspeed_zero_2.json \
-    --sdp_on_bf16
+    --deepspeed ../../tests/configs/deepspeed_zero_2.json
 ```
 
 You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana.
@@ -221,6 +220,5 @@ python run_glue.py \
     --use_lazy_mode \
     --use_hpu_graphs_for_inference \
     --throughput_warmup_steps 2 \
-    --bf16 \
-    --sdp_on_bf16
+    --bf16
 ```
diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 5e42af689c..e2e3605fc8 100755
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -79,7 +79,8 @@ python run_generation.py \
 --use_kv_cache \
 --max_new_tokens 100 \
 --do_sample \
---prompt "Here is my prompt"
+--prompt "Here is my prompt" \
+--sdp_on_bf16
 ```
 
 If you want to provide several prompts as inputs, here is how to do it:
@@ -91,7 +92,8 @@ python run_generation.py \
 --max_new_tokens 100 \
 --do_sample \
 --batch_size 2 \
---prompt "Hello world" "How are you?"
+--prompt "Hello world" "How are you?" \
+--sdp_on_bf16
 ```
 
 > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size.
@@ -110,7 +112,8 @@ python run_generation.py \
 --use_kv_cache \
 --num_return_sequences 1 \
 --temperature 0 \
---prompt "Alice and Bob"
+--prompt "Alice and Bob" \
+--sdp_on_bf16
 ```
 
 ### Benchmark
@@ -137,7 +140,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --batch_size 1 \
 --use_hpu_graphs \
 --use_kv_cache \
---max_new_tokens 100
+--max_new_tokens 100 \
+--sdp_on_bf16
 ```
 
 You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command:
@@ -152,7 +156,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --attn_softmax_bf16 \
 --limit_hpu_graphs \
 --reuse_cache \
---trim_logits
+--trim_logits \
+--sdp_on_bf16
 ```
 
 To run Falcon-7B inference, use the following command:
@@ -164,7 +169,8 @@ python run_generation.py \
  --use_kv_cache \
  --batch_size 1 \
  --max_new_tokens 128 \
- --do_sample
+ --do_sample \
+ --sdp_on_bf16
 ```
 
 To run Falcon-40B inference on 8 Gaudi2 cards, use the following command:
@@ -195,7 +201,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 > --use_hpu_graphs \
 > --use_kv_cache \
 > --max_new_tokens 100 \
-> --bf16
+> --bf16 \
+> --sdp_on_bf16
 > ```
 
 ### Use any dataset from the Hugging Face Hub
@@ -214,7 +221,8 @@ python run_generation.py \
 --use_kv_cache \
 --dataset_name JulesBelveze/tldr_news \
 --column_name content \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated.
@@ -233,7 +241,8 @@ python run_generation.py \
 --bf16 \
 --max_new_tokens 100 \
 --prompt "Here is my prompt" \
---peft_model yard1/llama-2-7b-sql-lora-test
+--peft_model yard1/llama-2-7b-sql-lora-test \
+--sdp_on_bf16
 ```
 
 ### Using growing bucket optimization
@@ -490,7 +499,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py
 --max_new_tokens 100 \
 --batch_size 1 \
 --reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 Here is an example to quantize the model based on previous measurements for gemma with 1 card:
@@ -502,7 +512,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation
 --max_new_tokens 100 \
 --batch_size 1 \
 --reuse_cache \
---bf16
+--bf16 \
+--sdp_on_bf16
 ```
 
 
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index 0768e4a746..98f9a692c3 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -221,7 +221,6 @@ def _test_text_generation(
 
     if "gemma" in model_name.lower():
         command += ["--use_flash_attention"]
-        command += ["--sdp_on_bf16"]
 
     if "decilm" in model_name.lower():
         command += ["--sdp_on_bf16"]
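
For readers who want to try the change locally, the sketch below applies the patch and runs one of the updated commands. The command itself is taken verbatim from the image-to-text hunk above; the patch filename passed to `git am` is hypothetical (use whatever name the file was saved under), and an optimum-habana checkout with the example's requirements installed is assumed.

```bash
# Apply the patch on top of an optimum-habana checkout.
# NOTE: the .patch filename below is hypothetical; use the actual file name.
git am 0001-Update-README-commands-for-more-models-to-use-sdp_on_bf16.patch

# Run the updated BLIP captioning example from examples/image-to-text/README.md.
# --sdp_on_bf16 enables reduced (bf16) precision in the scaled dot-product
# attention, complementing the --bf16 flag used for the rest of the model.
cd examples/image-to-text
python3 run_pipeline.py \
    --model_name_or_path Salesforce/blip-image-captioning-large \
    --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \
    --use_hpu_graphs \
    --bf16 \
    --sdp_on_bf16
```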