diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 51f4a5dda2..e4dbb05472 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -44,7 +44,8 @@ python3 run_pipeline.py \ --model_name_or_path Salesforce/blip-image-captioning-large \ --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-1.5-7b inference, use the following command: @@ -52,7 +53,8 @@ To run Llava-1.5-7b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-7b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-1.5-13b inference, use the following command: @@ -60,7 +62,8 @@ To run Llava-1.5-13b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-13b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-v1.6-mistral-7b inference, use the following command: @@ -68,7 +71,8 @@ To run Llava-v1.6-mistral-7b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-v1.6-vicuna-13b inference, use the following command: @@ -76,7 +80,8 @@ To run Llava-v1.6-vicuna-13b inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: @@ -84,7 +89,8 @@ To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-34b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run google/paligemma-3b-mix-224 inference, use the following command: @@ -92,7 +98,8 @@ To run google/paligemma-3b-mix-224 inference, use the following command: python3 run_pipeline.py \ --model_name_or_path google/paligemma-3b-mix-224 \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: @@ -100,7 +107,8 @@ To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: python3 run_pipeline.py \ --model_name_or_path llava-hf/llama3-llava-next-8b-hf \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run idefics2 inference, use the following command: @@ -109,7 +117,8 @@ To run idefics2 inference, use the following command: python3 run_pipeline.py \ --model_name_or_path HuggingFaceM4/idefics2-8b \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` To run mllama inference using reduced precision in the SDPA, use the following command: @@ -134,7 +143,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-1.5-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-1.5-7b: @@ -143,7 +153,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-1.5-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` @@ -153,7 +164,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json 
python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b: @@ -162,7 +174,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b: @@ -171,7 +184,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b: @@ -180,7 +194,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --sdp_on_bf16 ``` ### Inference with FusedSDPA diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index bf6cd04aec..4c9b42f7c0 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -190,14 +190,6 @@ Here is a DeepSpeed configuration you can use to train your models on Gaudi: } ``` - -### Training in torch.compile mode - -Albert XXL model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, \ -a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. \ -b) Run the above commands with `--model_name_or_path albert-xxlarge-v1`, `--use_lazy_mode False` and add `--torch_compile`, `--torch_compile_backend hpu_backend` and remove `--use_hpu_graphs_for_inference` flags. - - ## Fine-tuning Llama on SQuAD1.1 > [!NOTE] diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md index 4d5eb69b91..998d6433bb 100644 --- a/examples/speech-recognition/README.md +++ b/examples/speech-recognition/README.md @@ -259,7 +259,8 @@ python run_speech_recognition_seq2seq.py \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ --dataloader_num_workers 8 \ - --throughput_warmup_steps 3 + --throughput_warmup_steps 3 \ + --sdp_on_bf16 ``` If training on a different language, you should be sure to change the `language` argument. The `language` and `task` arguments should be omitted for English speech recognition. 
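Every `--sdp_on_bf16` flag added in this patch maps to a single PyTorch switch (wired up below in `run_generation.py`): it lets the SDPA math backend keep its reductions in reduced precision instead of fp32. A minimal sketch of what the flag turns on, assuming a recent PyTorch build that exposes the private hook this patch relies on:

```python
# Minimal sketch, assuming a recent PyTorch that exposes the private hook used by this patch.
import torch
import torch.nn.functional as F


def enable_sdp_on_bf16(enabled: bool = True) -> None:
    # Same call run_generation.py makes when --sdp_on_bf16 is passed:
    # allow the SDPA math backend to accumulate in fp16/bf16 instead of fp32.
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(enabled)


if __name__ == "__main__":
    enable_sdp_on_bf16(True)
    q, k, v = (torch.randn(1, 8, 128, 64, dtype=torch.bfloat16) for _ in range(3))
    # With the switch on, the math backend may keep the attention reductions in bf16.
    out = F.scaled_dot_product_attention(q, k, v)
    print(out.shape)
```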
@@ -329,5 +330,6 @@ python run_speech_recognition_seq2seq.py \ --use_habana \ --use_hpu_graphs_for_inference \ --label_features_max_length 128 \ - --dataloader_num_workers 8 + --dataloader_num_workers 8 \ + --sdp_on_bf16 ``` diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 6da9bc8470..e2e3605fc8 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -79,7 +79,8 @@ python run_generation.py \ --use_kv_cache \ --max_new_tokens 100 \ --do_sample \ ---prompt "Here is my prompt" +--prompt "Here is my prompt" \ +--sdp_on_bf16 ``` If you want to provide several prompts as inputs, here is how to do it: @@ -91,7 +92,8 @@ python run_generation.py \ --max_new_tokens 100 \ --do_sample \ --batch_size 2 \ ---prompt "Hello world" "How are you?" +--prompt "Hello world" "How are you?" \ +--sdp_on_bf16 ``` > The batch size should be larger than or equal to the number of prompts. Otherwise, only the first N prompts are kept with N being equal to the batch size. @@ -110,7 +112,8 @@ python run_generation.py \ --use_kv_cache \ --num_return_sequences 1 \ --temperature 0 \ ---prompt "Alice and Bob" +--prompt "Alice and Bob" \ +--sdp_on_bf16 ``` ### Benchmark @@ -137,7 +140,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --batch_size 1 \ --use_hpu_graphs \ --use_kv_cache \ ---max_new_tokens 100 +--max_new_tokens 100 \ +--sdp_on_bf16 ``` You can also run Llama2-70B on Gaudi2 with all optimizations enabled using the following command: @@ -152,7 +156,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --attn_softmax_bf16 \ --limit_hpu_graphs \ --reuse_cache \ ---trim_logits +--trim_logits \ +--sdp_on_bf16 ``` To run Falcon-7B inference, use the following command: @@ -164,7 +169,8 @@ python run_generation.py \ --use_kv_cache \ --batch_size 1 \ --max_new_tokens 128 \ - --do_sample + --do_sample \ + --sdp_on_bf16 ``` To run Falcon-40B inference on 8 Gaudi2 cards, use the following command: @@ -195,7 +201,8 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ > --use_hpu_graphs \ > --use_kv_cache \ > --max_new_tokens 100 \ -> --bf16 +> --bf16 \ +> --sdp_on_bf16 > ``` ### Use any dataset from the Hugging Face Hub @@ -214,7 +221,8 @@ python run_generation.py \ --use_kv_cache \ --dataset_name JulesBelveze/tldr_news \ --column_name content \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated. @@ -233,7 +241,8 @@ python run_generation.py \ --bf16 \ --max_new_tokens 100 \ --prompt "Here is my prompt" \ ---peft_model yard1/llama-2-7b-sql-lora-test +--peft_model yard1/llama-2-7b-sql-lora-test \ +--sdp_on_bf16 ``` ### Using growing bucket optimization @@ -490,7 +499,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_generation.py --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` Here is an example to quantize the model based on previous measurements for gemma with 1 card: @@ -502,7 +512,8 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_gemma.json python run_generation --max_new_tokens 100 \ --batch_size 1 \ --reuse_cache \ ---bf16 +--bf16 \ +--sdp_on_bf16 ``` @@ -512,14 +523,13 @@ Some bf16 models don't fit on one card due to hpu memory limitation, but in fp8 As measurement is being calculated in bf16 precision, to be able to run fp8 model on single card you should use `unify_measurements` script. Here are the steps: 1. 
Measure the model on a number of cards that are enough for the model to fit in BF16. -2. Quantize the model on the same amount of cards for scales to be saved. -3. Run unify_measurements.py script using the measurement files created after running steps 1 and 2. A unified measurement is then calculated. +2. Run unify_measurements.py script using the measurement files created in step 1. A unified measurement is then calculated. ```bash python quantization_tools/unify_measurements.py -g 01234567 -m *path_to_8x_measurements* -o *path_to_output_1x_measurement* ``` In the above example, the measurements of cards 0-7 will be unified to a single measurement. For example, if you specify `-g 0123 4567`, cards 0-3 and cards 4-7 will be unified in two different measurement files. All different group combinations are supported. -4. Run quantization using the unified measurement file/s. +3. Run quantization using the unified measurement file/s. More information on usage of the unifier script can be found in fp8 Habana docs: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py index 4282e4ac49..de2b086c2a 100644 --- a/examples/text-generation/quantization_tools/unify_measurements.py +++ b/examples/text-generation/quantization_tools/unify_measurements.py @@ -6,49 +6,45 @@ import numpy as np -def find_measurement_path(measurement, measurements_dir_path, scales, group_size): +def find_measurement_path(measurement, measurements_dir_path, group_size): measurment_card = measurement + "_" + str(group_size) for measurment_file in os.listdir(measurements_dir_path): filename = os.fsdecode(measurment_file) if not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename: continue - if scales: - if "MAXABS" in filename: - return os.path.join(measurements_dir_path, measurment_file) - else: - if "MAXABS" not in filename: - return os.path.join(measurements_dir_path, measurment_file) + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) -def unify_measurements( - measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index, scales=False -): + +def unify_measurements(measurement_group, measurements_dir_path, output_path, groups_size, groups_num, group_index): measurements_paths = [] group_name = "" # save all the jsons paths in the given measurement group for measurement in measurement_group: - measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) - measurements_paths.append(measurement_path) + measurement_path = find_measurement_path(measurement, measurements_dir_path, groups_size) + if measurement_path is not None: + measurements_paths.append(measurement_path) group_name += measurement - # save all the jsons content in the given measurement group measurements_jsons = [] for measurement_path in measurements_paths: - with open(measurement_path, "r") as f: - js = json.load(f) - measurements_jsons.append(js["Nodes"]) + if measurement_path is not None: + with open(measurement_path, "r") as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) # create a name for the unified json that will be created for this measurement group if groups_num == 1: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + 
find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace("_" + measurement_group[0] + "_" + str(groups_size), "") ) else: unified_json_name = ( - find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + find_measurement_path(measurement_group[0], measurements_dir_path, groups_size) .split("/")[-1] .replace( "_" + measurement_group[0] + "_" + str(groups_size), "_" + str(group_index) + "_" + str(groups_num) @@ -74,70 +70,27 @@ def unify_measurements( max_weight = node_values["params"]["weight"] # iterate over all the measurment group and take the maximum for each tensor and its channel - if scales: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) - if max_outputs is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - max_outputs[i][j] = max( - measurement_json[node_name]["outputs"][i][j], max_outputs[i][j] - ) - else: - for i in range(0, len(max_outputs)): - max_outputs[i] = max(measurement_json[node_name]["outputs"][i], max_outputs[i]) - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - max_weight[key][i] = max( - measurement_json[node_name]["params"]["weight"][key][i], max_weight[key][i] - ) - else: - max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) - else: - for measurement_json in measurements_jsons: - for i in range(0, len(max_inputs)): - for j in range(0, len(max_inputs[i])): - max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) - if max_outputs is not None: - for i in range(0, len(max_outputs)): - max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) - if max_weight is not None: - for i in range(0, len(max_weight)): - max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) - - # update the maximum in the unified json - if scales: - for i in range(0, len(max_inputs)): - unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] - if max_outputs is not None: - if isinstance(max_outputs[0], list): - for i in range(0, len(max_outputs)): - for j in range(0, len(max_outputs[i])): - unified_json["Nodes"][node_name]["outputs"][i][j] = max_outputs[i][j] - else: - for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i] = max_outputs[i] - if max_weight is not None: - if isinstance(max_weight, dict): - for key, values in max_weight.items(): - for i in range(0, len(values)): - unified_json["Nodes"][node_name]["params"]["weight"][key][i] = max_weight[key][i] - else: - unified_json["Nodes"][node_name]["params"]["weight"] = max_weight - else: + for measurement_json in measurements_jsons: for i in range(0, len(max_inputs)): for j in range(0, len(max_inputs[i])): - unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) if max_outputs is not None: for i in range(0, len(max_outputs)): - unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) if max_weight is not None: for i in range(0, len(max_weight)): - 
unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] global_rank = None local_rank = group_index if groups_num != 1 else -1 mode = "" @@ -153,10 +106,10 @@ def unify_measurements( layers[layer] = {} layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] if dlayer.get("outputs") is not None: - layers[layer]["outputs"] = np.array(dlayer["outputs"]) + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: layers[layer]["params"] = {} - layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + layers[layer]["params"]["weight"] = [np.array(x) for x in dlayer["params"]["weight"]] df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} with open(unified_npz_path, "w"): np.savez(unified_npz_path, df) @@ -196,26 +149,14 @@ def main(args): groups = args.groups num_jsons_drange = 0 - num_jsons_scales = 0 for path in os.listdir(measurements_path): - if path.endswith(".json"): - if "MAXABS" in path: - num_jsons_scales += 1 - elif "mod_list" not in path: - num_jsons_drange += 1 - assert ( - os.path.isdir(measurements_path) - and (num_jsons_drange % len(groups)) == 0 - and (num_jsons_scales % len(groups)) == 0 - ) + if path.endswith(".json") and "MAXABS" not in path and "mod_list" not in path: + num_jsons_drange += 1 + + assert os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0 for group_index, group in enumerate(groups): - unify_measurements( - group, measurements_path, output_path, num_jsons_drange, len(groups), group_index, scales=False - ) - unify_measurements( - group, measurements_path, output_path, num_jsons_scales, len(groups), group_index, scales=True - ) + unify_measurements(group, measurements_path, output_path, num_jsons_drange, len(groups), group_index) print("finished measurement unifier script") diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index 4b2ab96842..a057261b32 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -320,6 +320,9 @@ def setup_parser(parser): action="store_true", help="Run the inference with dataset for specified --n_iterations(default:5)", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) quant_parser_group = parser.add_mutually_exclusive_group() quant_parser_group.add_argument( @@ -389,6 +392,9 @@ def main(): import habana_frameworks.torch.hpu as torch_hpu + if args.sdp_on_bf16: + torch._C._set_math_sdp_allow_fp16_bf16_reduction(True) + if args.dataset_name is None: # Benchmark over the prompts below if args.prompt: diff --git a/optimum/habana/diffusers/models/unet_2d_condition.py b/optimum/habana/diffusers/models/unet_2d_condition.py index 7bb641bbf1..1218c0fc65 100644 --- 
a/optimum/habana/diffusers/models/unet_2d_condition.py +++ b/optimum/habana/diffusers/models/unet_2d_condition.py @@ -1,3 +1,4 @@ +import os from typing import Any, Dict, Optional, Tuple, Union import torch @@ -5,7 +6,12 @@ from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, torch_utils, unscale_lora_layers -from optimum.habana.diffusers.utils.torch_utils import gaudi_fourier_filter +from ..utils.torch_utils import gaudi_fourier_filter +from .attention_processor import ( + AttentionProcessor, + AttnProcessor2_0, + ScaledDotProductAttention, +) logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -357,3 +363,50 @@ def gaudi_unet_2d_condition_model_forward( return (sample,) return UNet2DConditionOutput(sample=sample) + + +def set_attn_processor_hpu(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + """ + Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + Added env PATCH_SDPA for HPU specific handle to use ScaledDotProductAttention. + Sets the attention processor to use to compute attention. + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + """ + count = len(self.attn_processors.keys()) + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if os.environ.get("PATCH_SDPA") is not None: + setattr(module, "attention_module", ScaledDotProductAttention()) + module.set_processor(processor(module.attention_module)) + else: + if isinstance(processor, dict): + attention_processor = processor.pop(f"{name}.processor", None) + if attention_processor is not None: + module.set_processor(attention_processor) + else: + module.set_processor(processor) + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + +def set_default_attn_processor_hpu(self): + """ + Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + Disables custom attention processors and sets the default attention implementation from HPU. 
+ """ + processor = AttnProcessor2_0() + set_attn_processor_hpu(self, processor) diff --git a/optimum/habana/diffusers/pipelines/pipeline_utils.py b/optimum/habana/diffusers/pipelines/pipeline_utils.py index 5051764887..6dda26f796 100644 --- a/optimum/habana/diffusers/pipelines/pipeline_utils.py +++ b/optimum/habana/diffusers/pipelines/pipeline_utils.py @@ -28,10 +28,10 @@ from diffusers.utils.torch_utils import is_compiled_module from huggingface_hub import create_repo -from optimum.habana.utils import to_device_dtype from optimum.utils import logging from ...transformers.gaudi_configuration import GaudiConfig +from ...utils import to_device_dtype logger = logging.get_logger(__name__) diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 630bc9c18b..f9c9907d39 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -39,6 +39,7 @@ from ....transformers.gaudi_configuration import GaudiConfig from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment +from ...models.unet_2d_condition import set_default_attn_processor_hpu from ..pipeline_utils import GaudiDiffusionPipeline diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index f4a0dbd244..6a1b74d129 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import time from dataclasses import dataclass from math import ceil @@ -38,6 +37,7 @@ from ....transformers.gaudi_configuration import GaudiConfig from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment +from ...models.unet_2d_condition import set_default_attn_processor_hpu from ..pipeline_utils import GaudiDiffusionPipeline from ..stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps @@ -138,6 +138,8 @@ def __init__( force_zeros_for_empty_prompt, ) + self.unet.set_default_attn_processor = set_default_attn_processor_hpu + self.to(self._device) def prepare_latents(self, num_images, num_channels_latents, height, width, dtype, device, generator, latents=None): diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py index 0e1378ee57..4608a56d3f 100644 --- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -15,6 +15,7 @@ # limitations under the License. 
import os +import random from typing import Optional, Tuple, Union import torch @@ -68,7 +69,7 @@ def _gaudi_wav2vec2_compute_mask_indices( ) # epsilon is used for probabilistic rounding - epsilon = torch.rand([], device="hpu") + epsilon = torch.rand(1).item() def compute_num_masked_span(input_length): """Given input length, compute how many spans should be masked""" @@ -106,19 +107,9 @@ def compute_num_masked_span(input_length): num_masked_span = compute_num_masked_span(input_length) # get random indices to mask - """ - Original code: - spec_aug_mask_idx = np.random.choice( - np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + spec_aug_mask_idx = torch.tensor( + random.sample(range(input_length - (mask_length - 1)), num_masked_span), dtype=torch.int32 ) - When (input_length - (mask_length - 1) < 0), then num_masked_span=0 - and we get: spec_aug_mask_idx=array([], dtype=int64) - However torch rewrite fails, because torch.randperm expects positive number - This causes a unit test to fail: - RUN_SLOW=true GAUDI2_CI=1 python -m pytest tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py -v -s -k test_compute_mask_indices_short_audio - """ - spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span] - # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. @@ -133,13 +124,12 @@ def compute_num_masked_span(input_length): spec_aug_mask_idx = torch.cat( [ spec_aug_mask_idx, - torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32, device="hpu") * dummy_mask_idx, + torch.ones(max_num_masked_span - num_masked_span, dtype=torch.int32) * dummy_mask_idx, ] ) spec_aug_mask_idxs.append(spec_aug_mask_idx.to(dtype=torch.long)) - spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs) - + spec_aug_mask_idxs = torch.vstack(spec_aug_mask_idxs).to("hpu") # expand masked indices to masked spans spec_aug_mask_idxs = torch.broadcast_to( spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) @@ -248,7 +238,7 @@ def gaudi_wav2vec2_encoder_forward( all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) - dropout_probability = torch.rand([], device="hpu") + dropout_probability = torch.rand([]) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000..d022b1f3e1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,113 @@ +absl-py==2.1.0 +accelerate==0.33.0 +aiohappyeyeballs==2.4.3 +aiohttp==3.11.7 +aiosignal==1.3.1 +alembic==1.14.0 +async-timeout==5.0.1 +attrs==24.2.0 +av==12.1.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.15.1 +charset-normalizer==3.4.0 +coloredlogs==15.0.1 +colorlog==6.9.0 +datasets==3.1.0 +diffusers==0.31.0 +dill==0.3.8 +exceptiongroup==1.2.2 +expecttest==0.2.1 +filelock==3.16.1 +frozenlist==1.5.0 +fsspec==2024.9.0 +gitdb==4.0.11 +GitPython==3.1.43 +google-auth==2.36.0 +google-auth-oauthlib==0.4.6 +greenlet==3.1.1 +grpcio==1.68.0 +huggingface-hub==0.26.3 +humanfriendly==10.0 +idna==3.10 +importlib_metadata==8.5.0 +iniconfig==2.0.0 +Jinja2==3.1.4 +joblib==1.4.2 +lightning==2.3.3 +lightning-habana==1.6.0 +lightning-utilities==0.11.9 +Mako==1.3.7 +Markdown==3.7 +MarkupSafe==3.0.2 
+mpi4py==3.1.6 +mpmath==1.3.0 +multidict==6.1.0 +multiprocess==0.70.16 +networkx==3.4.2 +numpy==1.23.5 +oauthlib==3.2.2 +optimum==1.23.3 +optuna==4.1.0 +packaging==24.2 +pandas==2.0.1 +parameterized==0.9.0 +pathspec==0.12.1 +peft==0.14.0 +perfetto==0.11.0 +pillow==11.0.0 +Pillow-SIMD==9.5.0.post1 +pluggy==1.5.0 +prettytable==3.9.0 +propcache==0.2.0 +protobuf==3.20.3 +psutil==6.0.0 +py-cpuinfo==9.0.0 +pyarrow==18.1.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pybind11==2.10.4 +pycparser==2.22 +pydantic==1.10.13 +pynvml==8.0.4 +pytest==7.4.4 +python-dateutil==2.9.0.post0 +pytorch-lightning==2.4.0 +pytz==2024.2 +PyYAML==6.0 +regex==2023.5.5 +requests==2.32.3 +requests-oauthlib==2.0.0 +rsa==4.9 +safetensors==0.4.5 +scikit-learn==1.5.2 +scipy==1.14.1 +sentence-transformers==3.2.1 +sentencepiece==0.2.0 +six==1.16.0 +smmap==5.0.1 +SQLAlchemy==2.0.36 +symengine==0.11.0 +sympy==1.12.1 +tdqm==0.0.1 +tensorboard==2.11.2 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +threadpoolctl==3.5.0 +timm==1.0.12 +tokenizers==0.20.3 +tomli==2.2.1 +torchmetrics==1.6.0 +torchsde==0.2.6 +tqdm==4.67.0 +trampoline==0.1.2 +transformers==4.45.2 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==1.26.20 +wcwidth==0.2.13 +Werkzeug==3.1.3 +xxhash==3.5.0 +yamllint==1.35.1 +yarl==1.18.0 +zipp==3.21.0 diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json index d1a563c9ff..055d321152 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/baselines/whisper_small.json @@ -41,10 +41,10 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "eval_wer": 0.3806988352745424, - "train_runtime": 312.5894, - "train_samples_per_second": 280.111, - "eval_samples_per_second": 19.073, + "eval_wer": 0.4693843594009983, + "train_runtime": 380.00, + "train_samples_per_second": 218.0, + "eval_samples_per_second": 31.0, "extra_arguments": [ "--dataset_config_name hi", "--language hindi", diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 49117dde62..76ee8658da 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -660,6 +660,7 @@ def test_no_throughput_regression_bf16(self): gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"), torch_dtype=torch.bfloat16, ) + pipeline.unet.set_default_attn_processor(pipeline.unet) set_seed(27) outputs = pipeline( prompt=prompts, @@ -1388,6 +1389,7 @@ def _sdxl_generation(self, scheduler: str, batch_size: int, num_images_per_promp "stabilityai/stable-diffusion-xl-base-1.0", **kwargs, ) + pipeline.unet.set_default_attn_processor(pipeline.unet) num_images_per_prompt = num_images_per_prompt res = {} outputs = pipeline( diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py index 27dd1b75c2..20d808b69f 100644 --- a/tests/test_encoder_decoder.py +++ b/tests/test_encoder_decoder.py @@ -206,6 +206,9 @@ def _test_text_translation( if "opus-mt-zh-en" in model_name: command_args.append("--max_source_length 512") + if "Babelscape/mrebel-large" in model_name or "nllb-200-distilled-600M" in model_name: + command_args.append("--sdp_on_bf16") + command = self._build_command( task=task, deepspeed=deepspeed, diff --git a/tests/test_examples.py b/tests/test_examples.py index 20f26f9012..6c9e234bd2 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -528,6 +528,26 @@ def test(self): env_variables["PT_HPU_LAZY_MODE"] = "0" env_variables["PT_ENABLE_INT64_SUPPORT"] = "1" + if self.EXAMPLE_NAME == "run_glue": + if model_name == "bert-large-uncased-whole-word-masking": + 
extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_qa": + if model_name == "bert-large-uncased-whole-word-masking" or \ + model_name == "albert-large-v2": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_bridgetower": + if model_name == "BridgeTower/bridgetower-large-itm-mlm-itc": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_speech_recognition_seq2seq": + if model_name == "openai/whisper-small": + extra_command_line_arguments.append("--sdp_on_bf16") + + if self.EXAMPLE_NAME == "run_clip": + extra_command_line_arguments.append("--sdp_on_bf16") + with TemporaryDirectory() as tmp_dir: cmd_line = self._create_command_line( multi_card, diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py index 6335f28ebf..180a2bb3f9 100644 --- a/tests/test_fsdp_examples.py +++ b/tests/test_fsdp_examples.py @@ -97,6 +97,7 @@ def _test_fsdp( f"--gaudi_config_name {gaudi_config}", "--throughput_warmup_steps 100", "--do_eval", + "--sdp_on_bf16", ] else: command += [ diff --git a/tests/test_image_to_text_example.py b/tests/test_image_to_text_example.py index c73d4d0565..36143e8f91 100644 --- a/tests/test_image_to_text_example.py +++ b/tests/test_image_to_text_example.py @@ -67,6 +67,11 @@ def _test_image_to_text( "--use_hpu_graphs", ] + if "meta-llama/Llama-3.2-11B-Vision-Instruct" in model_name or "tiiuae/falcon-11B-vlm" in model_name: + command += [ + "--sdp_on_bf16", + ] + command.append("--bf16") command.append("--sdp_on_bf16") diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 8a43ee81f8..8f27a2d34f 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -218,6 +218,12 @@ def _test_text_generation( if "gemma" in model_name.lower(): command += ["--use_flash_attention"] + if "decilm" in model_name.lower(): + command += ["--sdp_on_bf16"] + + if "mamba-130m-hf" in model_name.lower(): + command += ["--sdp_on_bf16"] + if (reuse_cache or torch_compile) and not parallel_strategy == "tp" and not is_starcoder_first_gen_model: command += ["--reuse_cache"]
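For context on the `unify_measurements.py` simplification earlier in this patch: with the scales path removed, the script only reduces the per-card dynamic-range JSONs with an element-wise maximum. A toy sketch of that reduction, assuming a simplified layout with only `inputs` (the real measurement files also carry `outputs` and `params`/`weight`, which are folded in the same way):

```python
# Toy sketch of the element-wise max-unification across cards.
# Assumption: simplified measurement layout with only "inputs" per node.
import copy


def unify(card_jsons):
    # Start from a deep copy of the first card, then fold in the others.
    unified = copy.deepcopy(card_jsons[0])
    for name, node in unified["Nodes"].items():
        for other in card_jsons[1:]:
            other_node = other["Nodes"][name]
            for i, channel_list in enumerate(node["inputs"]):
                for j in range(len(channel_list)):
                    node["inputs"][i][j][0] = max(
                        node["inputs"][i][j][0], other_node["inputs"][i][j][0]
                    )
    return unified


card0 = {"Nodes": {"lm_head": {"inputs": [[[0.7], [1.2]]]}}}
card1 = {"Nodes": {"lm_head": {"inputs": [[[0.9], [0.4]]]}}}
print(unify([card0, card1])["Nodes"]["lm_head"]["inputs"])  # [[[0.9], [1.2]]]
```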