diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md
index 7a095bc9ca..ff8d7951d1 100644
--- a/examples/contrastive-image-text/README.md
+++ b/examples/contrastive-image-text/README.md
@@ -115,6 +115,7 @@ PT_HPU_LAZY_MODE=0 python run_clip.py \
     --gaudi_config_name Habana/clip \
     --throughput_warmup_steps 3 \
     --dataloader_num_workers 16 \
+    --sdp_on_bf16 \
     --bf16 \
     --trust_remote_code \
     --torch_compile_backend=hpu_backend \
@@ -149,6 +150,7 @@ python ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \
     --dataloader_num_workers 16 \
     --mediapipe_dataloader \
     --bf16 \
+    --sdp_on_bf16 \
     --distribution_strategy fast_ddp \
     --trust_remote_code \
     --torch_compile_backend=hpu_backend \
@@ -265,6 +267,7 @@ python run_clip.py \
     --use_hpu_graphs_for_inference \
     --gaudi_config_name Habana/clip \
     --bf16 \
+    --sdp_on_bf16 \
     --mediapipe_dataloader \
     --trust_remote_code
 ```
diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md
index 654a9e02ad..304d787de6 100755
--- a/examples/question-answering/README.md
+++ b/examples/question-answering/README.md
@@ -292,6 +292,7 @@ python run_seq2seq_qa.py \
     --pad_to_max_length \
     --save_strategy epoch \
     --throughput_warmup_steps 3 \
+    --sdp_on_bf16 \
     --bf16
 ```

diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md
index 02e4b53d66..a99f5fb7d4 100644
--- a/examples/speech-recognition/README.md
+++ b/examples/speech-recognition/README.md
@@ -85,6 +85,7 @@ python run_speech_recognition_ctc.py \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
     --throughput_warmup_steps="3" \
+    --sdp_on_bf16 \
     --bf16 \
     --use_hpu_graphs_for_training \
     --use_hpu_graphs_for_inference
@@ -127,6 +128,7 @@ python ../gaudi_spawn.py \
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
     --bf16 \
+    --sdp_on_bf16 \
     --use_hpu_graphs_for_training \
     --use_hpu_graphs_for_inference
 ```
@@ -207,6 +209,7 @@ python run_speech_recognition_ctc.py \
     --use_habana \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
+    --sdp_on_bf16 \
     --bf16 \
     --use_hpu_graphs_for_inference
 ```
@@ -246,6 +249,7 @@ python run_speech_recognition_seq2seq.py \
     --max_duration_in_seconds="30" \
     --text_column_name="sentence" \
     --freeze_feature_encoder="False" \
+    --sdp_on_bf16 \
     --bf16 \
     --overwrite_output_dir \
     --do_train \
@@ -286,6 +290,7 @@ python ../gaudi_spawn.py \
     --max_duration_in_seconds="30" \
     --text_column_name="sentence" \
     --freeze_feature_encoder="False" \
+    --sdp_on_bf16 \
     --bf16 \
     --overwrite_output_dir \
     --do_train \
@@ -319,6 +324,7 @@ python run_speech_recognition_seq2seq.py \
     --max_duration_in_seconds="30" \
     --text_column_name="sentence" \
     --freeze_feature_encoder="False" \
+    --sdp_on_bf16 \
     --bf16 \
     --overwrite_output_dir \
     --do_eval \
diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md
index f4df474f09..ac2a792c22 100644
--- a/examples/stable-diffusion/README.md
+++ b/examples/stable-diffusion/README.md
@@ -44,6 +44,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -65,6 +66,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -83,6 +85,7 @@ python ../gaudi_spawn.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --distributed
 ```
@@ -107,6 +110,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion-2 \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -135,6 +139,7 @@ python text_to_image_generation.py \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion-2 \
     --ldm3d \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -180,6 +185,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -200,6 +206,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -221,6 +228,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -241,6 +249,7 @@ python ../gaudi_spawn.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --distributed
 ```
@@ -257,6 +266,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --optimize
 ```
@@ -273,6 +283,7 @@ QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --optimize
 ```
@@ -298,6 +309,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --num_inference_steps 1 \
     --guidance_scale 1.000001 \
@@ -339,6 +351,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -363,6 +376,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -389,6 +403,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -409,6 +424,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --quant_mode measure
 ```
@@ -428,6 +444,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --quant_mode quantize
 ```
@@ -451,6 +468,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -468,6 +486,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -486,6 +505,7 @@ python ../gaudi_spawn.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16 \
     --distributed
 ```
@@ -505,6 +525,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -524,6 +545,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion-2 \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -547,6 +569,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -566,6 +589,7 @@ python text_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -590,6 +614,7 @@ python image_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -615,6 +640,7 @@ python image_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -639,6 +665,7 @@ python image_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -660,6 +687,7 @@ python image_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -678,6 +706,7 @@ python image_to_image_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -693,6 +722,7 @@ python depth_to_image_generation.py \
     --image_save_dir /tmp/stable_diffusion_images \
     --use_habana \
     --use_hpu_graphs \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -709,6 +739,7 @@ python unconditional_image_generation.py \
     --use_habana \
     --use_gaudi_ddim_scheduler \
     --use_hpu_graphs \
+    --sdp_on_bf16 \
     --bf16 \
     --save_outputs \
     --output_dir "/tmp/"
@@ -753,6 +784,7 @@ python text_to_image_generation.py \
     --use_habana --use_hpu_graphs \
     --image_save_dir /tmp/stable_diffusion_images_compel \
     --seed 33 \
+    --sdp_on_bf16 \
     --bf16 \
     --num_inference_steps 20 \
     --use_compel
@@ -773,6 +805,7 @@ python text_to_image_generation.py \
     --image_save_dir /tmp/stable_diffusion_images_freeu \
     --seed 33 \
     --use_freeu \
+    --sdp_on_bf16 \
     --bf16
 ```
 # Stable Video Diffusion Examples
@@ -799,6 +832,7 @@ python image_to_video_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -823,6 +857,7 @@ python image_to_video_generation.py \
     --use_habana \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
+    --sdp_on_bf16 \
     --bf16
 ```

@@ -858,6 +893,7 @@ python image_to_video_generation.py \
     --use_hpu_graphs \
     --gaudi_config Habana/stable-diffusion \
     --bf16 \
+    --sdp_on_bf16 \
     --num_frames 14 \
     --motion_bucket_id=14 \
     --width=512 \
diff --git a/examples/stable-diffusion/depth_to_image_generation.py b/examples/stable-diffusion/depth_to_image_generation.py
index 570a39b2c3..e0c602ad99 100755
--- a/examples/stable-diffusion/depth_to_image_generation.py
+++ b/examples/stable-diffusion/depth_to_image_generation.py
@@ -172,6 +172,12 @@ def main():
         ),
     )
     parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        default=False,
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--throughput_warmup_steps",
         type=int,
@@ -223,6 +229,7 @@ def main():
         "use_habana": args.use_habana,
         "use_hpu_graphs": args.use_hpu_graphs,
         "gaudi_config": args.gaudi_config_name,
+        "sdp_on_bf16": args.sdp_on_bf16,
     }

     if args.bf16:
diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py
index a9f2f81930..92d7a56317 100755
--- a/examples/stable-diffusion/image_to_image_generation.py
+++ b/examples/stable-diffusion/image_to_image_generation.py
@@ -193,6 +193,12 @@ def main():
         ),
     )
     parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        default=False,
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--ldm3d", action="store_true", help="Use LDM3D to generate an image and a depth map from a given text prompt."
     )
@@ -318,6 +324,7 @@ def main():
             output_type=args.output_type,
             profiling_warmup_steps=args.profiling_warmup_steps,
             profiling_steps=args.profiling_steps,
+            sdp_on_bf16=args.sdp_on_bf16,
             **res,
         )
     elif flux:
diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py
index 4112a1b39c..7c305cb802 100755
--- a/examples/stable-diffusion/image_to_video_generation.py
+++ b/examples/stable-diffusion/image_to_video_generation.py
@@ -177,6 +177,12 @@ def main():
         ),
     )
     parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.")
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        default=False,
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument("--num_frames", type=int, default=25, help="The number of video frames to generate.")

     args = parser.parse_args()
@@ -218,6 +224,7 @@ def main():
         "use_habana": args.use_habana,
         "use_hpu_graphs": args.use_hpu_graphs,
         "gaudi_config": args.gaudi_config_name,
+        "sdp_on_bf16": args.sdp_on_bf16,
     }

     set_seed(args.seed)
diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
index c9d84ae1b9..56130746f5 100755
--- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py
+++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py
@@ -491,6 +491,12 @@ def parse_args(input_args=None):
         default=False,
         help=("Whether to use bf16 mixed precision."),
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        default=False,
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--local_rank",
         type=int,
@@ -1421,6 +1427,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
             use_habana=True,
             use_hpu_graphs=args.use_hpu_graphs_for_inference,
             gaudi_config=args.gaudi_config_name,
+            sdp_on_bf16=args.sdp_on_bf16,
         )
     else:
         # vae and text encoders are frozen, only need to update unet
diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py
index 36e35ff90f..82e7fec0bb 100755
--- a/examples/stable-diffusion/unconditional_image_generation.py
+++ b/examples/stable-diffusion/unconditional_image_generation.py
@@ -68,6 +68,12 @@ def main():
         action="store_true",
         help="Whether to use bf16 precision for classification.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        default=False,
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--save_outputs",
         action="store_true",
@@ -104,6 +110,7 @@ def main():
         "use_habana": args.use_habana,
         "use_hpu_graphs": args.use_hpu_graphs,
         "gaudi_config": gaudi_config,
+        "sdp_on_bf16": args.sdp_on_bf16,
     }

     kwargs_call = {"throughput_warmup_steps": args.throughput_warmup_steps}
diff --git a/examples/text-feature-extraction/README.md b/examples/text-feature-extraction/README.md
index 9c34ede54a..2b0d5354ef 100644
--- a/examples/text-feature-extraction/README.md
+++ b/examples/text-feature-extraction/README.md
@@ -28,6 +28,7 @@ python run_feature_extraction.py \
     "BERT is a common machine learning architecture for text-based applications." \
     "Alexander Hamilton is one of the founding fathers of the United States." \
     --use_hpu_graphs \
+    --sdp_on_bf16 \
     --bf16
 ```

diff --git a/examples/text-feature-extraction/run_feature_extraction.py b/examples/text-feature-extraction/run_feature_extraction.py
index 47320b1979..99cde63cf9 100644
--- a/examples/text-feature-extraction/run_feature_extraction.py
+++ b/examples/text-feature-extraction/run_feature_extraction.py
@@ -83,6 +83,11 @@ def parse_args():
         action="store_true",
         help="Whether to perform generation in bf16 precision.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--warmup",
         type=int,
@@ -100,6 +105,8 @@ def parse_args():

 def main():
     args = parse_args()
+    if args.sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
     model = AutoModel.from_pretrained(args.model_name_or_path).to("hpu")
diff --git a/examples/video-classification/run_example.py b/examples/video-classification/run_example.py
index b593fb5955..c2beeba619 100644
--- a/examples/video-classification/run_example.py
+++ b/examples/video-classification/run_example.py
@@ -80,7 +80,10 @@ def run(
     warm_up_epcohs: int,
     use_hpu_graphs: bool,
     cast_bf16: bool,
+    sdp_on_bf16: bool,
 ):
+    if sdp_on_bf16:
+        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
     processor = VideoMAEImageProcessor.from_pretrained(model_name)
     device = torch.device("hpu")
     model = VideoMAEForVideoClassification.from_pretrained(model_name)
@@ -152,6 +155,11 @@ def main():
         action="store_true",
         help="Whether to perform in bf16 precision.",
     )
+    parser.add_argument(
+        "--sdp_on_bf16",
+        action="store_true",
+        help="Allow PyTorch to use reduced precision in the SDPA math backend.",
+    )
     parser.add_argument(
         "--log_level",
         default=None,
@@ -176,6 +184,7 @@ def main():
         args.warm_up_epochs,
         args.use_hpu_graphs,
         args.bf16,
+        args.sdp_on_bf16,
     )
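For reference, below is a minimal standalone sketch (not part of the patch above) of what the new `--sdp_on_bf16` switch does. When the flag is set, the example scripts call `torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)`, either directly (as in `run_feature_extraction.py` and `run_example.py`) or through the pipelines' `sdp_on_bf16` keyword, which allows PyTorch's math SDPA backend to use reduced-precision accumulation for bf16 inputs. The CLI wiring and tensor shapes here are illustrative assumptions, not code from the repository.

```python
# Standalone sketch (not part of the patch): wire an --sdp_on_bf16 flag to the same
# PyTorch hook the example scripts use, then run one bf16 scaled-dot-product-attention call.
import argparse

import torch
import torch.nn.functional as F

parser = argparse.ArgumentParser()
parser.add_argument(
    "--sdp_on_bf16",
    action="store_true",
    help="Allow PyTorch to use reduced precision in the SDPA math backend.",
)
args = parser.parse_args()

if args.sdp_on_bf16:
    # Same private hook added to run_feature_extraction.py and run_example.py above
    # (available in recent PyTorch builds); by default the math backend reduces in fp32.
    torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)

# Illustrative bf16 attention inputs; the (batch, heads, seq_len, head_dim) shape is an assumption.
q = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)

out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 8, 128, 64])
```

Leaving the flag off keeps the default behavior, where the math backend performs these reductions in fp32, which is why the patch makes it an explicit opt-in per script.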