From 907019fdf6fd118fcf551be78c6d023e2e2ce45c Mon Sep 17 00:00:00 2001
From: Vivek
Date: Mon, 14 Oct 2024 08:23:50 +0300
Subject: [PATCH] [SW-0] Clean deprecated flags usage

Change-Id: I1c2e2460dc2072ba7b311f239441b304694918c8
---
 docs/source/package_reference/gaudi_config.mdx     |  4 ++--
 examples/language-modeling/README.md               | 10 +++++-----
 optimum/habana/transformers/gaudi_configuration.py |  4 ++--
 optimum/habana/transformers/trainer.py             |  2 +-
 tests/test_examples.py                             |  8 +++++---
 5 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/docs/source/package_reference/gaudi_config.mdx b/docs/source/package_reference/gaudi_config.mdx
index 1060e9c64e..a7b9f077b5 100644
--- a/docs/source/package_reference/gaudi_config.mdx
+++ b/docs/source/package_reference/gaudi_config.mdx
@@ -20,8 +20,8 @@ Here is a description of each configuration parameter:
 - `use_fused_adam` enables to decide whether to use the [custom fused implementation of the ADAM optimizer provided by Intel® Gaudi® AI Accelerator](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#custom-optimizers).
 - `use_fused_clip_norm` enables to decide whether to use the [custom fused implementation of gradient norm clipping provided by Intel® Gaudi® AI Accelerator](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Custom_Ops_PyTorch.html#other-custom-ops).
 - `use_torch_autocast` enables PyTorch autocast; used to define good pre-defined config; users should favor `--bf16` training argument
-- `autocast_bf16_ops` list of operations that should be run with bf16 precision under autocast context; using environment flag LOWER_LIST is a preffered way for operator autocast list override
-- `autocast_fp32_ops` list of operations that should be run with fp32 precision under autocast context; using environment flag FP32_LIST is a preffered way for operator autocast list override
+- `autocast_bf16_ops` list of operations that should be run with bf16 precision under autocast context; setting the environment variable PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST is the preferred way to override the operator autocast list
+- `autocast_fp32_ops` list of operations that should be run with fp32 precision under autocast context; setting the environment variable PT_HPU_AUTOCAST_FP32_OPS_LIST is the preferred way to override the operator autocast list
 
 You can find examples of Gaudi configurations in the [Habana model repository on the Hugging Face Hub](https://huggingface.co/habana).
 For instance, [for BERT Large we have](https://huggingface.co/Habana/bert-large-uncased-whole-word-masking/blob/main/gaudi_config.json):
diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index 8ea0cdd554..2bca4551ae 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -404,7 +404,7 @@ python3 run_lora_clm.py \
 ```
 - Single-card finetuning of Falcon-40B:
 ```bash
-LOWER_LIST=ops_bf16.txt python3 run_lora_clm.py \
+PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 run_lora_clm.py \
 --model_name_or_path tiiuae/falcon-40b \
 --dataset_name timdettmers/openassistant-guanaco \
 --bf16 True \
@@ -474,7 +474,7 @@ python ../gaudi_spawn.py \
 
 - Multi-card finetuning of Llama2-7B with FP8:
 ```bash
-LOWER_LIST=ops_bf16.txt python ../gaudi_spawn.py \
+PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python ../gaudi_spawn.py \
 --world_size 8 --use_mpi run_lora_clm.py \
 --model_name_or_path meta-llama/Llama-2-7b-hf \
 --dataset_name tatsu-lab/alpaca \
@@ -569,7 +569,7 @@ python ../gaudi_spawn.py \
 
 - Multi-card finetuning of Falcon-40B:
 ```bash
-LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
+PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
 --world_size 8 --use_mpi run_lora_clm.py \
 --model_name_or_path tiiuae/falcon-40b \
 --dataset_name timdettmers/openassistant-guanaco \
@@ -647,7 +647,7 @@ python3 ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lora_clm.py \
 
 - Multi-card finetuning of Llama2-70B with FSDP and LoRA:
 ```bash
-LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \
+PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \
 python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \
 --model_name_or_path meta-llama/Llama-2-70b-hf \
 --dataset_name tatsu-lab/alpaca \
@@ -690,7 +690,7 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \
 - Falcon-180B example command saves only the LoRA parameters at end
 - For inference we need to merge the pretrained model and LoRA weights
 ```bash
-DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
+PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
 --world_size 8 --use_deepspeed run_lora_clm.py \
 --model_name_or_path tiiuae/falcon-180B \
 --dataset_name timdettmers/openassistant-guanaco \
diff --git a/optimum/habana/transformers/gaudi_configuration.py b/optimum/habana/transformers/gaudi_configuration.py
index 76638d8e95..faeceb8be8 100644
--- a/optimum/habana/transformers/gaudi_configuration.py
+++ b/optimum/habana/transformers/gaudi_configuration.py
@@ -93,5 +93,5 @@ def declare_autocast_bf16_fp32_ops(self):
                 autocast_bf16_filename,
                 autocast_fp32_filename,
             )
-            os.environ["LOWER_LIST"] = autocast_bf16_filename
-            os.environ["FP32_LIST"] = autocast_fp32_filename
+            os.environ["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = autocast_bf16_filename
+            os.environ["PT_HPU_AUTOCAST_FP32_OPS_LIST"] = autocast_fp32_filename
diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py
index 843f646b14..27eab623bb 100644
--- a/optimum/habana/transformers/trainer.py
+++ b/optimum/habana/transformers/trainer.py
@@ -252,7 +252,7 @@ def __init__(
                     "The argument `--bf16` was not given but `use_torch_autocast` is True in the Gaudi configuration so mixed-precision training with Torch Autocast is enabled."
                 )
 
-            if self.use_hpu_amp and "LOWER_LIST" not in os.environ:
+            if self.use_hpu_amp and "PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST" not in os.environ:
                 self.gaudi_config.declare_autocast_bf16_fp32_ops()
 
             if self.args.use_lazy_mode:
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 4fd61d9b7f..980783cce5 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -439,7 +439,7 @@ def test(self):
         env_variables = os.environ.copy()
 
         if "falcon" in model_name:
-            env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")
+            env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")
         elif "flan" in model_name:
             env_variables["PT_HPU_MAX_COMPOUND_OP_SIZE"] = "512"
         elif "bloom" in model_name:
@@ -450,13 +450,15 @@ def test(self):
             env_variables["DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED"] = "1"
         elif fsdp:
             if "llama" in model_name:
-                env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")
+                env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(
+                    example_script.parent / "ops_bf16.txt"
+                )
                 env_variables["PT_HPU_LAZY_MODE"] = "0"
         elif deepspeed and "gpt-neox-20b" in model_name:
             env_variables["LD_PRELOAD"] = ""
 
         if fp8 and "llama" in model_name:
-            env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")
+            env_variables["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = str(example_script.parent / "ops_bf16.txt")
 
         extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
 
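Note for reviewers: the sketch below is not part of the patch; it just shows end to end what the two renamed variables do, mirroring what `GaudiConfig.declare_autocast_bf16_fp32_ops()` does after this change. The op names and file locations are made up for illustration; real lists come from the `autocast_bf16_ops`/`autocast_fp32_ops` fields of the Gaudi configuration or from `examples/language-modeling/ops_bf16.txt`.

```python
import os
import tempfile

# Hypothetical op lists, for illustration only.
BF16_OPS = ["add", "addmm", "bmm", "mm"]
FP32_OPS = ["div", "log_softmax", "softmax"]


def declare_autocast_lists(bf16_ops, fp32_ops, directory=None):
    """Write one op per line to text files and export the renamed flags,
    as GaudiConfig.declare_autocast_bf16_fp32_ops() does after this patch."""
    directory = directory or tempfile.mkdtemp()
    bf16_file = os.path.join(directory, "autocast_bf16_ops.txt")
    fp32_file = os.path.join(directory, "autocast_fp32_ops.txt")
    with open(bf16_file, "w") as f:
        f.write("\n".join(bf16_ops))
    with open(fp32_file, "w") as f:
        f.write("\n".join(fp32_ops))
    # The renamed flags replace the deprecated LOWER_LIST / FP32_LIST.
    os.environ["PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST"] = bf16_file
    os.environ["PT_HPU_AUTOCAST_FP32_OPS_LIST"] = fp32_file


declare_autocast_lists(BF16_OPS, FP32_OPS)
```

As the trainer.py hunk shows, the trainer only falls back to `declare_autocast_bf16_fp32_ops()` when `PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST` is not already set, so exporting the variable up front, as the README commands do, still takes precedence over the Gaudi configuration.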