From 95e41a099f4446954d7e000e8908a95c7c037395 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:27:42 -0400 Subject: [PATCH 01/24] TPU pod launcher --- src/accelerate/commands/launch.py | 76 ++++++++++++++++++++++++++----- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 25 +++++++--- 3 files changed, 84 insertions(+), 19 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 415b0690663..01182db15d6 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -36,12 +36,13 @@ DistributedType, PrecisionType, PrepareForLaunch, - _filter_args, + filter_args, is_deepspeed_available, is_rich_available, is_sagemaker_available, is_torch_version, patch_environment, + prepare_tpu_environment, ) from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS from accelerate.utils.dataclasses import SageMakerDistributedType @@ -283,6 +284,26 @@ def launch_command_parser(subparsers=None): action="store_true", help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) + tpu_args.add_argument( + "--use_pod", + action="store_true", + help="Whether to use a GCP TPU pod for training.", + ) + tpu_args.add_argument( + "--vm", + type=str, + action="append", + help=( + "List of single Compute VM instance names. " + "If not provided we assume usage of instance groups. For TPU pods." + ), + ) + tpu_args.add_argument( + "--env", + type=str, + action="append", + help="List of environment variables to set on the Compute VM instances. For TPU pods.", + ) # DeepSpeed arguments deepspeed_args = parser.add_argument_group("DeepSpeed Arguments", "Arguments related to DeepSpeed.") @@ -619,7 +640,7 @@ def multi_gpu_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = _filter_args(args) + args = filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -736,7 +757,7 @@ def deepspeed_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = _filter_args(args) + args = filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -755,13 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - if args.mixed_precision == "bf16": - if args.downcast_bf16: - current_env["XLA_USE_BF16"] = "0" - current_env["XLA_DOWNCAST_BF16"] = "1" - else: - current_env["XLA_USE_BF16"] = "1" - current_env["XLA_DOWNCAST_BF16"] = "0" + current_env = prepare_tpu_environment(args, current_env) if args.module: mod_name = args.training_script @@ -786,6 +801,42 @@ def tpu_launcher(args): xmp.spawn(PrepareForLaunch(main_function), args=(), nprocs=args.num_processes) +def tpu_pod_launcher(args): + from torch_xla.distributed import xla_dist + + current_env = {} + current_env = prepare_tpu_environment(args, current_env, True) + + # XLA uses the arg `tpu` to determine the TPU name, which will get erased + if args.tpu_name: + tpu_name = args.tpu_name + debug = getattr(args, "debug", False) + + training_script = args.training_script + training_script_args = args.training_script_args + + args = filter_args(args, xla_dist.get_args_parser()) + args.tpu = tpu_name + args.positional = ["python3", training_script] + 
training_script_args + bad_flags = "" + for k, v in vars(args): + if k.startswith("docker_") and v != "": + bad_flags += f'{k}="{v}"\n' + if bad_flags != "": + raise ValueError( + f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" + ) + + with patch_environment(**current_env): + try: + xla_dist.resolve_and_execute(args) + except: + if is_rich_available() and debug: + console = get_console() + console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") + console.print_exception(suppress=[__file__], show_locals=False) + + def _convert_nargs_to_dict(nargs: List[str]) -> Dict[str, str]: if len(nargs) < 0: return {} @@ -1045,7 +1096,10 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - tpu_launcher(args) + if args.use_pod: + tpu_pod_launcher(args) + else: + tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) else: diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index a017a2ad562..070e112c229 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, _filter_args, get_launch_prefix +from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu_environment from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 8642a441aff..4d3bb9f3770 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -21,10 +21,6 @@ from .dataclasses import DistributedType -if is_torch_version(">=", "1.9.0"): - import torch.distributed.run as distrib_run - - def get_launch_prefix(): """ Grabs the correct launcher for starting a distributed command, such as either `torchrun`, `python -m @@ -39,12 +35,11 @@ def get_launch_prefix(): return cmd -def _filter_args(args): +def filter_args(args, parser): """ Filters out all `accelerate` specific args """ - distrib_args = distrib_run.get_args_parser() - new_args, _ = distrib_args.parse_known_args() + new_args, _ = parser.parse_known_args() for key, value in vars(args).items(): if key in vars(new_args).keys(): @@ -52,6 +47,22 @@ def _filter_args(args): return new_args +def prepare_tpu_environment(args, current_env, pod=False): + """ + Prepares and returns an environment with the correct TPU environment variables. + """ + current_env["XLA_USE_BF16"] = "0" + current_env["XLA_DOWNCAST_BF16"] = "0" + if args.mixed_precision == "bf16": + if args.downcast_bf16: + current_env["XLA_DOWNCAST_BF16"] = "1" + else: + current_env["XLA_USE_BF16"] = "1" + if pod: + current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" + return current_env + + def env_var_path_add(env_var_name, path_to_add): """ Extends a path-based environment variable's value with a new path and returns the updated value. 
It's up to the From b6d3f79c1c8e17b6070325c7412e2594e592933a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:48:17 -0400 Subject: [PATCH 02/24] Should be working now, just need final steps --- src/accelerate/commands/config/cluster.py | 107 ++++++++++-------- src/accelerate/commands/config/config_args.py | 3 + src/accelerate/commands/launch.py | 10 +- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 8 +- 5 files changed, 76 insertions(+), 54 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index acc4a182813..8d122bd3146 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -347,49 +347,6 @@ def get_cluster_input(): "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ", default="main", ) - use_cluster = _ask_field( - "Are you using a TPU cluster? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if use_cluster: - tpu_name = _ask_field( - "What is the name of your TPU cluster? ", - default=None, - error_message="Please enter the name of your TPU cluster.", - ) - tpu_zone = _ask_field( - "What is the zone of your TPU cluster? ", - default=None, - error_message="Please enter the zone of your TPU cluster.", - ) - run_commands = _ask_field( - "Do you have code you wish to run on startup in each pod? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if run_commands: - use_command_file = _ask_field( - "Is this code located in a bash script? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if use_command_file: - command_file = _ask_field( - "What is the path to your bash script? ", - default=None, - error_message="Please enter the path to your bash script.", - ) - command_file = os.path.abspath(command_file) - else: - commands = _ask_field( - "What commands do you wish to run on startup in each pod? ", - default=None, - error_message="Please enter the commands you wish to run on startup in each pod as a single string.", - ) else: main_training_function = "main" @@ -435,10 +392,65 @@ def get_cluster_input(): mixed_precision = "no" downcast_bf16 = "no" - if distributed_type == DistributedType.TPU and mixed_precision == "bf16": - downcast_bf16 = _ask_field( - "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no" + if distributed_type == DistributedType.TPU: + if mixed_precision == "bf16": + downcast_bf16 = _ask_field( + "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no" + ) + + use_cluster = _ask_field( + "Are you using a TPU cluster? [yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", ) + if use_cluster: + tpu_name = _ask_field( + "What is the name of your TPU cluster? ", + default=None, + error_message="Please enter the name of your TPU cluster.", + ) + tpu_zone = _ask_field( + "What is the zone of your TPU cluster? ", + default=None, + error_message="Please enter the zone of your TPU cluster.", + ) + run_commands = _ask_field( + "Do you have code you wish to run on startup in each pod? 
[yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", + ) + if run_commands: + use_command_file = _ask_field( + "Is this code located in a bash script? [yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", + ) + if use_command_file: + command_file = _ask_field( + "What is the path to your bash script? ", + default=None, + error_message="Please enter the path to your bash script.", + ) + command_file = os.path.abspath(command_file) + else: + commands = _ask_field( + "What commands do you wish to run on startup in each pod? ", + default=None, + error_message="Please enter the commands you wish to run on startup in each pod as a single string.", + ) + + tpu_vm = _ask_field( + "If not using an instance group, what are the names of the Compute VM instances to be used, seperated by a comma: ", + default="", + ).split(",") + + tpu_env = _ask_field( + "What environment variables do you wish to set in each pod, seperated by a comma: ", + default="", + ).split(",") return ClusterConfig( compute_environment=ComputeEnvironment.LOCAL_MACHINE, @@ -460,6 +472,9 @@ def get_cluster_input(): same_network=same_network, tpu_name=tpu_name, tpu_zone=tpu_zone, + tpu_vm=tpu_vm, + tpu_env=tpu_env, + tpu_cluster=use_cluster, commands=commands, command_file=command_file, ) diff --git a/src/accelerate/commands/config/config_args.py b/src/accelerate/commands/config/config_args.py index 9a1247c5550..e4c93a13b38 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -154,8 +154,11 @@ class ClusterConfig(BaseConfig): # args for TPU pods tpu_name: str = None tpu_zone: str = None + tpu_cluster: bool = False command_file: str = None command: List[str] = None + tpu_vm: List[str] = None + tpu_env: List[str] = None def __post_init__(self): if self.deepspeed_config is None: diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 01182db15d6..2df6badb14a 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -42,7 +42,7 @@ is_sagemaker_available, is_torch_version, patch_environment, - prepare_tpu_environment, + prepare_tpu, ) from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS from accelerate.utils.dataclasses import SageMakerDistributedType @@ -285,7 +285,7 @@ def launch_command_parser(subparsers=None): help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) tpu_args.add_argument( - "--use_pod", + "--use_cluster", action="store_true", help="Whether to use a GCP TPU pod for training.", ) @@ -776,7 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - current_env = prepare_tpu_environment(args, current_env) + current_env = prepare_tpu(args, current_env) if args.module: mod_name = args.training_script @@ -805,7 +805,7 @@ def tpu_pod_launcher(args): from torch_xla.distributed import xla_dist current_env = {} - current_env = prepare_tpu_environment(args, current_env, True) + current_env = prepare_tpu(args, current_env, True) # XLA uses the arg `tpu` to determine the TPU name, which will get erased if args.tpu_name: @@ -1096,7 +1096,7 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - if args.use_pod: + if args.use_cluster: tpu_pod_launcher(args) else: tpu_launcher(args) diff 
--git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 070e112c229..e10ae15f880 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu_environment +from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 4d3bb9f3770..eaca5b82296 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -47,7 +47,7 @@ def filter_args(args, parser): return new_args -def prepare_tpu_environment(args, current_env, pod=False): +def prepare_tpu(args, current_env, pod=False): """ Prepares and returns an environment with the correct TPU environment variables. """ @@ -60,7 +60,11 @@ def prepare_tpu_environment(args, current_env, pod=False): current_env["XLA_USE_BF16"] = "1" if pod: current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - return current_env + + # Take explicit args and set them up for XLA + args.vm = args.tpu_vm + args.tpu = args.tpu_name + return args, current_env def env_var_path_add(env_var_name, path_to_add): From 1c17bdcb78a748f83622a01a47020a9b0b4af50a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:56:20 -0400 Subject: [PATCH 03/24] Filter args --- src/accelerate/commands/launch.py | 8 ++++---- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2df6badb14a..e525a60592b 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -36,7 +36,7 @@ DistributedType, PrecisionType, PrepareForLaunch, - filter_args, + _filter_args, is_deepspeed_available, is_rich_available, is_sagemaker_available, @@ -640,7 +640,7 @@ def multi_gpu_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = filter_args(args, distrib_run.get_args_parser()) + args = _filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -757,7 +757,7 @@ def deepspeed_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = filter_args(args, distrib_run.get_args_parser()) + args = _filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -815,7 +815,7 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - args = filter_args(args, xla_dist.get_args_parser()) + args = _filter_args(args, xla_dist.get_args_parser()) args.tpu = tpu_name args.positional = ["python3", training_script] + training_script_args bad_flags = "" diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index e10ae15f880..5982065019e 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu +from .launch import PrepareForLaunch, _filter_args, get_launch_prefix, prepare_tpu from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py 
b/src/accelerate/utils/launch.py index eaca5b82296..d43d9f32deb 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -35,7 +35,7 @@ def get_launch_prefix(): return cmd -def filter_args(args, parser): +def _filter_args(args, parser): """ Filters out all `accelerate` specific args """ From adb2309ce7866462e922b54b70adee8941404331 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 14:01:07 -0400 Subject: [PATCH 04/24] Remove redundancy --- src/accelerate/commands/launch.py | 9 ++------- src/accelerate/utils/launch.py | 7 +++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index e525a60592b..e6c989201d5 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -776,7 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - current_env = prepare_tpu(args, current_env) + args, current_env = prepare_tpu(args, current_env) if args.module: mod_name = args.training_script @@ -805,18 +805,13 @@ def tpu_pod_launcher(args): from torch_xla.distributed import xla_dist current_env = {} - current_env = prepare_tpu(args, current_env, True) - - # XLA uses the arg `tpu` to determine the TPU name, which will get erased - if args.tpu_name: - tpu_name = args.tpu_name + args, current_env = prepare_tpu(args, current_env, True) debug = getattr(args, "debug", False) training_script = args.training_script training_script_args = args.training_script_args args = _filter_args(args, xla_dist.get_args_parser()) - args.tpu = tpu_name args.positional = ["python3", training_script] + training_script_args bad_flags = "" for k, v in vars(args): diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index d43d9f32deb..b0761c73ad1 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -60,10 +60,9 @@ def prepare_tpu(args, current_env, pod=False): current_env["XLA_USE_BF16"] = "1" if pod: current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - - # Take explicit args and set them up for XLA - args.vm = args.tpu_vm - args.tpu = args.tpu_name + # Take explicit args and set them up for XLA + args.vm = args.tpu_vm + args.tpu = args.tpu_name return args, current_env From 95527788c7fa47fc80408b877c3d93b533c68de9 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:21:56 -0500 Subject: [PATCH 05/24] Working WIP! 
--- src/accelerate/commands/launch.py | 21 ++++++++++++++------- src/accelerate/utils/launch.py | 4 ++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index e6c989201d5..db801d76e56 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -285,7 +285,7 @@ def launch_command_parser(subparsers=None): help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) tpu_args.add_argument( - "--use_cluster", + "--tpu_cluster", action="store_true", help="Whether to use a GCP TPU pod for training.", ) @@ -810,13 +810,16 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - - args = _filter_args(args, xla_dist.get_args_parser()) + args = _filter_args( + args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] + ) args.positional = ["python3", training_script] + training_script_args bad_flags = "" - for k, v in vars(args): - if k.startswith("docker_") and v != "": - bad_flags += f'{k}="{v}"\n' + for arg in vars(args): + if arg.startswith("docker_"): + value = getattr(args, arg) + if value != "" and value is not None: + bad_flags += f'{arg}="{value}"\n' if bad_flags != "": raise ValueError( f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" @@ -1001,6 +1004,7 @@ def launch_command(args): if ( not args.multi_gpu and not args.tpu + and not args.tpu_cluster and not args.use_deepspeed and not args.use_fsdp and not args.use_mps_device @@ -1009,6 +1013,7 @@ def launch_command(args): args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED args.multi_gpu = defaults.distributed_type == DistributedType.MULTI_GPU args.tpu = defaults.distributed_type == DistributedType.TPU + args.tpu_cluster = defaults.tpu_cluster and args.tpu args.use_fsdp = defaults.distributed_type == DistributedType.FSDP args.use_mps_device = defaults.distributed_type == DistributedType.MPS args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM @@ -1091,9 +1096,11 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - if args.use_cluster: + if args.tpu_cluster: + print("Calling pod launcher!") tpu_pod_launcher(args) else: + print("Calling tpu launcher!") tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index b0761c73ad1..a79b3b0decd 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -35,11 +35,11 @@ def get_launch_prefix(): return cmd -def _filter_args(args, parser): +def _filter_args(args, parser, default_args=[]): """ Filters out all `accelerate` specific args """ - new_args, _ = parser.parse_known_args() + new_args, _ = parser.parse_known_args(default_args) for key, value in vars(args).items(): if key in vars(new_args).keys(): From c694064da9c1f18ed141e720b0c661c2948c478e Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:43:29 -0500 Subject: [PATCH 06/24] Fix arg --- src/accelerate/commands/config/config_args.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/config/config_args.py 
b/src/accelerate/commands/config/config_args.py index e4c93a13b38..b1c95447f32 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -157,6 +157,7 @@ class ClusterConfig(BaseConfig): tpu_cluster: bool = False command_file: str = None command: List[str] = None + commands: List[str] = None tpu_vm: List[str] = None tpu_env: List[str] = None From 2d4b8374f89d315f6988cfc2964c4aa2eb84021a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:46:48 -0500 Subject: [PATCH 07/24] rm print --- src/accelerate/commands/launch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2df550f5b61..69b55874c78 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -1113,10 +1113,8 @@ def launch_command(args): multi_gpu_launcher(args) elif args.tpu and not args.cpu: if args.tpu_cluster: - print("Calling pod launcher!") tpu_pod_launcher(args) else: - print("Calling tpu launcher!") tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) From b07d05328fa1fa6c743fb76c25e51fd095e3b8cd Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 15:43:23 -0500 Subject: [PATCH 08/24] Try with no_tpu_cluster --- src/accelerate/commands/launch.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 69b55874c78..cbc2613c3a2 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -296,6 +296,12 @@ def launch_command_parser(subparsers=None): action="store_true", help="Whether to use a GCP TPU pod for training.", ) + tpu_args.add_argument( + "--no_tpu_cluster", + action="store_false", + dest="tpu_cluster", + help="Whether to use a GCP TPU pod for training.", + ) tpu_args.add_argument( "--vm", type=str, @@ -829,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From 81640873dce1aa4e9dd1aadc208332760d378cc1 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 15:53:52 -0500 Subject: [PATCH 09/24] Switch to python3, use different branch --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index cbc2613c3a2..509ffeaa2be 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args + args.positional = ["python3", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From ef3612558610283a16a294d27c8df9330f57c68a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 8 Nov 2022 12:16:57 -0500 Subject: [PATCH 10/24] Try with just this --- 
src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 509ffeaa2be..cbc2613c3a2 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From c98c71d2b3f70201372a11ae0d7350ba485851ba Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 8 Nov 2022 12:20:15 -0500 Subject: [PATCH 11/24] With python --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index cbc2613c3a2..509ffeaa2be 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args + args.positional = ["python3", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From 0f0567deede0176d7a037ba5d4ee01bac81034b0 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 16:34:56 -0500 Subject: [PATCH 12/24] It's working! --- src/accelerate/commands/launch.py | 36 ++++++++++++++++++++----------- src/accelerate/utils/launch.py | 4 +++- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 509ffeaa2be..5fcd064b400 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -832,29 +832,39 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - args = _filter_args( + new_args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + + new_args.positional = [ + "accelerate-launch", + "--tpu", + "--no_tpu_cluster", + "--num_processes", + str(args.num_processes), + training_script, + ] + training_script_args bad_flags = "" - for arg in vars(args): + for arg in vars(new_args): if arg.startswith("docker_"): - value = getattr(args, arg) + value = getattr(new_args, arg) if value != "" and value is not None: bad_flags += f'{arg}="{value}"\n' if bad_flags != "": raise ValueError( f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" ) - - with patch_environment(**current_env): - try: - xla_dist.resolve_and_execute(args) - except: - if is_rich_available() and debug: - console = get_console() - console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") - console.print_exception(suppress=[__file__], show_locals=False) + new_args.env = [f"{k}={v}" for k, v in current_env.items()] + new_args.env.append("ACCELERATE_IN_TPU_POD=1") + try: + xla_dist.resolve_and_execute(new_args) + 
except: + if is_rich_available() and debug: + console = get_console() + console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") + console.print_exception(suppress=[__file__], show_locals=False) + else: + raise def _convert_nargs_to_dict(nargs: List[str]) -> Dict[str, str]: diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index a79b3b0decd..992c62013b4 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -59,10 +59,12 @@ def prepare_tpu(args, current_env, pod=False): else: current_env["XLA_USE_BF16"] = "1" if pod: - current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" # Take explicit args and set them up for XLA args.vm = args.tpu_vm args.tpu = args.tpu_name + elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": + # `xla_dist` will take care of this on pods + current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" return args, current_env From e8b694bb7da74d9a67b5b9d31db2d30748c64c94 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 16:59:17 -0500 Subject: [PATCH 13/24] Fixed up CLI, needs a change before final merge and ci redo is in --- src/accelerate/commands/config/cluster.py | 20 ++++++++++++++------ src/accelerate/commands/config/config.py | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 6e7553032a8..0f4da2d97bd 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -375,12 +375,8 @@ def get_cluster_input(): else: main_training_function = "main" - if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU, DistributedType.TPU]: - machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") - if machine_type == "TPU": - machine_type += " cores" - else: - machine_type += "(s)" + if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU]: + machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" num_processes = _ask_field( f"How many {machine_type} should be used for distributed training? [1]:", lambda x: int(x), @@ -394,6 +390,13 @@ def get_cluster_input(): default=1, error_message="Please enter an integer.", ) + elif distributed_type == DistributedType.TPU: + num_processes = _ask_field( + "How many TPU core(s) should be used for distributed training (if using pods, on each pod)? 
[8]:", + lambda x: int(x), + default=8, + error_message="Please enter an integer.", + ) else: num_processes = 1 @@ -421,6 +424,11 @@ def get_cluster_input(): ) downcast_bf16 = "no" + tpu_vm = None + tpu_env = [] + tpu_name = None + use_cluster = False + if distributed_type == DistributedType.TPU: if mixed_precision == "bf16": downcast_bf16 = _ask_field( diff --git a/src/accelerate/commands/config/config.py b/src/accelerate/commands/config/config.py index b504f07adb4..a80506bc13f 100644 --- a/src/accelerate/commands/config/config.py +++ b/src/accelerate/commands/config/config.py @@ -76,7 +76,7 @@ def config_command_parser(subparsers=None): def config_command(args): config = get_user_input() - if args.config_file is not None: + if args.__dict__["config_args.config_file"] is not None: config_file = args.config_file else: if not os.path.isdir(cache_dir): From 60665b5b0b8d1d3dd33e0ba775fe8a05c0753ff9 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 17:12:38 -0500 Subject: [PATCH 14/24] Merge with main --- src/accelerate/commands/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/config/config.py b/src/accelerate/commands/config/config.py index d11a46fc455..72414f2abe6 100644 --- a/src/accelerate/commands/config/config.py +++ b/src/accelerate/commands/config/config.py @@ -65,7 +65,7 @@ def config_command_parser(subparsers=None): def config_command(args): config = get_user_input() - if args.__dict__["config_args.config_file"] is not None: + if args.config_file is not None: config_file = args.config_file else: if not os.path.isdir(cache_dir): From f3ace09c93c130f52f13d5c0f5092fa668eea9b5 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 17:24:50 -0500 Subject: [PATCH 15/24] Better doc --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index c5aadbf25a1..2f327a51811 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -306,7 +306,7 @@ def launch_command_parser(subparsers=None): "--no_tpu_cluster", action="store_false", dest="tpu_cluster", - help="Whether to use a GCP TPU pod for training.", + help="Should not be passed explicitly, this is for internal use only.", ) tpu_args.add_argument( "--vm", From 9623e0689c1ca92f8c2adb0097892162b4989101 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Thu, 17 Nov 2022 08:54:14 -0500 Subject: [PATCH 16/24] machine_type -> device_type --- src/accelerate/commands/config/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 0f4da2d97bd..bd7de97077b 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -376,9 +376,9 @@ def get_cluster_input(): main_training_function = "main" if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU]: - machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" + device_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" num_processes = _ask_field( - f"How many {machine_type} should be used for distributed training? [1]:", + f"How many {device_type} should be used for distributed training? 
[1]:", lambda x: int(x), default=1, error_message="Please enter an integer.", From 7d3066f4312f0a552aa6e5039c6e8c729c947c88 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Thu, 17 Nov 2022 08:57:07 -0500 Subject: [PATCH 17/24] Enable bf16 on TPUs through config --- src/accelerate/commands/config/cluster.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index bd7de97077b..b5c654e634d 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -416,7 +416,11 @@ def get_cluster_input(): _convert_mixed_precision, ) else: - mixed_precision = "no" + mixed_precision = _ask_options( + "Do you wish to use BF16 (mixed precision)?", + ["no", "bf16"], + _convert_mixed_precision, + ) if use_dynamo and mixed_precision == "no" and not use_cpu: print( @@ -426,7 +430,6 @@ def get_cluster_input(): downcast_bf16 = "no" tpu_vm = None tpu_env = [] - tpu_name = None use_cluster = False if distributed_type == DistributedType.TPU: From f5eb40c39a68cd031a8518c3e95c9016f62c39fa Mon Sep 17 00:00:00 2001 From: DESKTOP-RCJN1PO Date: Wed, 14 Dec 2022 14:28:21 -0500 Subject: [PATCH 18/24] Rm XRT_TPU_CONFIG for now --- src/accelerate/utils/launch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 992c62013b4..e888404ca6d 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -62,9 +62,9 @@ def prepare_tpu(args, current_env, pod=False): # Take explicit args and set them up for XLA args.vm = args.tpu_vm args.tpu = args.tpu_name - elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": - # `xla_dist` will take care of this on pods - current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" + # elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": + # # `xla_dist` will take care of this on pods + # current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" return args, current_env From ccb52913cb72c6a23922333005f59cd71d1a5da0 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 14:41:31 -0500 Subject: [PATCH 19/24] New version --- src/accelerate/commands/launch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2f327a51811..1af6c2300ad 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -861,6 +861,7 @@ def tpu_pod_launcher(args): ) new_args.positional = [ + "sudo", "accelerate-launch", "--tpu", "--no_tpu_cluster", From 6bf5cb8eb61e92b16e34d2c69faa428fca00498e Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 14:48:51 -0500 Subject: [PATCH 20/24] Add training function --- src/accelerate/commands/launch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 1af6c2300ad..62c55733b7d 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -867,6 +867,8 @@ def tpu_pod_launcher(args): "--no_tpu_cluster", "--num_processes", str(args.num_processes), + "--main_training_function", + str(args.main_training_function), training_script, ] + training_script_args bad_flags = "" From 679f636051b8071cbd6a9f7a76c8d246e21a99de Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 15:05:57 -0500 Subject: [PATCH 21/24] With sudo option --- src/accelerate/commands/config/cluster.py | 6 
++++++ src/accelerate/commands/config/config_args.py | 1 + src/accelerate/commands/launch.py | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index b5c654e634d..462a2b0ddd1 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -493,6 +493,11 @@ def get_cluster_input(): default=False, error_message="Please enter yes or no.", ) + tpu_use_sudo = _ask_field( + "To run a python script in your TPU environment should `sudo` be used? [yes/NO]: ", + default=None, + error_message="Please enter yes or no.", + ) tpu_vm = _ask_field( "If not using an instance group, what are the names of the Compute VM instances to be used, seperated by a comma: ", default="", @@ -523,6 +528,7 @@ def get_cluster_input(): same_network=same_network, tpu_name=tpu_name, tpu_zone=tpu_zone, + tpu_use_sudo=tpu_use_sudo, tpu_vm=tpu_vm, tpu_env=tpu_env, tpu_cluster=use_cluster, diff --git a/src/accelerate/commands/config/config_args.py b/src/accelerate/commands/config/config_args.py index 4d145083c19..d34a54a5d16 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -162,6 +162,7 @@ class ClusterConfig(BaseConfig): tpu_name: str = None tpu_zone: str = None tpu_cluster: bool = False + tpu_use_sudo: bool = False command_file: str = None commands: List[str] = None tpu_vm: List[str] = None diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 62c55733b7d..dc1554387ac 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -308,6 +308,11 @@ def launch_command_parser(subparsers=None): dest="tpu_cluster", help="Should not be passed explicitly, this is for internal use only.", ) + tpu_args.add_argument( + "--tpu_use_sudo", + action="store_true", + help="Whether to use sudo when running the TPU training script.", + ) tpu_args.add_argument( "--vm", type=str, @@ -860,8 +865,12 @@ def tpu_pod_launcher(args): args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - new_args.positional = [ - "sudo", + if args.tpu_use_sudo: + new_args = ["sudo"] + else: + new_args = [] + + new_args += [ "accelerate-launch", "--tpu", "--no_tpu_cluster", @@ -871,6 +880,8 @@ def tpu_pod_launcher(args): str(args.main_training_function), training_script, ] + training_script_args + + new_args.positional = new_args bad_flags = "" for arg in vars(new_args): if arg.startswith("docker_"): From d423b7b1287c430ab415986f3ee511a10ae2ca36 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 17:51:55 -0500 Subject: [PATCH 22/24] Push fix --- src/accelerate/commands/launch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index dc1554387ac..627f6ed9758 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -866,11 +866,11 @@ def tpu_pod_launcher(args): ) if args.tpu_use_sudo: - new_args = ["sudo"] + new_cmd = ["sudo"] else: - new_args = [] + new_cmd = [] - new_args += [ + new_cmd += [ "accelerate-launch", "--tpu", "--no_tpu_cluster", @@ -881,7 +881,7 @@ def tpu_pod_launcher(args): training_script, ] + training_script_args - new_args.positional = new_args + new_args.positional = new_cmd bad_flags = "" for arg in vars(new_args): if arg.startswith("docker_"): From 
7effca3665815c96e27ce7e923c70f5a8bb7b056 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 7 Feb 2023 10:06:09 -0500 Subject: [PATCH 23/24] Update with alpha option --- src/accelerate/commands/launch.py | 2 +- src/accelerate/commands/tpu.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 627f6ed9758..1df3ffe6cbd 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -284,7 +284,7 @@ def launch_command_parser(subparsers=None): help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.", ) - # tpu arguments + # TPU arguments tpu_args = parser.add_argument_group("TPU", "Arguments related to TPU.") tpu_args.add_argument( "--main_training_function", diff --git a/src/accelerate/commands/tpu.py b/src/accelerate/commands/tpu.py index 6b90770c750..0db53363ec4 100644 --- a/src/accelerate/commands/tpu.py +++ b/src/accelerate/commands/tpu.py @@ -51,6 +51,11 @@ def tpu_command_parser(subparsers=None): help="The zone of the TPU to use. If not specified, will use the zone specified in the config file.", ) pod_args = parser.add_argument_group("TPU Arguments", "Arguments for options ran inside the TPU.") + pod_args.add_argument( + "--use_alpha", + action="store_true", + help="Whether to use `gcloud alpha` when running the TPU training script instead of `gcloud`.", + ) pod_args.add_argument( "--command_file", default=None, From 8a0b5c419b5ff5bca7c18e62225ff17a8127f09e Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 8 Feb 2023 13:35:22 -0500 Subject: [PATCH 24/24] Add use_sudo --- src/accelerate/commands/config/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 462a2b0ddd1..8cb61b84651 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -431,6 +431,7 @@ def get_cluster_input(): tpu_vm = None tpu_env = [] use_cluster = False + tpu_use_sudo = False if distributed_type == DistributedType.TPU: if mixed_precision == "bf16":
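
The series above settles on a handful of small helpers; the sketches below restate them in isolation so they can be read and run outside the diff context. First, the reworked `_filter_args(args, parser, default_args)`: parse a baseline namespace out of the target launcher's own parser, seeded with any flags `accelerate launch` never defines (such as `--restart-tpuvm-pod-server`), then copy across only the attributes that parser recognises. The toy parser and values below are illustrative stand-ins for `xla_dist.get_args_parser()` and the real launch arguments.

import argparse

def _filter_args(args, parser, default_args=[]):
    """Filters out all `accelerate` specific args."""
    new_args, _ = parser.parse_known_args(default_args)
    for key, value in vars(args).items():
        if key in vars(new_args).keys():
            setattr(new_args, key, value)
    return new_args

# Illustrative stand-in for xla_dist.get_args_parser(); the real parser defines many more options.
xla_parser = argparse.ArgumentParser()
xla_parser.add_argument("--tpu", type=str)
xla_parser.add_argument("--positional", nargs="*", default=[])
xla_parser.add_argument("--restart-tpuvm-pod-server", action="store_true")

accelerate_args = argparse.Namespace(tpu="acme-pod", num_processes=8, debug=False)
xla_args = _filter_args(
    accelerate_args,
    xla_parser,
    ["--tpu", "placeholder", "--positional", "", "--restart-tpuvm-pod-server"],
)
print(xla_args.tpu)                        # "acme-pod": copied over from the accelerate args
print(hasattr(xla_args, "num_processes"))  # False: unknown to the XLA parser, so filtered out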
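
Next, the precision and pod handling that `prepare_tpu` ends up with by the end of the series: `XLA_USE_BF16` and `XLA_DOWNCAST_BF16` both default to "0" and at most one is switched on, and pod runs additionally mirror the accelerate-side `tpu_vm`/`tpu_name` onto the `vm`/`tpu` attributes that `xla_dist` expects. The function body follows the patched `utils/launch.py`; the driver loop is just a quick check of the three precision combinations.

from argparse import Namespace

def prepare_tpu(args, current_env, pod=False):
    # Both XLA precision flags start disabled; bf16 switches exactly one of them on.
    current_env["XLA_USE_BF16"] = "0"
    current_env["XLA_DOWNCAST_BF16"] = "0"
    if args.mixed_precision == "bf16":
        if args.downcast_bf16:
            # torch.float -> bfloat16, torch.double stays float32
            current_env["XLA_DOWNCAST_BF16"] = "1"
        else:
            # float and double tensors are both cast to bfloat16
            current_env["XLA_USE_BF16"] = "1"
    if pod:
        # Take explicit args and set them up for XLA
        args.vm = args.tpu_vm
        args.tpu = args.tpu_name
    return args, current_env

for mp, downcast in [("no", False), ("bf16", False), ("bf16", True)]:
    ns = Namespace(mixed_precision=mp, downcast_bf16=downcast,
                   tpu_vm=["vm-1"], tpu_name="acme-pod")
    _, env = prepare_tpu(ns, {}, pod=True)
    print(mp, downcast, env["XLA_USE_BF16"], env["XLA_DOWNCAST_BF16"])
# no   False -> 0 0
# bf16 False -> 1 0
# bf16 True  -> 0 1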
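
The command the pod launcher hands to `xla_dist` as its positional payload is easiest to see assembled in one place. `build_pod_command` below is a hypothetical helper name; in the patched `tpu_pod_launcher` the same list is built inline and assigned to `new_args.positional`, and `--no_tpu_cluster` exists only so the relaunched per-worker process takes the ordinary `tpu_launcher` path.

from argparse import Namespace

def build_pod_command(args):
    # sudo is opt-in via --tpu_use_sudo (or tpu_use_sudo in the saved config).
    cmd = ["sudo"] if args.tpu_use_sudo else []
    cmd += [
        "accelerate-launch",
        "--tpu",
        "--no_tpu_cluster",
        "--num_processes", str(args.num_processes),
        "--main_training_function", str(args.main_training_function),
        args.training_script,
    ] + args.training_script_args
    return cmd

args = Namespace(tpu_use_sudo=True, num_processes=8,
                 main_training_function="main",
                 training_script="train.py",
                 training_script_args=["--batch_size", "16"])
print(" ".join(build_pod_command(args)))
# sudo accelerate-launch --tpu --no_tpu_cluster --num_processes 8
#   --main_training_function main train.py --batch_size 16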
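
Two smaller pieces of `tpu_pod_launcher` round this out: any non-empty `docker_*` option surviving on the `xla_dist` namespace aborts the launch, and the prepared environment is forwarded to the pod workers as `KEY=VALUE` strings (plus an `ACCELERATE_IN_TPU_POD=1` marker) rather than being patched into the local process environment. The helper names and the namespace fields below are illustrative, not the exact Accelerate call sites.

from argparse import Namespace

def check_docker_flags(new_args):
    # Docker-based xla_dist runs are not supported by this launcher.
    bad_flags = ""
    for arg in vars(new_args):
        if arg.startswith("docker_"):
            value = getattr(new_args, arg)
            if value != "" and value is not None:
                bad_flags += f'{arg}="{value}"\n'
    if bad_flags != "":
        raise ValueError(
            "Docker containers are not supported for TPU pod launcher currently, "
            f"please remove the following flags:\n{bad_flags}"
        )

def forward_env(new_args, current_env):
    # xla_dist receives the environment as a list of KEY=VALUE strings.
    new_args.env = [f"{k}={v}" for k, v in current_env.items()]
    new_args.env.append("ACCELERATE_IN_TPU_POD=1")
    return new_args

ns = Namespace(docker_image="", docker_container=None, env=[])
check_docker_flags(ns)                      # passes: both docker_* values are empty
ns = forward_env(ns, {"XLA_USE_BF16": "1"})
print(ns.env)                               # ['XLA_USE_BF16=1', 'ACCELERATE_IN_TPU_POD=1']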
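
Finally, the configuration surface this series adds. A TPU-pod run carries roughly the following extra keys in the saved accelerate config; the field names come from the `ClusterConfig` additions above, while every value shown here is invented for illustration.

# Hypothetical excerpt of a saved accelerate config for a TPU pod run.
tpu_pod_config = {
    "distributed_type": "TPU",
    "num_processes": 8,                  # TPU cores per pod worker
    "mixed_precision": "bf16",
    "downcast_bf16": "no",
    "tpu_cluster": True,                 # routes `accelerate launch` to tpu_pod_launcher
    "tpu_use_sudo": False,               # prefix the per-worker command with sudo
    "tpu_name": "acme-pod",
    "tpu_zone": "us-central1-a",
    "tpu_vm": ["vm-1", "vm-2"],          # explicit Compute VM instances; empty -> instance group
    "tpu_env": ["XLA_IR_DEBUG=1"],       # extra environment variables for each pod worker
    "command_file": None,                # optional bash script run on startup in each pod
    "commands": None,                    # or inline startup commands
}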