From 95e41a099f4446954d7e000e8908a95c7c037395 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:27:42 -0400 Subject: [PATCH 01/24] TPU pod launcher --- src/accelerate/commands/launch.py | 76 ++++++++++++++++++++++++++----- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 25 +++++++--- 3 files changed, 84 insertions(+), 19 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 415b0690663..01182db15d6 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -36,12 +36,13 @@ DistributedType, PrecisionType, PrepareForLaunch, - _filter_args, + filter_args, is_deepspeed_available, is_rich_available, is_sagemaker_available, is_torch_version, patch_environment, + prepare_tpu_environment, ) from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS from accelerate.utils.dataclasses import SageMakerDistributedType @@ -283,6 +284,26 @@ def launch_command_parser(subparsers=None): action="store_true", help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) + tpu_args.add_argument( + "--use_pod", + action="store_true", + help="Whether to use a GCP TPU pod for training.", + ) + tpu_args.add_argument( + "--vm", + type=str, + action="append", + help=( + "List of single Compute VM instance names. " + "If not provided we assume usage of instance groups. For TPU pods." + ), + ) + tpu_args.add_argument( + "--env", + type=str, + action="append", + help="List of environment variables to set on the Compute VM instances. For TPU pods.", + ) # DeepSpeed arguments deepspeed_args = parser.add_argument_group("DeepSpeed Arguments", "Arguments related to DeepSpeed.") @@ -619,7 +640,7 @@ def multi_gpu_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = _filter_args(args) + args = filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -736,7 +757,7 @@ def deepspeed_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = _filter_args(args) + args = filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -755,13 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - if args.mixed_precision == "bf16": - if args.downcast_bf16: - current_env["XLA_USE_BF16"] = "0" - current_env["XLA_DOWNCAST_BF16"] = "1" - else: - current_env["XLA_USE_BF16"] = "1" - current_env["XLA_DOWNCAST_BF16"] = "0" + current_env = prepare_tpu_environment(args, current_env) if args.module: mod_name = args.training_script @@ -786,6 +801,42 @@ def tpu_launcher(args): xmp.spawn(PrepareForLaunch(main_function), args=(), nprocs=args.num_processes) +def tpu_pod_launcher(args): + from torch_xla.distributed import xla_dist + + current_env = {} + current_env = prepare_tpu_environment(args, current_env, True) + + # XLA uses the arg `tpu` to determine the TPU name, which will get erased + if args.tpu_name: + tpu_name = args.tpu_name + debug = getattr(args, "debug", False) + + training_script = args.training_script + training_script_args = args.training_script_args + + args = filter_args(args, xla_dist.get_args_parser()) + args.tpu = tpu_name + args.positional = ["python3", training_script] + 
training_script_args + bad_flags = "" + for k, v in vars(args): + if k.startswith("docker_") and v != "": + bad_flags += f'{k}="{v}"\n' + if bad_flags != "": + raise ValueError( + f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" + ) + + with patch_environment(**current_env): + try: + xla_dist.resolve_and_execute(args) + except: + if is_rich_available() and debug: + console = get_console() + console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") + console.print_exception(suppress=[__file__], show_locals=False) + + def _convert_nargs_to_dict(nargs: List[str]) -> Dict[str, str]: if len(nargs) < 0: return {} @@ -1045,7 +1096,10 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - tpu_launcher(args) + if args.use_pod: + tpu_pod_launcher(args) + else: + tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) else: diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index a017a2ad562..070e112c229 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, _filter_args, get_launch_prefix +from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu_environment from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 8642a441aff..4d3bb9f3770 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -21,10 +21,6 @@ from .dataclasses import DistributedType -if is_torch_version(">=", "1.9.0"): - import torch.distributed.run as distrib_run - - def get_launch_prefix(): """ Grabs the correct launcher for starting a distributed command, such as either `torchrun`, `python -m @@ -39,12 +35,11 @@ def get_launch_prefix(): return cmd -def _filter_args(args): +def filter_args(args, parser): """ Filters out all `accelerate` specific args """ - distrib_args = distrib_run.get_args_parser() - new_args, _ = distrib_args.parse_known_args() + new_args, _ = parser.parse_known_args() for key, value in vars(args).items(): if key in vars(new_args).keys(): @@ -52,6 +47,22 @@ def _filter_args(args): return new_args +def prepare_tpu_environment(args, current_env, pod=False): + """ + Prepares and returns an environment with the correct TPU environment variables. + """ + current_env["XLA_USE_BF16"] = "0" + current_env["XLA_DOWNCAST_BF16"] = "0" + if args.mixed_precision == "bf16": + if args.downcast_bf16: + current_env["XLA_DOWNCAST_BF16"] = "1" + else: + current_env["XLA_USE_BF16"] = "1" + if pod: + current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" + return current_env + + def env_var_path_add(env_var_name, path_to_add): """ Extends a path-based environment variable's value with a new path and returns the updated value. 
It's up to the From b6d3f79c1c8e17b6070325c7412e2594e592933a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:48:17 -0400 Subject: [PATCH 02/24] Should be working now, just need final steps --- src/accelerate/commands/config/cluster.py | 107 ++++++++++-------- src/accelerate/commands/config/config_args.py | 3 + src/accelerate/commands/launch.py | 10 +- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 8 +- 5 files changed, 76 insertions(+), 54 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index acc4a182813..8d122bd3146 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -347,49 +347,6 @@ def get_cluster_input(): "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ", default="main", ) - use_cluster = _ask_field( - "Are you using a TPU cluster? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if use_cluster: - tpu_name = _ask_field( - "What is the name of your TPU cluster? ", - default=None, - error_message="Please enter the name of your TPU cluster.", - ) - tpu_zone = _ask_field( - "What is the zone of your TPU cluster? ", - default=None, - error_message="Please enter the zone of your TPU cluster.", - ) - run_commands = _ask_field( - "Do you have code you wish to run on startup in each pod? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if run_commands: - use_command_file = _ask_field( - "Is this code located in a bash script? [yes/NO]: ", - _convert_yes_no_to_bool, - default=False, - error_message="Please enter yes or no.", - ) - if use_command_file: - command_file = _ask_field( - "What is the path to your bash script? ", - default=None, - error_message="Please enter the path to your bash script.", - ) - command_file = os.path.abspath(command_file) - else: - commands = _ask_field( - "What commands do you wish to run on startup in each pod? ", - default=None, - error_message="Please enter the commands you wish to run on startup in each pod as a single string.", - ) else: main_training_function = "main" @@ -435,10 +392,65 @@ def get_cluster_input(): mixed_precision = "no" downcast_bf16 = "no" - if distributed_type == DistributedType.TPU and mixed_precision == "bf16": - downcast_bf16 = _ask_field( - "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no" + if distributed_type == DistributedType.TPU: + if mixed_precision == "bf16": + downcast_bf16 = _ask_field( + "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs?", default="no" + ) + + use_cluster = _ask_field( + "Are you using a TPU cluster? [yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", ) + if use_cluster: + tpu_name = _ask_field( + "What is the name of your TPU cluster? ", + default=None, + error_message="Please enter the name of your TPU cluster.", + ) + tpu_zone = _ask_field( + "What is the zone of your TPU cluster? ", + default=None, + error_message="Please enter the zone of your TPU cluster.", + ) + run_commands = _ask_field( + "Do you have code you wish to run on startup in each pod? 
[yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", + ) + if run_commands: + use_command_file = _ask_field( + "Is this code located in a bash script? [yes/NO]: ", + _convert_yes_no_to_bool, + default=False, + error_message="Please enter yes or no.", + ) + if use_command_file: + command_file = _ask_field( + "What is the path to your bash script? ", + default=None, + error_message="Please enter the path to your bash script.", + ) + command_file = os.path.abspath(command_file) + else: + commands = _ask_field( + "What commands do you wish to run on startup in each pod? ", + default=None, + error_message="Please enter the commands you wish to run on startup in each pod as a single string.", + ) + + tpu_vm = _ask_field( + "If not using an instance group, what are the names of the Compute VM instances to be used, seperated by a comma: ", + default="", + ).split(",") + + tpu_env = _ask_field( + "What environment variables do you wish to set in each pod, seperated by a comma: ", + default="", + ).split(",") return ClusterConfig( compute_environment=ComputeEnvironment.LOCAL_MACHINE, @@ -460,6 +472,9 @@ def get_cluster_input(): same_network=same_network, tpu_name=tpu_name, tpu_zone=tpu_zone, + tpu_vm=tpu_vm, + tpu_env=tpu_env, + tpu_cluster=use_cluster, commands=commands, command_file=command_file, ) diff --git a/src/accelerate/commands/config/config_args.py b/src/accelerate/commands/config/config_args.py index 9a1247c5550..e4c93a13b38 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -154,8 +154,11 @@ class ClusterConfig(BaseConfig): # args for TPU pods tpu_name: str = None tpu_zone: str = None + tpu_cluster: bool = False command_file: str = None command: List[str] = None + tpu_vm: List[str] = None + tpu_env: List[str] = None def __post_init__(self): if self.deepspeed_config is None: diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 01182db15d6..2df6badb14a 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -42,7 +42,7 @@ is_sagemaker_available, is_torch_version, patch_environment, - prepare_tpu_environment, + prepare_tpu, ) from accelerate.utils.constants import DEEPSPEED_MULTINODE_LAUNCHERS from accelerate.utils.dataclasses import SageMakerDistributedType @@ -285,7 +285,7 @@ def launch_command_parser(subparsers=None): help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) tpu_args.add_argument( - "--use_pod", + "--use_cluster", action="store_true", help="Whether to use a GCP TPU pod for training.", ) @@ -776,7 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - current_env = prepare_tpu_environment(args, current_env) + current_env = prepare_tpu(args, current_env) if args.module: mod_name = args.training_script @@ -805,7 +805,7 @@ def tpu_pod_launcher(args): from torch_xla.distributed import xla_dist current_env = {} - current_env = prepare_tpu_environment(args, current_env, True) + current_env = prepare_tpu(args, current_env, True) # XLA uses the arg `tpu` to determine the TPU name, which will get erased if args.tpu_name: @@ -1096,7 +1096,7 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - if args.use_pod: + if args.use_cluster: tpu_pod_launcher(args) else: tpu_launcher(args) diff 
--git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index 070e112c229..e10ae15f880 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu_environment +from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 4d3bb9f3770..eaca5b82296 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -47,7 +47,7 @@ def filter_args(args, parser): return new_args -def prepare_tpu_environment(args, current_env, pod=False): +def prepare_tpu(args, current_env, pod=False): """ Prepares and returns an environment with the correct TPU environment variables. """ @@ -60,7 +60,11 @@ def prepare_tpu_environment(args, current_env, pod=False): current_env["XLA_USE_BF16"] = "1" if pod: current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - return current_env + + # Take explicit args and set them up for XLA + args.vm = args.tpu_vm + args.tpu = args.tpu_name + return args, current_env def env_var_path_add(env_var_name, path_to_add): From 1c17bdcb78a748f83622a01a47020a9b0b4af50a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 13:56:20 -0400 Subject: [PATCH 03/24] Filter args --- src/accelerate/commands/launch.py | 8 ++++---- src/accelerate/utils/__init__.py | 2 +- src/accelerate/utils/launch.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2df6badb14a..e525a60592b 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -36,7 +36,7 @@ DistributedType, PrecisionType, PrepareForLaunch, - filter_args, + _filter_args, is_deepspeed_available, is_rich_available, is_sagemaker_available, @@ -640,7 +640,7 @@ def multi_gpu_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = filter_args(args, distrib_run.get_args_parser()) + args = _filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -757,7 +757,7 @@ def deepspeed_launcher(args): raise NotImplementedError("Multi-node training requires pytorch>=1.9.0") debug = getattr(args, "debug", False) - args = filter_args(args, distrib_run.get_args_parser()) + args = _filter_args(args, distrib_run.get_args_parser()) with patch_environment(**current_env): try: distrib_run.run(args) @@ -815,7 +815,7 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - args = filter_args(args, xla_dist.get_args_parser()) + args = _filter_args(args, xla_dist.get_args_parser()) args.tpu = tpu_name args.positional = ["python3", training_script] + training_script_args bad_flags = "" diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py index e10ae15f880..5982065019e 100644 --- a/src/accelerate/utils/__init__.py +++ b/src/accelerate/utils/__init__.py @@ -97,7 +97,7 @@ HfDeepSpeedConfig, ) -from .launch import PrepareForLaunch, filter_args, get_launch_prefix, prepare_tpu +from .launch import PrepareForLaunch, _filter_args, get_launch_prefix, prepare_tpu from .megatron_lm import ( AbstractTrainStep, BertTrainStep, diff --git a/src/accelerate/utils/launch.py 
b/src/accelerate/utils/launch.py index eaca5b82296..d43d9f32deb 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -35,7 +35,7 @@ def get_launch_prefix(): return cmd -def filter_args(args, parser): +def _filter_args(args, parser): """ Filters out all `accelerate` specific args """ From adb2309ce7866462e922b54b70adee8941404331 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Wed, 2 Nov 2022 14:01:07 -0400 Subject: [PATCH 04/24] Remove redundancy --- src/accelerate/commands/launch.py | 9 ++------- src/accelerate/utils/launch.py | 7 +++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index e525a60592b..e6c989201d5 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -776,7 +776,7 @@ def tpu_launcher(args): if args.no_python: raise ValueError("--no_python cannot be used with TPU launcher") - current_env = prepare_tpu(args, current_env) + args, current_env = prepare_tpu(args, current_env) if args.module: mod_name = args.training_script @@ -805,18 +805,13 @@ def tpu_pod_launcher(args): from torch_xla.distributed import xla_dist current_env = {} - current_env = prepare_tpu(args, current_env, True) - - # XLA uses the arg `tpu` to determine the TPU name, which will get erased - if args.tpu_name: - tpu_name = args.tpu_name + args, current_env = prepare_tpu(args, current_env, True) debug = getattr(args, "debug", False) training_script = args.training_script training_script_args = args.training_script_args args = _filter_args(args, xla_dist.get_args_parser()) - args.tpu = tpu_name args.positional = ["python3", training_script] + training_script_args bad_flags = "" for k, v in vars(args): diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index d43d9f32deb..b0761c73ad1 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -60,10 +60,9 @@ def prepare_tpu(args, current_env, pod=False): current_env["XLA_USE_BF16"] = "1" if pod: current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - - # Take explicit args and set them up for XLA - args.vm = args.tpu_vm - args.tpu = args.tpu_name + # Take explicit args and set them up for XLA + args.vm = args.tpu_vm + args.tpu = args.tpu_name return args, current_env From 95527788c7fa47fc80408b877c3d93b533c68de9 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:21:56 -0500 Subject: [PATCH 05/24] Working WIP! 
--- src/accelerate/commands/launch.py | 21 ++++++++++++++------- src/accelerate/utils/launch.py | 4 ++-- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index e6c989201d5..db801d76e56 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -285,7 +285,7 @@ def launch_command_parser(subparsers=None): help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.", ) tpu_args.add_argument( - "--use_cluster", + "--tpu_cluster", action="store_true", help="Whether to use a GCP TPU pod for training.", ) @@ -810,13 +810,16 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - - args = _filter_args(args, xla_dist.get_args_parser()) + args = _filter_args( + args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] + ) args.positional = ["python3", training_script] + training_script_args bad_flags = "" - for k, v in vars(args): - if k.startswith("docker_") and v != "": - bad_flags += f'{k}="{v}"\n' + for arg in vars(args): + if arg.startswith("docker_"): + value = getattr(args, arg) + if value != "" and value is not None: + bad_flags += f'{arg}="{value}"\n' if bad_flags != "": raise ValueError( f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" @@ -1001,6 +1004,7 @@ def launch_command(args): if ( not args.multi_gpu and not args.tpu + and not args.tpu_cluster and not args.use_deepspeed and not args.use_fsdp and not args.use_mps_device @@ -1009,6 +1013,7 @@ def launch_command(args): args.use_deepspeed = defaults.distributed_type == DistributedType.DEEPSPEED args.multi_gpu = defaults.distributed_type == DistributedType.MULTI_GPU args.tpu = defaults.distributed_type == DistributedType.TPU + args.tpu_cluster = defaults.tpu_cluster and args.tpu args.use_fsdp = defaults.distributed_type == DistributedType.FSDP args.use_mps_device = defaults.distributed_type == DistributedType.MPS args.use_megatron_lm = defaults.distributed_type == DistributedType.MEGATRON_LM @@ -1091,9 +1096,11 @@ def launch_command(args): elif args.multi_gpu and not args.cpu: multi_gpu_launcher(args) elif args.tpu and not args.cpu: - if args.use_cluster: + if args.tpu_cluster: + print("Calling pod launcher!") tpu_pod_launcher(args) else: + print("Calling tpu launcher!") tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index b0761c73ad1..a79b3b0decd 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -35,11 +35,11 @@ def get_launch_prefix(): return cmd -def _filter_args(args, parser): +def _filter_args(args, parser, default_args=[]): """ Filters out all `accelerate` specific args """ - new_args, _ = parser.parse_known_args() + new_args, _ = parser.parse_known_args(default_args) for key, value in vars(args).items(): if key in vars(new_args).keys(): From c694064da9c1f18ed141e720b0c661c2948c478e Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:43:29 -0500 Subject: [PATCH 06/24] Fix arg --- src/accelerate/commands/config/config_args.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/config/config_args.py 
b/src/accelerate/commands/config/config_args.py index e4c93a13b38..b1c95447f32 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -157,6 +157,7 @@ class ClusterConfig(BaseConfig): tpu_cluster: bool = False command_file: str = None command: List[str] = None + commands: List[str] = None tpu_vm: List[str] = None tpu_env: List[str] = None From 2d4b8374f89d315f6988cfc2964c4aa2eb84021a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 14:46:48 -0500 Subject: [PATCH 07/24] rm print --- src/accelerate/commands/launch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2df550f5b61..69b55874c78 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -1113,10 +1113,8 @@ def launch_command(args): multi_gpu_launcher(args) elif args.tpu and not args.cpu: if args.tpu_cluster: - print("Calling pod launcher!") tpu_pod_launcher(args) else: - print("Calling tpu launcher!") tpu_launcher(args) elif defaults is not None and defaults.compute_environment == ComputeEnvironment.AMAZON_SAGEMAKER: sagemaker_launcher(defaults, args) From b07d05328fa1fa6c743fb76c25e51fd095e3b8cd Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 15:43:23 -0500 Subject: [PATCH 08/24] Try with no_tpu_cluster --- src/accelerate/commands/launch.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 69b55874c78..cbc2613c3a2 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -296,6 +296,12 @@ def launch_command_parser(subparsers=None): action="store_true", help="Whether to use a GCP TPU pod for training.", ) + tpu_args.add_argument( + "--no_tpu_cluster", + action="store_false", + dest="tpu_cluster", + help="Whether to use a GCP TPU pod for training.", + ) tpu_args.add_argument( "--vm", type=str, @@ -829,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From 81640873dce1aa4e9dd1aadc208332760d378cc1 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 7 Nov 2022 15:53:52 -0500 Subject: [PATCH 09/24] Switch to python3, use different branch --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index cbc2613c3a2..509ffeaa2be 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args + args.positional = ["python3", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From ef3612558610283a16a294d27c8df9330f57c68a Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 8 Nov 2022 12:16:57 -0500 Subject: [PATCH 10/24] Try with just this --- 
src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 509ffeaa2be..cbc2613c3a2 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From c98c71d2b3f70201372a11ae0d7350ba485851ba Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 8 Nov 2022 12:20:15 -0500 Subject: [PATCH 11/24] With python --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index cbc2613c3a2..509ffeaa2be 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -835,7 +835,7 @@ def tpu_pod_launcher(args): args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["accelerate", "launch", "--tpu", "--no_tpu_cluster", training_script] + training_script_args + args.positional = ["python3", training_script] + training_script_args bad_flags = "" for arg in vars(args): if arg.startswith("docker_"): From 0f0567deede0176d7a037ba5d4ee01bac81034b0 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 16:34:56 -0500 Subject: [PATCH 12/24] It's working! --- src/accelerate/commands/launch.py | 36 ++++++++++++++++++++----------- src/accelerate/utils/launch.py | 4 +++- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 509ffeaa2be..5fcd064b400 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -832,29 +832,39 @@ def tpu_pod_launcher(args): training_script = args.training_script training_script_args = args.training_script_args - args = _filter_args( + new_args = _filter_args( args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - args.positional = ["python3", training_script] + training_script_args + + new_args.positional = [ + "accelerate-launch", + "--tpu", + "--no_tpu_cluster", + "--num_processes", + str(args.num_processes), + training_script, + ] + training_script_args bad_flags = "" - for arg in vars(args): + for arg in vars(new_args): if arg.startswith("docker_"): - value = getattr(args, arg) + value = getattr(new_args, arg) if value != "" and value is not None: bad_flags += f'{arg}="{value}"\n' if bad_flags != "": raise ValueError( f"Docker containers are not supported for TPU pod launcher currently, please remove the following flags:\n{bad_flags}" ) - - with patch_environment(**current_env): - try: - xla_dist.resolve_and_execute(args) - except: - if is_rich_available() and debug: - console = get_console() - console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") - console.print_exception(suppress=[__file__], show_locals=False) + new_args.env = [f"{k}={v}" for k, v in current_env.items()] + new_args.env.append("ACCELERATE_IN_TPU_POD=1") + try: + xla_dist.resolve_and_execute(new_args) + 
except: + if is_rich_available() and debug: + console = get_console() + console.print("\n[bold red]Using --debug, `torch_xla.xla_dist` Stack Trace:[/bold red]") + console.print_exception(suppress=[__file__], show_locals=False) + else: + raise def _convert_nargs_to_dict(nargs: List[str]) -> Dict[str, str]: diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index a79b3b0decd..992c62013b4 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -59,10 +59,12 @@ def prepare_tpu(args, current_env, pod=False): else: current_env["XLA_USE_BF16"] = "1" if pod: - current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" # Take explicit args and set them up for XLA args.vm = args.tpu_vm args.tpu = args.tpu_name + elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": + # `xla_dist` will take care of this on pods + current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" return args, current_env From e8b694bb7da74d9a67b5b9d31db2d30748c64c94 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 16:59:17 -0500 Subject: [PATCH 13/24] Fixed up CLI, needs a change before final merge and ci redo is in --- src/accelerate/commands/config/cluster.py | 20 ++++++++++++++------ src/accelerate/commands/config/config.py | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 6e7553032a8..0f4da2d97bd 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -375,12 +375,8 @@ def get_cluster_input(): else: main_training_function = "main" - if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU, DistributedType.TPU]: - machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") - if machine_type == "TPU": - machine_type += " cores" - else: - machine_type += "(s)" + if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU]: + machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" num_processes = _ask_field( f"How many {machine_type} should be used for distributed training? [1]:", lambda x: int(x), @@ -394,6 +390,13 @@ def get_cluster_input(): default=1, error_message="Please enter an integer.", ) + elif distributed_type == DistributedType.TPU: + num_processes = _ask_field( + "How many TPU core(s) should be used for distributed training (if using pods, on each pod)? 
[8]:", + lambda x: int(x), + default=8, + error_message="Please enter an integer.", + ) else: num_processes = 1 @@ -421,6 +424,11 @@ def get_cluster_input(): ) downcast_bf16 = "no" + tpu_vm = None + tpu_env = [] + tpu_name = None + use_cluster = False + if distributed_type == DistributedType.TPU: if mixed_precision == "bf16": downcast_bf16 = _ask_field( diff --git a/src/accelerate/commands/config/config.py b/src/accelerate/commands/config/config.py index b504f07adb4..a80506bc13f 100644 --- a/src/accelerate/commands/config/config.py +++ b/src/accelerate/commands/config/config.py @@ -76,7 +76,7 @@ def config_command_parser(subparsers=None): def config_command(args): config = get_user_input() - if args.config_file is not None: + if args.__dict__["config_args.config_file"] is not None: config_file = args.config_file else: if not os.path.isdir(cache_dir): From 60665b5b0b8d1d3dd33e0ba775fe8a05c0753ff9 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 17:12:38 -0500 Subject: [PATCH 14/24] Merge with main --- src/accelerate/commands/config/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/config/config.py b/src/accelerate/commands/config/config.py index d11a46fc455..72414f2abe6 100644 --- a/src/accelerate/commands/config/config.py +++ b/src/accelerate/commands/config/config.py @@ -65,7 +65,7 @@ def config_command_parser(subparsers=None): def config_command(args): config = get_user_input() - if args.__dict__["config_args.config_file"] is not None: + if args.config_file is not None: config_file = args.config_file else: if not os.path.isdir(cache_dir): From f3ace09c93c130f52f13d5c0f5092fa668eea9b5 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Tue, 15 Nov 2022 17:24:50 -0500 Subject: [PATCH 15/24] Better doc --- src/accelerate/commands/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index c5aadbf25a1..2f327a51811 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -306,7 +306,7 @@ def launch_command_parser(subparsers=None): "--no_tpu_cluster", action="store_false", dest="tpu_cluster", - help="Whether to use a GCP TPU pod for training.", + help="Should not be passed explicitly, this is for internal use only.", ) tpu_args.add_argument( "--vm", From 9623e0689c1ca92f8c2adb0097892162b4989101 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Thu, 17 Nov 2022 08:54:14 -0500 Subject: [PATCH 16/24] machine_type -> device_type --- src/accelerate/commands/config/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 0f4da2d97bd..bd7de97077b 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -376,9 +376,9 @@ def get_cluster_input(): main_training_function = "main" if distributed_type in [DistributedType.MULTI_CPU, DistributedType.MULTI_GPU]: - machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" + device_type = str(distributed_type).split(".")[1].replace("MULTI_", "") + "(s)" num_processes = _ask_field( - f"How many {machine_type} should be used for distributed training? [1]:", + f"How many {device_type} should be used for distributed training? 
[1]:", lambda x: int(x), default=1, error_message="Please enter an integer.", From 7d3066f4312f0a552aa6e5039c6e8c729c947c88 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Thu, 17 Nov 2022 08:57:07 -0500 Subject: [PATCH 17/24] Enable bf16 on TPUs through config --- src/accelerate/commands/config/cluster.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index bd7de97077b..b5c654e634d 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -416,7 +416,11 @@ def get_cluster_input(): _convert_mixed_precision, ) else: - mixed_precision = "no" + mixed_precision = _ask_options( + "Do you wish to use BF16 (mixed precision)?", + ["no", "bf16"], + _convert_mixed_precision, + ) if use_dynamo and mixed_precision == "no" and not use_cpu: print( @@ -426,7 +430,6 @@ def get_cluster_input(): downcast_bf16 = "no" tpu_vm = None tpu_env = [] - tpu_name = None use_cluster = False if distributed_type == DistributedType.TPU: From f5eb40c39a68cd031a8518c3e95c9016f62c39fa Mon Sep 17 00:00:00 2001 From: DESKTOP-RCJN1PO Date: Wed, 14 Dec 2022 14:28:21 -0500 Subject: [PATCH 18/24] Rm XRT_TPU_CONFIG for now --- src/accelerate/utils/launch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accelerate/utils/launch.py b/src/accelerate/utils/launch.py index 992c62013b4..e888404ca6d 100644 --- a/src/accelerate/utils/launch.py +++ b/src/accelerate/utils/launch.py @@ -62,9 +62,9 @@ def prepare_tpu(args, current_env, pod=False): # Take explicit args and set them up for XLA args.vm = args.tpu_vm args.tpu = args.tpu_name - elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": - # `xla_dist` will take care of this on pods - current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" + # elif not os.environ.get("ACCELERATE_IN_TPU_POD", "0") == "1": + # # `xla_dist` will take care of this on pods + # current_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" return args, current_env From ccb52913cb72c6a23922333005f59cd71d1a5da0 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 14:41:31 -0500 Subject: [PATCH 19/24] New version --- src/accelerate/commands/launch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 2f327a51811..1af6c2300ad 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -861,6 +861,7 @@ def tpu_pod_launcher(args): ) new_args.positional = [ + "sudo", "accelerate-launch", "--tpu", "--no_tpu_cluster", From 6bf5cb8eb61e92b16e34d2c69faa428fca00498e Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 14:48:51 -0500 Subject: [PATCH 20/24] Add training function --- src/accelerate/commands/launch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 1af6c2300ad..62c55733b7d 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -867,6 +867,8 @@ def tpu_pod_launcher(args): "--no_tpu_cluster", "--num_processes", str(args.num_processes), + "--main_training_function", + str(args.main_training_function), training_script, ] + training_script_args bad_flags = "" From 679f636051b8071cbd6a9f7a76c8d246e21a99de Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 15:05:57 -0500 Subject: [PATCH 21/24] With sudo option --- src/accelerate/commands/config/cluster.py | 6 
++++++ src/accelerate/commands/config/config_args.py | 1 + src/accelerate/commands/launch.py | 15 +++++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index b5c654e634d..462a2b0ddd1 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -493,6 +493,11 @@ def get_cluster_input(): default=False, error_message="Please enter yes or no.", ) + tpu_use_sudo = _ask_field( + "To run a python script in your TPU environment should `sudo` be used? [yes/NO]: ", + default=None, + error_message="Please enter yes or no.", + ) tpu_vm = _ask_field( "If not using an instance group, what are the names of the Compute VM instances to be used, seperated by a comma: ", default="", @@ -523,6 +528,7 @@ def get_cluster_input(): same_network=same_network, tpu_name=tpu_name, tpu_zone=tpu_zone, + tpu_use_sudo=tpu_use_sudo, tpu_vm=tpu_vm, tpu_env=tpu_env, tpu_cluster=use_cluster, diff --git a/src/accelerate/commands/config/config_args.py b/src/accelerate/commands/config/config_args.py index 4d145083c19..d34a54a5d16 100644 --- a/src/accelerate/commands/config/config_args.py +++ b/src/accelerate/commands/config/config_args.py @@ -162,6 +162,7 @@ class ClusterConfig(BaseConfig): tpu_name: str = None tpu_zone: str = None tpu_cluster: bool = False + tpu_use_sudo: bool = False command_file: str = None commands: List[str] = None tpu_vm: List[str] = None diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 62c55733b7d..dc1554387ac 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -308,6 +308,11 @@ def launch_command_parser(subparsers=None): dest="tpu_cluster", help="Should not be passed explicitly, this is for internal use only.", ) + tpu_args.add_argument( + "--tpu_use_sudo", + action="store_true", + help="Whether to use sudo when running the TPU training script.", + ) tpu_args.add_argument( "--vm", type=str, @@ -860,8 +865,12 @@ def tpu_pod_launcher(args): args, xla_dist.get_args_parser(), ["--tpu", args.tpu_name, "--positional", "", "--restart-tpuvm-pod-server"] ) - new_args.positional = [ - "sudo", + if args.tpu_use_sudo: + new_args = ["sudo"] + else: + new_args = [] + + new_args += [ "accelerate-launch", "--tpu", "--no_tpu_cluster", @@ -871,6 +880,8 @@ def tpu_pod_launcher(args): str(args.main_training_function), training_script, ] + training_script_args + + new_args.positional = new_args bad_flags = "" for arg in vars(new_args): if arg.startswith("docker_"): From d423b7b1287c430ab415986f3ee511a10ae2ca36 Mon Sep 17 00:00:00 2001 From: muellerzr Date: Mon, 19 Dec 2022 17:51:55 -0500 Subject: [PATCH 22/24] Push fix --- src/accelerate/commands/launch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index dc1554387ac..627f6ed9758 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -866,11 +866,11 @@ def tpu_pod_launcher(args): ) if args.tpu_use_sudo: - new_args = ["sudo"] + new_cmd = ["sudo"] else: - new_args = [] + new_cmd = [] - new_args += [ + new_cmd += [ "accelerate-launch", "--tpu", "--no_tpu_cluster", @@ -881,7 +881,7 @@ def tpu_pod_launcher(args): training_script, ] + training_script_args - new_args.positional = new_args + new_args.positional = new_cmd bad_flags = "" for arg in vars(new_args): if arg.startswith("docker_"): From 
7effca3665815c96e27ce7e923c70f5a8bb7b056 Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 7 Feb 2023 10:06:09 -0500 Subject: [PATCH 23/24] Update with alpha option --- src/accelerate/commands/launch.py | 2 +- src/accelerate/commands/tpu.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/accelerate/commands/launch.py b/src/accelerate/commands/launch.py index 627f6ed9758..1df3ffe6cbd 100644 --- a/src/accelerate/commands/launch.py +++ b/src/accelerate/commands/launch.py @@ -284,7 +284,7 @@ def launch_command_parser(subparsers=None): help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.", ) - # tpu arguments + # TPU arguments tpu_args = parser.add_argument_group("TPU", "Arguments related to TPU.") tpu_args.add_argument( "--main_training_function", diff --git a/src/accelerate/commands/tpu.py b/src/accelerate/commands/tpu.py index 6b90770c750..0db53363ec4 100644 --- a/src/accelerate/commands/tpu.py +++ b/src/accelerate/commands/tpu.py @@ -51,6 +51,11 @@ def tpu_command_parser(subparsers=None): help="The zone of the TPU to use. If not specified, will use the zone specified in the config file.", ) pod_args = parser.add_argument_group("TPU Arguments", "Arguments for options ran inside the TPU.") + pod_args.add_argument( + "--use_alpha", + action="store_true", + help="Whether to use `gcloud alpha` when running the TPU training script instead of `gcloud`.", + ) pod_args.add_argument( "--command_file", default=None, From 8a0b5c419b5ff5bca7c18e62225ff17a8127f09e Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 8 Feb 2023 13:35:22 -0500 Subject: [PATCH 24/24] Add use_sudo --- src/accelerate/commands/config/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accelerate/commands/config/cluster.py b/src/accelerate/commands/config/cluster.py index 462a2b0ddd1..8cb61b84651 100644 --- a/src/accelerate/commands/config/cluster.py +++ b/src/accelerate/commands/config/cluster.py @@ -431,6 +431,7 @@ def get_cluster_input(): tpu_vm = None tpu_env = [] use_cluster = False + tpu_use_sudo = False if distributed_type == DistributedType.TPU: if mixed_precision == "bf16":
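
The series above settles on a handful of small helpers; the sketches below restate them in isolation so they can be read and run outside the diff context. First, the reworked `_filter_args(args, parser, default_args)`: parse a baseline namespace out of the target launcher's own parser, seeded with any flags `accelerate launch` never defines (such as `--restart-tpuvm-pod-server`), then copy across only the attributes that parser recognises. The toy parser and values below are illustrative stand-ins for `xla_dist.get_args_parser()` and the real launch arguments.

import argparse

def _filter_args(args, parser, default_args=[]):
    """Filters out all `accelerate` specific args."""
    new_args, _ = parser.parse_known_args(default_args)
    for key, value in vars(args).items():
        if key in vars(new_args).keys():
            setattr(new_args, key, value)
    return new_args

# Illustrative stand-in for xla_dist.get_args_parser(); the real parser defines many more options.
xla_parser = argparse.ArgumentParser()
xla_parser.add_argument("--tpu", type=str)
xla_parser.add_argument("--positional", nargs="*", default=[])
xla_parser.add_argument("--restart-tpuvm-pod-server", action="store_true")

accelerate_args = argparse.Namespace(tpu="acme-pod", num_processes=8, debug=False)
xla_args = _filter_args(
    accelerate_args,
    xla_parser,
    ["--tpu", "placeholder", "--positional", "", "--restart-tpuvm-pod-server"],
)
print(xla_args.tpu)                        # "acme-pod": copied over from the accelerate args
print(hasattr(xla_args, "num_processes"))  # False: unknown to the XLA parser, so filtered out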
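
Next, the precision and pod handling that `prepare_tpu` ends up with by the end of the series: `XLA_USE_BF16` and `XLA_DOWNCAST_BF16` both default to "0" and at most one is switched on, and pod runs additionally mirror the accelerate-side `tpu_vm`/`tpu_name` onto the `vm`/`tpu` attributes that `xla_dist` expects. The function body follows the patched `utils/launch.py`; the driver loop is just a quick check of the three precision combinations.

from argparse import Namespace

def prepare_tpu(args, current_env, pod=False):
    # Both XLA precision flags start disabled; bf16 switches exactly one of them on.
    current_env["XLA_USE_BF16"] = "0"
    current_env["XLA_DOWNCAST_BF16"] = "0"
    if args.mixed_precision == "bf16":
        if args.downcast_bf16:
            # torch.float -> bfloat16, torch.double stays float32
            current_env["XLA_DOWNCAST_BF16"] = "1"
        else:
            # float and double tensors are both cast to bfloat16
            current_env["XLA_USE_BF16"] = "1"
    if pod:
        # Take explicit args and set them up for XLA
        args.vm = args.tpu_vm
        args.tpu = args.tpu_name
    return args, current_env

for mp, downcast in [("no", False), ("bf16", False), ("bf16", True)]:
    ns = Namespace(mixed_precision=mp, downcast_bf16=downcast,
                   tpu_vm=["vm-1"], tpu_name="acme-pod")
    _, env = prepare_tpu(ns, {}, pod=True)
    print(mp, downcast, env["XLA_USE_BF16"], env["XLA_DOWNCAST_BF16"])
# no   False -> 0 0
# bf16 False -> 1 0
# bf16 True  -> 0 1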
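
The command the pod launcher hands to `xla_dist` as its positional payload is easiest to see assembled in one place. `build_pod_command` below is a hypothetical helper name; in the patched `tpu_pod_launcher` the same list is built inline and assigned to `new_args.positional`, and `--no_tpu_cluster` exists only so the relaunched per-worker process takes the ordinary `tpu_launcher` path.

from argparse import Namespace

def build_pod_command(args):
    # sudo is opt-in via --tpu_use_sudo (or tpu_use_sudo in the saved config).
    cmd = ["sudo"] if args.tpu_use_sudo else []
    cmd += [
        "accelerate-launch",
        "--tpu",
        "--no_tpu_cluster",
        "--num_processes", str(args.num_processes),
        "--main_training_function", str(args.main_training_function),
        args.training_script,
    ] + args.training_script_args
    return cmd

args = Namespace(tpu_use_sudo=True, num_processes=8,
                 main_training_function="main",
                 training_script="train.py",
                 training_script_args=["--batch_size", "16"])
print(" ".join(build_pod_command(args)))
# sudo accelerate-launch --tpu --no_tpu_cluster --num_processes 8
#   --main_training_function main train.py --batch_size 16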
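
Two smaller pieces of `tpu_pod_launcher` round this out: any non-empty `docker_*` option surviving on the `xla_dist` namespace aborts the launch, and the prepared environment is forwarded to the pod workers as `KEY=VALUE` strings (plus an `ACCELERATE_IN_TPU_POD=1` marker) rather than being patched into the local process environment. The helper names and the namespace fields below are illustrative, not the exact Accelerate call sites.

from argparse import Namespace

def check_docker_flags(new_args):
    # Docker-based xla_dist runs are not supported by this launcher.
    bad_flags = ""
    for arg in vars(new_args):
        if arg.startswith("docker_"):
            value = getattr(new_args, arg)
            if value != "" and value is not None:
                bad_flags += f'{arg}="{value}"\n'
    if bad_flags != "":
        raise ValueError(
            "Docker containers are not supported for TPU pod launcher currently, "
            f"please remove the following flags:\n{bad_flags}"
        )

def forward_env(new_args, current_env):
    # xla_dist receives the environment as a list of KEY=VALUE strings.
    new_args.env = [f"{k}={v}" for k, v in current_env.items()]
    new_args.env.append("ACCELERATE_IN_TPU_POD=1")
    return new_args

ns = Namespace(docker_image="", docker_container=None, env=[])
check_docker_flags(ns)                      # passes: both docker_* values are empty
ns = forward_env(ns, {"XLA_USE_BF16": "1"})
print(ns.env)                               # ['XLA_USE_BF16=1', 'ACCELERATE_IN_TPU_POD=1']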
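
Finally, the configuration surface this series adds. A TPU-pod run carries roughly the following extra keys in the saved accelerate config; the field names come from the `ClusterConfig` additions above, while every value shown here is invented for illustration.

# Hypothetical excerpt of a saved accelerate config for a TPU pod run.
tpu_pod_config = {
    "distributed_type": "TPU",
    "num_processes": 8,                  # TPU cores per pod worker
    "mixed_precision": "bf16",
    "downcast_bf16": "no",
    "tpu_cluster": True,                 # routes `accelerate launch` to tpu_pod_launcher
    "tpu_use_sudo": False,               # prefix the per-worker command with sudo
    "tpu_name": "acme-pod",
    "tpu_zone": "us-central1-a",
    "tpu_vm": ["vm-1", "vm-2"],          # explicit Compute VM instances; empty -> instance group
    "tpu_env": ["XLA_IR_DEBUG=1"],       # extra environment variables for each pod worker
    "command_file": None,                # optional bash script run on startup in each pod
    "commands": None,                    # or inline startup commands
}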