support dist-tensorflow2 #7

Merged · merged 4 commits · Feb 8, 2023
1 change: 0 additions & 1 deletion training/benchmarks/driver/check.py
@@ -32,7 +32,6 @@ def check_config(config, model_pt_file):
raise ValueError(f"data_dir '{data_dir}' not exists.")
config.data_dir = data_dir


train_data = get_config_arg(config, "train_data")
if train_data is not None:
config.train_data = ospath.join(data_dir, train_data)
2 changes: 2 additions & 0 deletions training/benchmarks/driver/dist_pytorch.py
@@ -23,6 +23,7 @@
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as DDP


def generate_seeds(rng, size):
"""
Generate list of random seeds
@@ -203,6 +204,7 @@ def format_step(step):


class PyTorchDistributedDataParallel(DDP):

def named_parameters(self, prefix: str = '', recurse: bool = True):
return self.module.named_parameters(prefix=prefix, recurse=recurse)
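For context, DDP prefixes every parameter name with `module.`; the override above simply delegates to the wrapped module so callers see the original names. A minimal sketch of the effect, assuming a single-process gloo group and a toy linear model (both invented for illustration):

    import torch.distributed as dist
    import torch.nn as nn

    from driver.dist_pytorch import PyTorchDistributedDataParallel

    # Single-process group purely for illustration; real runs use the
    # launcher's env:// initialization.
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)

    model = nn.Linear(16, 4)
    ddp_model = PyTorchDistributedDataParallel(model)

    # Prints "weight" and "bias" instead of "module.weight" / "module.bias".
    for name, _ in ddp_model.named_parameters():
        print(name)

    dist.destroy_process_group()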

268 changes: 268 additions & 0 deletions training/benchmarks/driver/dist_tensorflow2.py
@@ -0,0 +1,268 @@
# Copyright © 2022 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
# Modified some functions to support FlagPerf.
#
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import tensorflow as tf


def generate_seeds(rng, size):
"""
Generate list of random seeds
:param rng: random number generator
:param size: length of the returned list
"""
seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)]
return seeds


def global_batch_size(config):
return config.train_dataset.batch_size * config.runtime.num_gpus # TODO get_world_size()


def format_step(step):
if isinstance(step, str):
return step
s = ""
if len(step) > 0:
s += "Training Epoch: {} ".format(step[0])
if len(step) > 1:
s += "Training Iteration: {} ".format(step[1])
if len(step) > 2:
s += "Validation Iteration: {} ".format(step[2])
return s


def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
"""Return a CrossDeviceOps based on all_reduce_alg and num_packs.
Args:
all_reduce_alg: a string specifying which cross device op to pick, or None.
num_packs: an integer specifying number of packs for the cross device op.
Returns:
tf.distribute.CrossDeviceOps object or None.
Raises:
ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"].
"""
if all_reduce_alg is None:
return None
mirrored_all_reduce_options = {
"nccl": tf.distribute.NcclAllReduce,
"hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce
}
if all_reduce_alg not in mirrored_all_reduce_options:
raise ValueError(
"When used with `mirrored`, valid values for all_reduce_alg are "
"[`nccl`, `hierarchical_copy`]. Supplied value: {}".format(
all_reduce_alg))
cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
return cross_device_ops_class(num_packs=num_packs)


def tpu_initialize(tpu_address):
"""Initializes TPU for TF 2.x training.
Args:
tpu_address: string, bns address of master TPU worker.
Returns:
A TPUClusterResolver.
"""
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
tpu=tpu_address)
if tpu_address not in ("", "local"):
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
return cluster_resolver


def configure_cluster(worker_hosts=None, task_index=-1):
"""Set multi-worker cluster spec in TF_CONFIG environment variable.
Args:
worker_hosts: comma-separated list of worker ip:port pairs.
task_index: index of the worker.
Returns:
Number of workers in the cluster.
"""
tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
if tf_config:
num_workers = (len(tf_config["cluster"].get("chief", [])) +
len(tf_config["cluster"].get("worker", [])))
elif worker_hosts:
workers = worker_hosts.split(",")
num_workers = len(workers)
if num_workers > 1 and task_index < 0:
raise ValueError(
"Must specify task_index when number of workers > 1")
task_index = 0 if num_workers == 1 else task_index
os.environ["TF_CONFIG"] = json.dumps({
"cluster": {
"worker": workers
},
"task": {
"type": "worker",
"index": task_index
}
})
else:
num_workers = 1
return num_workers


def _collective_communication(all_reduce_alg):
"""Return a CollectiveCommunication based on all_reduce_alg.
Args:
all_reduce_alg: a string specifying which collective communication to pick,
or None.
Returns:
tf.distribute.experimental.CollectiveCommunication object
Raises:
ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
"""
collective_communication_options = {
None: tf.distribute.experimental.CollectiveCommunication.AUTO,
"ring": tf.distribute.experimental.CollectiveCommunication.RING,
"nccl": tf.distribute.experimental.CollectiveCommunication.NCCL
}
if all_reduce_alg not in collective_communication_options:
raise ValueError(
"When used with `multi_worker_mirrored`, valid values for "
"all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format(
all_reduce_alg))
return collective_communication_options[all_reduce_alg]


def get_distribution_strategy(distribution_strategy="mirrored",
num_gpus=0,
all_reduce_alg=None,
num_packs=1,
tpu_address=None,
**kwargs):
"""Return a Strategy for running the model.
Args:
distribution_strategy: a string specifying which distribution strategy to
use. Accepted values are "off", "one_device", "mirrored",
"parameter_server", "multi_worker_mirrored", and "tpu" -- case
insensitive. "tpu" means to use TPUStrategy using `tpu_address`.
"off" means to use the default strategy which is obtained from
tf.distribute.get_strategy (for details on the default strategy, see
https://www.tensorflow.org/guide/distributed_training#default_strategy).
num_gpus: Number of GPUs to run this model.
all_reduce_alg: Optional. Specifies which algorithm to use when performing
all-reduce. For `MirroredStrategy`, valid values are "nccl" and
"hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
"ring" and "nccl". If None, DistributionStrategy will choose based on
device topology.
num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce`
or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
tpu_address: Optional. String that represents TPU to connect to. Must not be
None if `distribution_strategy` is set to `tpu`.
**kwargs: Additional kwargs for internal usages.
Returns:
tf.distribute.Strategy object.
Raises:
ValueError: if `distribution_strategy` is "off" or "one_device" and
`num_gpus` is larger than 1; or `num_gpus` is negative or if
`distribution_strategy` is `tpu` but `tpu_address` is not specified.
"""
del kwargs
if num_gpus < 0:
raise ValueError("`num_gpus` can not be negative.")

if not isinstance(distribution_strategy, str):
msg = ("distribution_strategy must be a string but got: %s." %
(distribution_strategy, ))
if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison
msg += (
" If you meant to pass the string 'off', make sure you add "
"quotes around 'off' so that yaml interprets it as a string "
"instead of a bool.")
raise ValueError(msg)

distribution_strategy = distribution_strategy.lower()
if distribution_strategy == "off":
if num_gpus > 1:
raise ValueError(
f"When {num_gpus} GPUs are specified, "
"distribution_strategy flag cannot be set to `off`.")
# Return the default distribution strategy.
return tf.distribute.get_strategy()

if distribution_strategy == "tpu":
# When tpu_address is an empty string, we communicate with local TPUs.
cluster_resolver = tpu_initialize(tpu_address)
return tf.distribute.TPUStrategy(cluster_resolver)

if distribution_strategy == "multi_worker_mirrored":
return tf.distribute.experimental.MultiWorkerMirroredStrategy(
communication=_collective_communication(all_reduce_alg))

if distribution_strategy == "one_device":
if num_gpus == 0:
return tf.distribute.OneDeviceStrategy("device:CPU:0")
if num_gpus > 1:
raise ValueError(
"`OneDeviceStrategy` can not be used for more than "
"one device.")
return tf.distribute.OneDeviceStrategy("device:GPU:0")

if distribution_strategy == "mirrored":
if num_gpus == 0:
devices = ["device:CPU:0"]
else:
devices = ["device:GPU:%d" % i for i in range(num_gpus)]
return tf.distribute.MirroredStrategy(
devices=devices,
cross_device_ops=_mirrored_cross_device_ops(
all_reduce_alg, num_packs))

if distribution_strategy == "parameter_server":
cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver(
)
return tf.distribute.experimental.ParameterServerStrategy(
cluster_resolver)

raise ValueError("Unrecognized Distribution Strategy: %r" %
distribution_strategy)


def get_strategy_scope(strategy):
if strategy:
strategy_scope = strategy.scope()
else:
strategy_scope = DummyContextManager()

return strategy_scope


class DummyContextManager(object):

def __enter__(self):
pass

def __exit__(self, *args):
pass
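To make the new module concrete, here is a minimal usage sketch, assuming a hypothetical two-worker setup (the host addresses and the Keras model are illustrative, not part of this PR):

    import tensorflow as tf

    from driver import dist_tensorflow2

    # configure_cluster writes TF_CONFIG for this process and returns the
    # number of workers in the cluster.
    num_workers = dist_tensorflow2.configure_cluster(
        worker_hosts="10.0.0.1:2222,10.0.0.2:2222", task_index=0)

    strategy = dist_tensorflow2.get_distribution_strategy(
        distribution_strategy="multi_worker_mirrored", all_reduce_alg="nccl")

    # Variables created under the scope are replicated by the strategy;
    # get_strategy_scope falls back to a no-op context when strategy is None.
    with dist_tensorflow2.get_strategy_scope(strategy):
        model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
        model.compile(optimizer="adam", loss="mse")

A single-machine run would instead pass distribution_strategy="mirrored" with num_gpus set, which picks its cross-device op via _mirrored_cross_device_ops above.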
1 change: 1 addition & 0 deletions training/benchmarks/glm/pytorch/dataloaders/dataloader.py
@@ -10,6 +10,7 @@
from torch.utils.data.dataloader import default_collate
import h5sparse
from scipy.sparse import csr_matrix

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
from driver import dist_pytorch
3 changes: 2 additions & 1 deletion training/benchmarks/glm/pytorch/run_pretraining.py
@@ -40,7 +40,8 @@ def main():

dist_pytorch.init_dist_training_env(config)

check.check_config(config, "blocklm-large-blank/200000/mp_rank_00_model_states.pt")
check.check_config(
config, "blocklm-large-blank/200000/mp_rank_00_model_states.pt")

dist_pytorch.barrier()
glm_driver.event(Event.INIT_START)
@@ -20,7 +20,8 @@ def parse_args():
args such as --data_dir_xxx. Then pass all useful args to the real
training script.
'''
parser = ArgumentParser(description="Start tensorflow2 training processes. ")
parser = ArgumentParser(
description="Start tensorflow2 training processes. ")
parser.add_argument("--node_rank",
type=int,
default=0,
26 changes: 21 additions & 5 deletions training/utils/gen_dummy_benchmark.py
@@ -7,7 +7,6 @@
import sys
from argparse import ArgumentParser


CURR_PATH = os.path.abspath(os.path.dirname(__file__))


@@ -26,6 +25,7 @@ class DummyBenchmark():
for environ in current_env.keys():
print(environ + ":" + current_env[environ])
'''

def __init__(self, vendor, framework, config_file, data_dir):
self.vendor = vendor
self.framework = framework
@@ -90,11 +90,26 @@ def print_dummy_test_conf(self):
def _parse_args():
'''Get command args from input. '''
parser = ArgumentParser(description="Generate a dummy benchmark case.")
parser.add_argument("-v", type=str, metavar='[vendor]', required=True, help="Vendor name")
parser.add_argument("-f", type=str, metavar='[framework]', required=True, help="Framework")
parser.add_argument("-c", type=str, metavar='[config]', required=True,
parser.add_argument("-v",
type=str,
metavar='[vendor]',
required=True,
help="Vendor name")
parser.add_argument("-f",
type=str,
metavar='[framework]',
required=True,
help="Framework")
parser.add_argument("-c",
type=str,
metavar='[config]',
required=True,
help="Config file name, e.g. config_A100_1x8.")
parser.add_argument("-d", type=str, metavar='[data dir]', required=True, help="Dummy data dir")
parser.add_argument("-d",
type=str,
metavar='[data dir]',
required=True,
help="Dummy data dir")
args, _ = parser.parse_known_args()
return args

@@ -106,5 +121,6 @@ def main():
dummy_benchmark.add_to_perf()
dummy_benchmark.print_dummy_test_conf()


if __name__ == '__main__':
main()
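Given the flags defined above, a hypothetical invocation might look like this (the vendor and data dir values are invented; config_A100_1x8 comes from the help text):

    python3 training/utils/gen_dummy_benchmark.py -v nvidia -f tensorflow2 -c config_A100_1x8 -d /tmp/dummy_data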
4 changes: 3 additions & 1 deletion training/utils/image_manager.py
@@ -23,7 +23,9 @@ def _parse_args():
-d [directory] Directory contains dockerfile and install script
-f [framework] AI framework '''

parser = argparse.ArgumentParser(description='Docker management script', formatter_class=argparse.RawTextHelpFormatter)
parser = argparse.ArgumentParser(
description='Docker management script',
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-o',
type=str,
metavar='[operation]',