From 82fd8cdaa522d2254640e69a20a226a88e25d8b3 Mon Sep 17 00:00:00 2001
From: Amanpreet Singh
Date: Sat, 18 Apr 2020 00:31:47 -0700
Subject: [PATCH] [fix] Seeding fixes, seed random, generate strong seed (#76)

Summary:
Pull Request resolved: https://github.com/fairinternal/pythia-internal/pull/76

Reviewed By: vedanuj

Differential Revision: D21105265

Pulled By: apsdehal

fbshipit-source-id: 4f991a81a9614bf4ff2536fe7aff6dac7e2b341a
---
 mmf/configs/defaults.yaml |  4 +++-
 mmf/utils/distributed.py  |  4 ++--
 mmf/utils/env.py          | 22 ++++++++++++++++++++++
 tools/run.py              | 29 +++++++++--------------------
 4 files changed, 36 insertions(+), 23 deletions(-)
 create mode 100644 mmf/utils/env.py

diff --git a/mmf/configs/defaults.yaml b/mmf/configs/defaults.yaml
index 71806bfc3..36d7307d3 100644
--- a/mmf/configs/defaults.yaml
+++ b/mmf/configs/defaults.yaml
@@ -4,7 +4,9 @@ training:
   trainer: 'base_trainer'
   # Seed to be used for training. -1 means random seed between 1 and 100000.
   # Either pass fixed through your config or command line arguments
-  seed: null
+  # Pass null to the seed if you don't want it seeded anyhow and
+  # want to leave it to default
+  seed: -1
   # Name of the experiment, will be used while saving checkpoints
   # and generating reports
   experiment_name: run
diff --git a/mmf/utils/distributed.py b/mmf/utils/distributed.py
index d1b233cd0..2122c2c63 100644
--- a/mmf/utils/distributed.py
+++ b/mmf/utils/distributed.py
@@ -175,7 +175,7 @@ def distributed_init(config):
         warnings.warn("Distributed is already initialized, cannot initialize twice!")
     else:
         print(
-            "| distributed init (rank {}): {}".format(
+            "Distributed Init (Rank {}): {}".format(
                 config.distributed.rank, config.distributed.init_method
             ),
             flush=True,
@@ -187,7 +187,7 @@ def distributed_init(config):
             rank=config.distributed.rank,
         )
         print(
-            "| initialized host {} as rank {}".format(
+            "Initialized Host {} as Rank {}".format(
                 socket.gethostname(), config.distributed.rank
             ),
             flush=True,
diff --git a/mmf/utils/env.py b/mmf/utils/env.py
new file mode 100644
index 000000000..82f797a5e
--- /dev/null
+++ b/mmf/utils/env.py
@@ -0,0 +1,22 @@
+import os
+import random
+from datetime import datetime
+
+import numpy as np
+import torch
+
+
+def set_seed(seed):
+    if seed:
+        if seed == -1:
+            # From detectron2
+            seed = (
+                os.getpid()
+                + int(datetime.now().strftime("%S%f"))
+                + int.from_bytes(os.urandom(2), "big")
+            )
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        random.seed(seed)
+
+    return seed
diff --git a/tools/run.py b/tools/run.py
index c9b1e6ef1..d345fc324 100644
--- a/tools/run.py
+++ b/tools/run.py
@@ -8,6 +8,7 @@
 from mmf.utils.build import build_trainer
 from mmf.utils.configuration import Configuration
 from mmf.utils.distributed import distributed_init, infer_init_method
+from mmf.utils.env import set_seed
 from mmf.utils.flags import flags
 from mmf.utils.general import setup_imports
 
@@ -15,19 +16,18 @@
 def main(configuration, init_distributed=False):
     setup_imports()
     config = configuration.get_config()
+
     if torch.cuda.is_available():
         torch.cuda.set_device(config.device_id)
-    if config.seed:
-        if config.seed == -1:
-            config.seed = random.randint(10000, 20000)
-        np.random.seed(config.seed)
-        torch.manual_seed(config.seed)
-        # TODO: Re-enable after project
-        # random.seed(config.seed)
-    # torch.backends.cudnn.benchmark = False
-    # torch.backends.cudnn.deterministic = True
+        torch.cuda.init()
+
     if init_distributed:
         distributed_init(config)
+
+    config.training.seed = set_seed(config.training.seed)
+    registry.register("seed", config.training.seed)
+    print("Using seed {}".format(config.training.seed))
+
     trainer = build_trainer(configuration)
     trainer.load()
     trainer.train()
@@ -80,17 +80,6 @@ def run():
     else:
         config.device_id = 0
         main(configuration)
-    # Log any errors that occur to log file
-    # try:
-    #     trainer.load()
-    #     trainer.train()
-    # except Exception as e:
-    #     writer = getattr(trainer, "writer", None)
-
-    #     if writer is not None:
-    #         writer.write(e, "error", donot_print=True)
-    #     if is_main_process():
-    #         raise
 
 
 if __name__ == "__main__":
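
A minimal usage sketch (not part of the patch itself), assuming the change above has been applied: the new mmf.utils.env.set_seed helper returns the seed it actually used. Passing -1 derives a strong seed from the process id, the current time, and two bytes of os.urandom; passing null/None skips seeding entirely; any other value seeds numpy, torch, and Python's random with that value and returns it unchanged.

    from mmf.utils.env import set_seed

    # -1 asks for a strong, freshly generated seed (pid + time + os.urandom)
    generated = set_seed(-1)
    print("Using seed {}".format(generated))

    # None (null in the YAML config) leaves numpy, torch, and random unseeded
    assert set_seed(None) is None

    # Any other value is used as-is to seed numpy, torch, and random
    assert set_seed(1234) == 1234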