diff --git a/examples/atari/reproduction/a3c/train_a3c.py b/examples/atari/reproduction/a3c/train_a3c.py index f3cf1cadf..f4dc506eb 100644 --- a/examples/atari/reproduction/a3c/train_a3c.py +++ b/examples/atari/reproduction/a3c/train_a3c.py @@ -33,7 +33,7 @@ def main(): parser.add_argument("--t-max", type=int, default=5) parser.add_argument("--beta", type=float, default=1e-2) parser.add_argument("--profile", action="store_true") - parser.add_argument("--steps", type=int, default=8 * 10 ** 7) + parser.add_argument("--steps", type=int, default=8 * 10**7) parser.add_argument( "--max-frames", type=int, @@ -84,7 +84,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.processes) + args.seed * args.processes - assert process_seeds.max() < 2 ** 31 + assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -92,7 +92,7 @@ def main(): def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] - env_seed = 2 ** 31 - 1 - process_seed if test else process_seed + env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, diff --git a/examples/atari/reproduction/dqn/train_dqn.py b/examples/atari/reproduction/dqn/train_dqn.py index 72c210ad5..ae32bef65 100644 --- a/examples/atari/reproduction/dqn/train_dqn.py +++ b/examples/atari/reproduction/dqn/train_dqn.py @@ -64,13 +64,13 @@ def main(): parser.add_argument( "--steps", type=int, - default=5 * 10 ** 7, + default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, - default=5 * 10 ** 4, + default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument("--eval-n-steps", type=int, default=125000) @@ -87,7 +87,7 @@ def main(): # Set different random seeds for train and test envs. 
train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -133,12 +133,12 @@ def make_env(test): centered=True, ) - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, - decay_steps=10 ** 6, + decay_steps=10**6, random_action_func=lambda: np.random.randint(n_actions), ) @@ -155,7 +155,7 @@ def phi(x): gamma=0.99, explorer=explorer, replay_start_size=args.replay_start_size, - target_update_interval=10 ** 4, + target_update_interval=10**4, clip_delta=True, update_interval=4, batch_accumulator="sum", diff --git a/examples/atari/reproduction/iqn/train_iqn.py b/examples/atari/reproduction/iqn/train_iqn.py index bd3060747..c1f6d2364 100644 --- a/examples/atari/reproduction/iqn/train_iqn.py +++ b/examples/atari/reproduction/iqn/train_iqn.py @@ -31,18 +31,18 @@ def main(): "--pretrained-type", type=str, default="best", choices=["best", "final"] ) parser.add_argument("--load", type=str, default=None) - parser.add_argument("--final-exploration-frames", type=int, default=10 ** 6) + parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) - parser.add_argument("--steps", type=int, default=5 * 10 ** 7) + parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) - parser.add_argument("--replay-start-size", type=int, default=5 * 10 ** 4) - parser.add_argument("--target-update-interval", type=int, default=10 ** 4) + parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) + parser.add_argument("--target-update-interval", type=int, default=10**4) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--update-interval", type=int, default=4) @@ -85,7 +85,7 @@ def main(): # Set different random seeds for train and test envs. 
train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -138,7 +138,7 @@ def make_env(test): # Use the same hyper parameters as https://arxiv.org/abs/1710.10044 opt = torch.optim.Adam(q_func.parameters(), lr=5e-5, eps=1e-2 / args.batch_size) - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, diff --git a/examples/atari/reproduction/rainbow/train_rainbow.py b/examples/atari/reproduction/rainbow/train_rainbow.py index b7dbbaee4..065ed8b24 100644 --- a/examples/atari/reproduction/rainbow/train_rainbow.py +++ b/examples/atari/reproduction/rainbow/train_rainbow.py @@ -35,14 +35,14 @@ def main(): parser.add_argument("--load", type=str, default=None) parser.add_argument("--eval-epsilon", type=float, default=0.0) parser.add_argument("--noisy-net-sigma", type=float, default=0.5) - parser.add_argument("--steps", type=int, default=5 * 10 ** 7) + parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) - parser.add_argument("--replay-start-size", type=int, default=2 * 10 ** 4) + parser.add_argument("--replay-start-size", type=int, default=2 * 10**4) parser.add_argument("--eval-n-steps", type=int, default=125000) parser.add_argument("--eval-interval", type=int, default=250000) parser.add_argument( @@ -77,7 +77,7 @@ def main(): # Set different random seeds for train and test envs. train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -123,14 +123,14 @@ def make_env(test): explorer = explorers.Greedy() # Use the same hyper parameters as https://arxiv.org/abs/1710.02298 - opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10 ** -4) + opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4) # Prioritized Replay # Anneal beta from beta0 to 1 throughout training update_interval = 4 betasteps = args.steps / update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( - 10 ** 6, + 10**6, alpha=0.5, beta0=0.4, betasteps=betasteps, diff --git a/examples/atari/train_a2c_ale.py b/examples/atari/train_a2c_ale.py index bed2d71ef..282c91633 100644 --- a/examples/atari/train_a2c_ale.py +++ b/examples/atari/train_a2c_ale.py @@ -28,7 +28,7 @@ def main(): default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) - parser.add_argument("--steps", type=int, default=8 * 10 ** 7) + parser.add_argument("--steps", type=int, default=8 * 10**7) parser.add_argument("--update-steps", type=int, default=5) parser.add_argument("--lr", type=float, default=7e-4) parser.add_argument("--gamma", type=float, default=0.99, help="discount factor") @@ -43,7 +43,7 @@ def main(): parser.add_argument( "--alpha", type=float, default=0.99, help="RMSprop optimizer alpha" ) - parser.add_argument("--eval-interval", type=int, default=10 ** 6) + parser.add_argument("--eval-interval", type=int, default=10**6) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default="") @@ -92,7 +92,7 @@ def main(): # If 
seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 31 + assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -100,7 +100,7 @@ def main(): def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] - env_seed = 2 ** 31 - 1 - process_seed if test else process_seed + env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, diff --git a/examples/atari/train_acer_ale.py b/examples/atari/train_acer_ale.py index 0cf3bbe71..091377718 100644 --- a/examples/atari/train_acer_ale.py +++ b/examples/atari/train_acer_ale.py @@ -38,7 +38,7 @@ def main(): parser.add_argument("--n-times-replay", type=int, default=4) parser.add_argument("--beta", type=float, default=1e-2) parser.add_argument("--profile", action="store_true") - parser.add_argument("--steps", type=int, default=10 ** 7) + parser.add_argument("--steps", type=int, default=10**7) parser.add_argument( "--max-frames", type=int, @@ -46,7 +46,7 @@ def main(): help="Maximum number of frames for each episode.", ) parser.add_argument("--lr", type=float, default=7e-4) - parser.add_argument("--eval-interval", type=int, default=10 ** 5) + parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--use-lstm", action="store_true") parser.add_argument("--demo", action="store_true", default=False) @@ -87,7 +87,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.processes) + args.seed * args.processes - assert process_seeds.max() < 2 ** 31 + assert process_seeds.max() < 2**31 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -130,7 +130,7 @@ def main(): model.parameters(), lr=args.lr, eps=4e-3, alpha=0.99 ) - replay_buffer = EpisodicReplayBuffer(10 ** 6 // args.processes) + replay_buffer = EpisodicReplayBuffer(10**6 // args.processes) def phi(x): # Feature extractor @@ -156,7 +156,7 @@ def phi(x): def make_env(process_idx, test): # Use different random seeds for train and test envs process_seed = process_seeds[process_idx] - env_seed = 2 ** 31 - 1 - process_seed if test else process_seed + env_seed = 2**31 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, diff --git a/examples/atari/train_categorical_dqn_ale.py b/examples/atari/train_categorical_dqn_ale.py index 4620209a0..f25c28c08 100644 --- a/examples/atari/train_categorical_dqn_ale.py +++ b/examples/atari/train_categorical_dqn_ale.py @@ -24,19 +24,19 @@ def main(): parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) - parser.add_argument("--final-exploration-frames", type=int, default=10 ** 6) + parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.1) parser.add_argument("--eval-epsilon", type=float, default=0.05) - parser.add_argument("--steps", type=int, default=10 ** 7) + parser.add_argument("--steps", type=int, default=10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) - parser.add_argument("--replay-start-size", type=int, default=5 * 10 ** 4) - parser.add_argument("--target-update-interval", type=int, default=10 ** 4) - parser.add_argument("--eval-interval", type=int, default=10 ** 5) + parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) + parser.add_argument("--target-update-interval", type=int, default=10**4) + parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--batch-size", type=int, default=32) @@ -71,7 +71,7 @@ def main(): # Set different random seeds for train and test envs. 
train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -120,7 +120,7 @@ def make_env(test): # Use the same hyper parameters as https://arxiv.org/abs/1707.06887 opt = torch.optim.Adam(q_func.parameters(), 2.5e-4, eps=1e-2 / args.batch_size) - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, diff --git a/examples/atari/train_dqn_ale.py b/examples/atari/train_dqn_ale.py index 7f7dcc213..e09256b07 100644 --- a/examples/atari/train_dqn_ale.py +++ b/examples/atari/train_dqn_ale.py @@ -86,7 +86,7 @@ def main(): parser.add_argument( "--final-exploration-frames", type=int, - default=10 ** 6, + default=10**6, help="Timesteps after which we stop " + "annealing exploration rate", ) parser.add_argument( @@ -112,7 +112,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=5 * 10 ** 7, + default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -124,19 +124,19 @@ def main(): parser.add_argument( "--replay-start-size", type=int, - default=5 * 10 ** 4, + default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, - default=3 * 10 ** 4, + default=3 * 10**4, help="Frequency (in timesteps) at which " + "the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, - default=10 ** 5, + default=10**5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( @@ -196,7 +196,7 @@ def main(): # Set different random seeds for train and test envs. 
train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -254,14 +254,14 @@ def make_env(test): # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( - 10 ** 6, + 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.num_step_return, ) else: - rbuf = replay_buffers.ReplayBuffer(10 ** 6, args.num_step_return) + rbuf = replay_buffers.ReplayBuffer(10**6, args.num_step_return) def phi(x): # Feature extractor diff --git a/examples/atari/train_dqn_batch_ale.py b/examples/atari/train_dqn_batch_ale.py index 413ab830d..68262dfd4 100644 --- a/examples/atari/train_dqn_batch_ale.py +++ b/examples/atari/train_dqn_batch_ale.py @@ -79,7 +79,7 @@ def main(): parser.add_argument("--gpu", type=int, default=0) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) - parser.add_argument("--final-exploration-frames", type=int, default=10 ** 6) + parser.add_argument("--final-exploration-frames", type=int, default=10**6) parser.add_argument("--final-epsilon", type=float, default=0.01) parser.add_argument("--eval-epsilon", type=float, default=0.001) parser.add_argument("--noisy-net-sigma", type=float, default=None) @@ -89,16 +89,16 @@ def main(): default="doubledqn", choices=["nature", "nips", "dueling", "doubledqn"], ) - parser.add_argument("--steps", type=int, default=5 * 10 ** 7) + parser.add_argument("--steps", type=int, default=5 * 10**7) parser.add_argument( "--max-frames", type=int, default=30 * 60 * 60, # 30 minutes with 60 fps help="Maximum number of frames for each episode.", ) - parser.add_argument("--replay-start-size", type=int, default=5 * 10 ** 4) - parser.add_argument("--target-update-interval", type=int, default=3 * 10 ** 4) - parser.add_argument("--eval-interval", type=int, default=10 ** 5) + parser.add_argument("--replay-start-size", type=int, default=5 * 10**4) + parser.add_argument("--target-update-interval", type=int, default=3 * 10**4) + parser.add_argument("--eval-interval", type=int, default=10**5) parser.add_argument("--update-interval", type=int, default=4) parser.add_argument("--eval-n-runs", type=int, default=10) parser.add_argument("--no-clip-delta", dest="clip_delta", action="store_false") @@ -148,7 +148,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -156,7 +156,7 @@ def main(): def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, @@ -210,14 +210,14 @@ def make_batch_env(test): # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( - 10 ** 6, + 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps, num_steps=args.n_step_return, ) else: - rbuf = replay_buffers.ReplayBuffer(10 ** 6, num_steps=args.n_step_return) + rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, diff --git a/examples/atari/train_drqn_ale.py b/examples/atari/train_drqn_ale.py index effc4a5cd..ccbefa699 100644 --- a/examples/atari/train_drqn_ale.py +++ b/examples/atari/train_drqn_ale.py @@ -49,7 +49,7 @@ def main(): parser.add_argument( "--final-exploration-frames", type=int, - default=10 ** 6, + default=10**6, help="Timesteps after which we stop " + "annealing exploration rate", ) parser.add_argument( @@ -67,7 +67,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=5 * 10 ** 7, + default=5 * 10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -79,13 +79,13 @@ def main(): parser.add_argument( "--replay-start-size", type=int, - default=5 * 10 ** 4, + default=5 * 10**4, help="Minimum replay buffer size before " + "performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, - default=3 * 10 ** 4, + default=3 * 10**4, help="Frequency (in timesteps) at which " + "the target network is updated.", ) parser.add_argument("--demo-n-episodes", type=int, default=30) @@ -173,7 +173,7 @@ def main(): # Set different random seeds for train and test envs. train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -222,7 +222,7 @@ def make_env(test): DiscreteActionValueHead(), ) # Replay buffer that stores whole episodes - rbuf = replay_buffers.EpisodicReplayBuffer(10 ** 6) + rbuf = replay_buffers.EpisodicReplayBuffer(10**6) else: # Q-network without LSTM q_func = nn.Sequential( @@ -238,7 +238,7 @@ def make_env(test): DiscreteActionValueHead(), ) # Replay buffer that stores transitions separately - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.LinearDecayEpsilonGreedy( 1.0, diff --git a/examples/atari/train_ppo_ale.py b/examples/atari/train_ppo_ale.py index 7d99c5085..80bac591f 100644 --- a/examples/atari/train_ppo_ale.py +++ b/examples/atari/train_ppo_ale.py @@ -47,7 +47,7 @@ def main(): ), ) parser.add_argument( - "--steps", type=int, default=10 ** 7, help="Total time steps for training." + "--steps", type=int, default=10**7, help="Total time steps for training." 
) parser.add_argument( "--max-frames", @@ -169,7 +169,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -177,7 +177,7 @@ def main(): def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed env = atari_wrappers.wrap_deepmind( atari_wrappers.make_atari(args.env, max_frames=args.max_frames), episode_life=not test, diff --git a/examples/atlas/train_soft_actor_critic_atlas.py b/examples/atlas/train_soft_actor_critic_atlas.py index dafda918b..8dc411192 100644 --- a/examples/atlas/train_soft_actor_critic_atlas.py +++ b/examples/atlas/train_soft_actor_critic_atlas.py @@ -29,7 +29,7 @@ def make_env(args, seed, test): assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs - env_seed = 2 ** 32 - 1 - seed if test else seed + env_seed = 2**32 - 1 - seed if test else seed env.seed(int(env_seed)) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) @@ -75,7 +75,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=10 ** 7, + default=10**7, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -145,7 +145,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 def make_batch_env(test): return pfrl.envs.MultiprocessVectorEnv( @@ -213,7 +213,7 @@ def make_q_func_with_optimizer(): q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() - rbuf = replay_buffers.ReplayBuffer(10 ** 6, num_steps=args.n_step_return) + rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return) def burnin_action_func(): """Select random actions until model is updated one or more times.""" diff --git a/examples/grasping/train_dqn_batch_grasping.py b/examples/grasping/train_dqn_batch_grasping.py index 7fc6c63ce..0274a0530 100644 --- a/examples/grasping/train_dqn_batch_grasping.py +++ b/examples/grasping/train_dqn_batch_grasping.py @@ -151,7 +151,7 @@ def main(): parser.add_argument( "--final-exploration-steps", type=int, - default=5 * 10 ** 5, + default=5 * 10**5, help="Timesteps after which we stop annealing exploration rate", ) parser.add_argument( @@ -163,25 +163,25 @@ def main(): parser.add_argument( "--steps", type=int, - default=2 * 10 ** 6, + default=2 * 10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( "--replay-start-size", type=int, - default=5 * 10 ** 4, + default=5 * 10**4, help="Minimum replay buffer size before performing gradient updates.", ) parser.add_argument( "--target-update-interval", type=int, - default=1 * 10 ** 4, + default=1 * 10**4, help="Frequency (in timesteps) at which the target network is updated.", ) parser.add_argument( "--eval-interval", type=int, - default=10 ** 5, + default=10**5, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( @@ -235,7 +235,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -249,7 +249,7 @@ def make_env(idx, test): # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed # Set a random seed for this subprocess utils.set_random_seed(env_seed) env = KukaDiverseObjectEnv( @@ -302,7 +302,7 @@ def make_batch_env(test): # Anneal beta from beta0 to 1 throughout training betasteps = args.steps / args.update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( - 10 ** 6, alpha=0.6, beta0=0.4, betasteps=betasteps + 10**6, alpha=0.6, beta0=0.4, betasteps=betasteps ) explorer = explorers.LinearDecayEpsilonGreedy( diff --git a/examples/gym/train_categorical_dqn_gym.py b/examples/gym/train_categorical_dqn_gym.py index 9971c5a21..7c7105189 100644 --- a/examples/gym/train_categorical_dqn_gym.py +++ b/examples/gym/train_categorical_dqn_gym.py @@ -40,7 +40,7 @@ def main(): parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) - parser.add_argument("--steps", type=int, default=10 ** 8) + parser.add_argument("--steps", type=int, default=10**8) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=50) parser.add_argument("--target-update-interval", type=int, default=100) @@ -67,7 +67,7 @@ def main(): def make_env(test): env = gym.make(args.env) - env_seed = 2 ** 32 - 1 - args.seed if test else args.seed + env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) diff --git a/examples/gym/train_dqn_gym.py b/examples/gym/train_dqn_gym.py index acbf8cc70..9319a9125 100644 --- a/examples/gym/train_dqn_gym.py +++ b/examples/gym/train_dqn_gym.py @@ -45,21 +45,21 @@ def main(): parser.add_argument("--env", type=str, default="Pendulum-v0") parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)") parser.add_argument("--gpu", type=int, default=0) - parser.add_argument("--final-exploration-steps", type=int, default=10 ** 4) + parser.add_argument("--final-exploration-steps", type=int, default=10**4) parser.add_argument("--start-epsilon", type=float, default=1.0) parser.add_argument("--end-epsilon", type=float, default=0.1) parser.add_argument("--noisy-net-sigma", type=float, default=None) parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) - parser.add_argument("--steps", type=int, default=10 ** 5) + parser.add_argument("--steps", type=int, default=10**5) parser.add_argument("--prioritized-replay", action="store_true") parser.add_argument("--replay-start-size", type=int, default=1000) - parser.add_argument("--target-update-interval", type=int, default=10 ** 2) + parser.add_argument("--target-update-interval", type=int, default=10**2) parser.add_argument("--target-update-method", type=str, default="hard") parser.add_argument("--soft-update-tau", type=float, default=1e-2) parser.add_argument("--update-interval", type=int, default=1) 
parser.add_argument("--eval-n-runs", type=int, default=100) - parser.add_argument("--eval-interval", type=int, default=10 ** 4) + parser.add_argument("--eval-interval", type=int, default=10**4) parser.add_argument("--n-hidden-channels", type=int, default=100) parser.add_argument("--n-hidden-layers", type=int, default=2) parser.add_argument("--gamma", type=float, default=0.99) @@ -94,7 +94,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 def clip_action_filter(a): return np.clip(a, action_space.low, action_space.high) @@ -103,7 +103,7 @@ def make_env(idx=0, test=False): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed utils.set_random_seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) @@ -161,7 +161,7 @@ def make_env(idx=0, test=False): opt = optim.Adam(q_func.parameters()) - rbuf_capacity = 5 * 10 ** 5 + rbuf_capacity = 5 * 10**5 if args.minibatch_size is None: args.minibatch_size = 32 if args.prioritized_replay: diff --git a/examples/gym/train_reinforce_gym.py b/examples/gym/train_reinforce_gym.py index de187cb0d..f2c9eaa61 100644 --- a/examples/gym/train_reinforce_gym.py +++ b/examples/gym/train_reinforce_gym.py @@ -39,8 +39,8 @@ def main(): ) parser.add_argument("--beta", type=float, default=1e-4) parser.add_argument("--batchsize", type=int, default=10) - parser.add_argument("--steps", type=int, default=10 ** 5) - parser.add_argument("--eval-interval", type=int, default=10 ** 4) + parser.add_argument("--steps", type=int, default=10**5) + parser.add_argument("--eval-interval", type=int, default=10**4) parser.add_argument("--eval-n-runs", type=int, default=100) parser.add_argument("--reward-scale-factor", type=float, default=1e-2) parser.add_argument("--render", action="store_true", default=False) @@ -61,7 +61,7 @@ def main(): def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs - env_seed = 2 ** 32 - 1 - args.seed if test else args.seed + env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) diff --git a/examples/mujoco/reproduction/ddpg/train_ddpg.py b/examples/mujoco/reproduction/ddpg/train_ddpg.py index 45614ead9..705cccc5d 100644 --- a/examples/mujoco/reproduction/ddpg/train_ddpg.py +++ b/examples/mujoco/reproduction/ddpg/train_ddpg.py @@ -49,7 +49,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=10 ** 6, + default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -103,7 +103,7 @@ def make_env(test): assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs - env_seed = 2 ** 32 - 1 - args.seed if test else args.seed + env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) @@ -144,7 +144,7 @@ def make_env(test): opt_a = torch.optim.Adam(policy.parameters()) opt_c = 
torch.optim.Adam(q_func.parameters()) - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.AdditiveGaussian( scale=0.1, low=action_space.low, high=action_space.high diff --git a/examples/mujoco/reproduction/ppo/train_ppo.py b/examples/mujoco/reproduction/ppo/train_ppo.py index 8bf7fbe5f..a42d8f0af 100644 --- a/examples/mujoco/reproduction/ppo/train_ppo.py +++ b/examples/mujoco/reproduction/ppo/train_ppo.py @@ -46,7 +46,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=2 * 10 ** 6, + default=2 * 10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -107,7 +107,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 args.outdir = experiments.prepare_output_dir(args, args.outdir) @@ -115,7 +115,7 @@ def make_env(process_idx, test): env = gym.make(args.env) # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) diff --git a/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py b/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py index 929cb2925..548a2ae38 100644 --- a/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py +++ b/examples/mujoco/reproduction/soft_actor_critic/train_soft_actor_critic.py @@ -51,7 +51,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=10 ** 6, + default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -115,7 +115,7 @@ def main(): # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3]. # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7]. 
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs - assert process_seeds.max() < 2 ** 32 + assert process_seeds.max() < 2**32 def make_env(process_idx, test): env = gym.make(args.env) @@ -124,7 +124,7 @@ def make_env(process_idx, test): env = env.env # Use different random seeds for train and test envs process_seed = int(process_seeds[process_idx]) - env_seed = 2 ** 32 - 1 - process_seed if test else process_seed + env_seed = 2**32 - 1 - process_seed if test else process_seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) @@ -201,7 +201,7 @@ def make_q_func_with_optimizer(): q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) def burnin_action_func(): """Select random actions until model is updated one or more times.""" diff --git a/examples/mujoco/reproduction/td3/train_td3.py b/examples/mujoco/reproduction/td3/train_td3.py index 2ca26a44a..7913a3765 100644 --- a/examples/mujoco/reproduction/td3/train_td3.py +++ b/examples/mujoco/reproduction/td3/train_td3.py @@ -46,7 +46,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=10 ** 6, + default=10**6, help="Total number of timesteps to train the agent.", ) parser.add_argument( @@ -100,7 +100,7 @@ def make_env(test): assert isinstance(env, gym.wrappers.TimeLimit) env = env.env # Use different random seeds for train and test envs - env_seed = 2 ** 32 - 1 - args.seed if test else args.seed + env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) @@ -146,7 +146,7 @@ def make_q_func_with_optimizer(): q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() - rbuf = replay_buffers.ReplayBuffer(10 ** 6) + rbuf = replay_buffers.ReplayBuffer(10**6) explorer = explorers.AdditiveGaussian( scale=0.1, low=action_space.low, high=action_space.high diff --git a/examples/mujoco/reproduction/trpo/train_trpo.py b/examples/mujoco/reproduction/trpo/train_trpo.py index f9c88c79f..0a9de705b 100644 --- a/examples/mujoco/reproduction/trpo/train_trpo.py +++ b/examples/mujoco/reproduction/trpo/train_trpo.py @@ -33,7 +33,7 @@ def main(): ), ) parser.add_argument( - "--steps", type=int, default=2 * 10 ** 6, help="Total time steps for training." + "--steps", type=int, default=2 * 10**6, help="Total time steps for training." ) parser.add_argument( "--eval-interval", @@ -98,7 +98,7 @@ def main(): def make_env(test): env = gym.make(args.env) # Use different random seeds for train and test envs - env_seed = 2 ** 32 - 1 - args.seed if test else args.seed + env_seed = 2**32 - 1 - args.seed if test else args.seed env.seed(env_seed) # Cast observations to float32 because our model uses float32 env = pfrl.wrappers.CastObservationToFloat32(env) diff --git a/examples/optuna/optuna_dqn_obs1d.py b/examples/optuna/optuna_dqn_obs1d.py index 51a7638e1..c21e70e8d 100644 --- a/examples/optuna/optuna_dqn_obs1d.py +++ b/examples/optuna/optuna_dqn_obs1d.py @@ -51,7 +51,7 @@ def _objective_core( # Set different random seeds for train and test envs. 
train_seed = seed - test_seed = 2 ** 31 - 1 - seed + test_seed = 2**31 - 1 - seed def make_env(test=False): env = gym.make(env_id) @@ -275,7 +275,7 @@ def main(): parser.add_argument( "--steps", type=int, - default=4 * 10 ** 5, + default=4 * 10**5, help="Total number of timesteps to train the agent for each trial", ) parser.add_argument( @@ -293,7 +293,7 @@ def main(): parser.add_argument( "--eval-interval", type=int, - default=10 ** 4, + default=10**4, help="Frequency (in timesteps) of evaluation phase.", ) parser.add_argument( @@ -322,7 +322,7 @@ def main(): parser.add_argument( "--optuna-training-steps-budget", type=int, - default=4 * 10 ** 7, + default=4 * 10**7, help=( "Total training steps thoughout the optimization. If the pruner works " "well, this limited training steps can be allocated to promissing trials " @@ -399,7 +399,7 @@ def objective(trial): hyperparams = suggest(trial, args.steps) # seed is generated for each objective - seed = randomizer.randint(0, 2 ** 31 - 1) + seed = randomizer.randint(0, 2**31 - 1) additional_args = dict(seed=seed, **hyperparams) outdir = experiments.prepare_output_dir(args=additional_args, basedir=rootdir) diff --git a/examples/slimevolley/train_rainbow.py b/examples/slimevolley/train_rainbow.py index d5a2fc10d..b309506ed 100644 --- a/examples/slimevolley/train_rainbow.py +++ b/examples/slimevolley/train_rainbow.py @@ -27,7 +27,7 @@ def __init__(self, env): super().__init__(env) assert isinstance(env.action_space, gym.spaces.MultiBinary) self.orig_action_space = env.action_space - self.action_space = gym.spaces.Discrete(2 ** env.action_space.n) + self.action_space = gym.spaces.Discrete(2**env.action_space.n) def action(self, action): return [(action >> i) % 2 for i in range(self.orig_action_space.n)] @@ -84,7 +84,7 @@ def main(): parser.add_argument("--demo", action="store_true", default=False) parser.add_argument("--load", type=str, default=None) parser.add_argument("--noisy-net-sigma", type=float, default=0.1) - parser.add_argument("--steps", type=int, default=2 * 10 ** 6) + parser.add_argument("--steps", type=int, default=2 * 10**6) parser.add_argument("--replay-start-size", type=int, default=1600) parser.add_argument("--eval-n-episodes", type=int, default=1000) parser.add_argument("--eval-interval", type=int, default=250000) @@ -122,7 +122,7 @@ def main(): # Set different random seeds for train and test envs. 
train_seed = args.seed - test_seed = 2 ** 31 - 1 - args.seed + test_seed = 2**31 - 1 - args.seed args.outdir = experiments.prepare_output_dir(args, args.outdir) print("Output files are saved in {}".format(args.outdir)) @@ -181,7 +181,7 @@ def phi(x): update_interval = 1 betasteps = args.steps / update_interval rbuf = replay_buffers.PrioritizedReplayBuffer( - 10 ** 6, + 10**6, alpha=0.5, beta0=0.4, betasteps=betasteps, diff --git a/pfrl/agents/acer.py b/pfrl/agents/acer.py index 47ef07d5d..332946e13 100644 --- a/pfrl/agents/acer.py +++ b/pfrl/agents/acer.py @@ -321,7 +321,7 @@ def __init__( truncation_threshold=10, disable_online_update=False, n_times_replay=8, - replay_start_size=10 ** 4, + replay_start_size=10**4, normalize_loss_by_steps=True, act_deterministically=False, max_grad_norm=None, diff --git a/pfrl/experiments/train_agent_async.py b/pfrl/experiments/train_agent_async.py index 03bd8fdac..9856bb0a2 100644 --- a/pfrl/experiments/train_agent_async.py +++ b/pfrl/experiments/train_agent_async.py @@ -151,8 +151,8 @@ def train_agent_async( processes, make_env, profile=False, - steps=8 * 10 ** 7, - eval_interval=10 ** 6, + steps=8 * 10**7, + eval_interval=10**6, eval_n_steps=None, eval_n_episodes=10, eval_success_threshold=0.0, diff --git a/pfrl/explorers/additive_ou.py b/pfrl/explorers/additive_ou.py index 4b9819ac3..b2d6096d9 100644 --- a/pfrl/explorers/additive_ou.py +++ b/pfrl/explorers/additive_ou.py @@ -44,7 +44,7 @@ def select_action(self, t, greedy_action_func, action_value=None): if self.start_with_mu: self.ou_state = np.full(a.shape, self.mu, dtype=np.float32) else: - sigma_stable = self.sigma / np.sqrt(2 * self.theta - self.theta ** 2) + sigma_stable = self.sigma / np.sqrt(2 * self.theta - self.theta**2) self.ou_state = np.random.normal( size=a.shape, loc=self.mu, scale=sigma_stable ).astype(np.float32) diff --git a/pfrl/explorers/epsilon_greedy.py b/pfrl/explorers/epsilon_greedy.py index 702869208..c034d6a5c 100644 --- a/pfrl/explorers/epsilon_greedy.py +++ b/pfrl/explorers/epsilon_greedy.py @@ -118,7 +118,7 @@ def __init__( self.epsilon = start_epsilon def compute_epsilon(self, t): - epsilon = self.start_epsilon * (self.decay ** t) + epsilon = self.start_epsilon * (self.decay**t) return max(epsilon, self.end_epsilon) def select_action(self, t, greedy_action_func, action_value=None): diff --git a/pfrl/q_functions/state_q_functions.py b/pfrl/q_functions/state_q_functions.py index 1eedd573d..54762c6d1 100644 --- a/pfrl/q_functions/state_q_functions.py +++ b/pfrl/q_functions/state_q_functions.py @@ -209,7 +209,7 @@ def forward(self, state): tril = lower_triangular_matrix(mat_diag, mat_non_diag) mat = torch.matmul(tril, torch.transpose(tril, 1, 2)) else: - mat = torch.unsqueeze(mat_diag ** 2, dim=2) + mat = torch.unsqueeze(mat_diag**2, dim=2) return QuadraticActionValue( mu, mat, diff --git a/pfrl/replay_buffer.py b/pfrl/replay_buffer.py index 7da0fd3f9..5d2edea8e 100644 --- a/pfrl/replay_buffer.py +++ b/pfrl/replay_buffer.py @@ -182,7 +182,7 @@ def batch_experiences(experiences, device, phi, gamma, batch_states=batch_states ), "reward": torch.as_tensor( [ - sum((gamma ** i) * exp[i]["reward"] for i in range(len(exp))) + sum((gamma**i) * exp[i]["reward"] for i in range(len(exp))) for exp in experiences ], dtype=torch.float32, diff --git a/pfrl/utils/reward_filter.py b/pfrl/utils/reward_filter.py index cf85b3227..9df735f56 100644 --- a/pfrl/utils/reward_filter.py +++ b/pfrl/utils/reward_filter.py @@ -10,8 +10,8 @@ def __call__(self, reward): self.average_reward *= 1 - self.tau 
self.average_reward += self.tau * reward self.average_reward_squared *= 1 - self.tau - self.average_reward_squared += self.tau * reward ** 2 - var = self.average_reward_squared - self.average_reward ** 2 + self.average_reward_squared += self.tau * reward**2 + var = self.average_reward_squared - self.average_reward**2 stdev = min(var, self.eps) ** 0.5 return self.scale * (reward - self.average_reward) / stdev diff --git a/pfrl/wrappers/monitor.py b/pfrl/wrappers/monitor.py index 7d8924e3d..4e8e842da 100644 --- a/pfrl/wrappers/monitor.py +++ b/pfrl/wrappers/monitor.py @@ -1,8 +1,22 @@ import time from logging import getLogger -from gym.wrappers import Monitor as _GymMonitor -from gym.wrappers.monitoring.stats_recorder import StatsRecorder as _GymStatsRecorder +try: + from gym.wrappers import Monitor as _GymMonitor +except ImportError: + + class _Stub: + def __init__(self, *args, **kwargs): + raise RuntimeError("Monitor is not available in this version of gym") + + class _GymMonitor(_Stub): # type: ignore + pass + + class _GymStatsRecorder(_Stub): + pass + +else: + from gym.wrappers.monitoring.stats_recorder import StatsRecorder as _GymStatsRecorder # type: ignore # isort: skip # noqa: E501 class Monitor(_GymMonitor): diff --git a/tests/agents_tests/basetest_ddpg.py b/tests/agents_tests/basetest_ddpg.py index 4d95508a6..626e1a421 100644 --- a/tests/agents_tests/basetest_ddpg.py +++ b/tests/agents_tests/basetest_ddpg.py @@ -54,7 +54,7 @@ def random_action_func(): return LinearDecayEpsilonGreedy(1.0, 0.2, 1000, random_action_func) def make_replay_buffer(self, env): - return replay_buffers.ReplayBuffer(10 ** 5) + return replay_buffers.ReplayBuffer(10**5) class _TestDDPGOnContinuousPOABC(_TestDDPGOnABC): @@ -84,7 +84,7 @@ def make_env_and_successful_return(self, test): return ABC(discrete=False, partially_observable=True, deterministic=test), 1 def make_replay_buffer(self, env): - return replay_buffers.EpisodicReplayBuffer(10 ** 5) + return replay_buffers.EpisodicReplayBuffer(10**5) class _TestDDPGOnContinuousABC(_TestDDPGOnABC): diff --git a/tests/agents_tests/basetest_dqn_like.py b/tests/agents_tests/basetest_dqn_like.py index f959ea50a..a9ecd37f1 100644 --- a/tests/agents_tests/basetest_dqn_like.py +++ b/tests/agents_tests/basetest_dqn_like.py @@ -64,7 +64,7 @@ def make_optimizer(self, env, q_func): return opt def make_replay_buffer(self, env): - return replay_buffers.ReplayBuffer(10 ** 5) + return replay_buffers.ReplayBuffer(10**5) class _TestDQNOnDiscreteABC(_TestDQNOnABC): @@ -89,7 +89,7 @@ def make_q_func(self, env): ) def make_replay_buffer(self, env): - return replay_buffers.EpisodicReplayBuffer(10 ** 5) + return replay_buffers.EpisodicReplayBuffer(10**5) def make_env_and_successful_return(self, test): return ABC(discrete=True, partially_observable=True, deterministic=test), 1 @@ -102,7 +102,7 @@ def make_optimizer(self, env, q_func): class _TestNStepDQNOnABC(_TestDQNOnABC): def make_replay_buffer(self, env): - return replay_buffers.ReplayBuffer(10 ** 5, num_steps=3) + return replay_buffers.ReplayBuffer(10**5, num_steps=3) class _TestNStepDQNOnDiscreteABC(_TestNStepDQNOnABC): diff --git a/tests/agents_tests/test_acer.py b/tests/agents_tests/test_acer.py index d19158c9a..944412d40 100644 --- a/tests/agents_tests/test_acer.py +++ b/tests/agents_tests/test_acer.py @@ -368,7 +368,7 @@ def make_env(process_idx, test): action_space = sample_env.action_space obs_space = sample_env.observation_space - replay_buffer = EpisodicReplayBuffer(10 ** 4) + replay_buffer = EpisodicReplayBuffer(10**4) 
obs_size = obs_space.low.size hidden_size = 20 if discrete: diff --git a/tests/agents_tests/test_dqn.py b/tests/agents_tests/test_dqn.py index bb4607a59..28e16e1c2 100644 --- a/tests/agents_tests/test_dqn.py +++ b/tests/agents_tests/test_dqn.py @@ -137,7 +137,7 @@ def make_dqn_agent(self, env, q_func, opt, explorer, rbuf, gpu): def _huber_loss_1(a): if abs(a) < 1: - return 0.5 * a ** 2 + return 0.5 * a**2 else: return abs(a) - 0.5 @@ -157,7 +157,7 @@ def setUp(self, clip_delta, batch_accumulator): [_huber_loss_1(a) for a in self.y - self.t] ) else: - self.gt_losses = torch.FloatTensor([0.5 * a ** 2 for a in self.y - self.t]) + self.gt_losses = torch.FloatTensor([0.5 * a**2 for a in self.y - self.t]) def test_not_weighted(self): loss = compute_value_loss( diff --git a/tests/agents_tests/test_soft_actor_critic.py b/tests/agents_tests/test_soft_actor_critic.py index ac4cb3a9c..a37c5cdf4 100644 --- a/tests/agents_tests/test_soft_actor_critic.py +++ b/tests/agents_tests/test_soft_actor_critic.py @@ -200,7 +200,7 @@ def make_q_func_with_optimizer(): q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() - rbuf = pfrl.replay_buffers.ReplayBuffer(10 ** 6) + rbuf = pfrl.replay_buffers.ReplayBuffer(10**6) def burnin_action_func(): return np.random.uniform( diff --git a/tests/agents_tests/test_td3.py b/tests/agents_tests/test_td3.py index 17d5abafa..39f62f194 100644 --- a/tests/agents_tests/test_td3.py +++ b/tests/agents_tests/test_td3.py @@ -180,7 +180,7 @@ def make_q_func_with_optimizer(): q_func1, q_func1_optimizer = make_q_func_with_optimizer() q_func2, q_func2_optimizer = make_q_func_with_optimizer() - rbuf = pfrl.replay_buffers.ReplayBuffer(10 ** 6) + rbuf = pfrl.replay_buffers.ReplayBuffer(10**6) explorer = pfrl.explorers.AdditiveGaussian( scale=0.3, low=env.action_space.low, high=env.action_space.high diff --git a/tests/nn_tests/test_noisy_linear.py b/tests/nn_tests/test_noisy_linear.py index fc0612468..7fafc3fe7 100644 --- a/tests/nn_tests/test_noisy_linear.py +++ b/tests/nn_tests/test_noisy_linear.py @@ -47,7 +47,7 @@ def _test_randomness(self, device): # In fact, (for each element _[i, j],) it holds: # \E[(y2 - y1) ** 2] = 2 * \Var(y) = (4 / pi) * sigma_scale ** 2 - target = (0.4 ** 2) * 2 + target = (0.4**2) * 2 if self.bias: target *= 2 / numpy.pi + numpy.sqrt(2 / numpy.pi) / y1.shape[1] else: diff --git a/tests/replay_buffers_test/test_replay_buffer.py b/tests/replay_buffers_test/test_replay_buffer.py index bf2b2b037..ba7962913 100644 --- a/tests/replay_buffers_test/test_replay_buffer.py +++ b/tests/replay_buffers_test/test_replay_buffer.py @@ -860,6 +860,6 @@ def test_batch_experiences(self): ) self.assertSequenceEqual( list(batch["discount"]), - list(np.asarray([0.99 ** 3, 0.99 ** 1, 0.99 ** 4], dtype=np.float32)), + list(np.asarray([0.99**3, 0.99**1, 0.99**4], dtype=np.float32)), ) self.assertSequenceEqual(list(batch["next_state"]), list(np.asarray([2, 1, 5]))) diff --git a/tests/test_action_value.py b/tests/test_action_value.py index 997bcf487..3dea81ae4 100644 --- a/tests/test_action_value.py +++ b/tests/test_action_value.py @@ -288,7 +288,7 @@ def setUp(self, batch_size, action_size, has_maximizer): def evaluator(actions): # negative square norm of actions - return -torch.sum(actions ** 2, dim=1) + return -torch.sum(actions**2, dim=1) self.evaluator = evaluator diff --git a/tests/utils_tests/test_pretrained_models.py b/tests/utils_tests/test_pretrained_models.py index 4d4eabd10..f80d89b1e 100644 --- 
a/tests/utils_tests/test_pretrained_models.py +++ b/tests/utils_tests/test_pretrained_models.py @@ -44,7 +44,7 @@ def _test_load_dqn(self, gpu): explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, - decay_steps=10 ** 6, + decay_steps=10**6, random_action_func=lambda: np.random.randint(4), ) @@ -56,7 +56,7 @@ def _test_load_dqn(self, gpu): gamma=0.99, explorer=explorer, replay_start_size=50, - target_update_interval=10 ** 4, + target_update_interval=10**4, clip_delta=True, update_interval=4, batch_accumulator="sum", @@ -115,7 +115,7 @@ def _test_load_iqn(self, gpu): explorer = explorers.LinearDecayEpsilonGreedy( start_epsilon=1.0, end_epsilon=0.1, - decay_steps=10 ** 6, + decay_steps=10**6, random_action_func=lambda: np.random.randint(4), ) @@ -127,7 +127,7 @@ def _test_load_iqn(self, gpu): gamma=0.99, explorer=explorer, replay_start_size=50, - target_update_interval=10 ** 4, + target_update_interval=10**4, update_interval=4, batch_accumulator="mean", phi=lambda x: x, @@ -163,7 +163,7 @@ def _test_load_rainbow(self, gpu): q_func = DistributionalDuelingDQN(4, 51, -10, 10) pnn.to_factorized_noisy(q_func, sigma_scale=0.5) explorer = explorers.Greedy() - opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10 ** -4) + opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4) rbuf = replay_buffers.ReplayBuffer(100) agent = agents.CategoricalDoubleDQN( q_func,
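
Note on the exponent edits above: every `10 ** 6`-style expression collapses to `10**6`, which is consistent with Black's power-operator spacing rule (the formatter is not named in this patch, so treat that attribution as an assumption): `**` hugs its operands when both are simple names, literals, or attribute accesses, and keeps its surrounding spaces otherwise, which is presumably why the context line `stdev = min(var, self.eps) ** 0.5` in `pfrl/utils/reward_filter.py` is left untouched. A minimal sketch of the convention, using made-up variable names:

```python
# Minimal sketch of the spacing convention applied throughout this patch
# (assumed to match Black's >= 22.1.0 behaviour): hug "**" only when both
# operands are simple (names, literals, attribute access, optionally with
# a unary sign).
gamma = 0.99
n = 3
capacity = 10**6         # literal operands: hugged
discount = gamma**n      # simple name operands: hugged
eps = 1.5 * 10**-4       # unary minus on a literal still counts as simple
tail = gamma ** (n + 1)  # composite operand: surrounding spaces are kept

print(capacity, discount, eps, tail)
```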
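
The one behavioural change visible in this section is the import fallback in `pfrl/wrappers/monitor.py`: when `gym.wrappers.Monitor` cannot be imported, a stub class is substituted that raises `RuntimeError` on instantiation, so importing `pfrl.wrappers` keeps working and the failure only surfaces when a `Monitor` is actually constructed. A hypothetical usage sketch (the helper name and call pattern below are illustrative, not part of the patch):

```python
import pfrl


def wrap_with_monitor_if_available(env, outdir):
    """Wrap env with pfrl.wrappers.Monitor, or return it unchanged when the
    installed gym no longer ships gym.wrappers.Monitor."""
    try:
        return pfrl.wrappers.Monitor(env, outdir)
    except RuntimeError:
        # Raised by the _Stub fallback in pfrl/wrappers/monitor.py when
        # gym.wrappers.Monitor is unavailable.
        return env
```

The stub-plus-`else` layout also keeps the `StatsRecorder` import out of the `try` body, so an `ImportError` raised there is not silently misread as a missing `Monitor`.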