diff --git a/README.md b/README.md
index de9eeaa4..37ed345f 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         SheepRL v0.4.0
         SheepRL v0.4.9
         SheepRL v0.5.2 (Numpy Buffers)
+        SheepRL v0.5.5 (Numpy Buffers)
         StableBaselines3¹
@@ -101,6 +102,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         192.31s ± 1.11
         138.3s ± 0.16
         80.81s ± 0.68
+        81.27s ± 0.47
         77.21s ± 0.36
@@ -108,6 +110,24 @@ The training times of our implementations compared to the ones of Stable Baselin
         85.42s ± 2.27
         59.53s ± 0.78
         46.09s ± 0.59
+        36.88s ± 0.30
+        N.D.
+
+
+        A2C
+        1 device
+        N.D.
+        N.D.
+        N.D.
+        84.76s ± 0.37
+        84.22s ± 0.99
+
+
+        2 devices
+        N.D.
+        N.D.
+        N.D.
+        28.95s ± 0.75
         N.D.
@@ -116,6 +136,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         421.37s ± 5.27
         363.74s ± 3.44
         318.06s ± 4.46
+        320.21s ± 6.29
         336.06s ± 12.26
@@ -123,6 +144,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         264.29s ± 1.81
         238.88s ± 4.97
         210.07s ± 27
+        225.95s ± 3.65
         N.D.
@@ -131,6 +153,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         4201.23s
         N.D.
         2921.38s
+        2207.13s
         N.D.
@@ -139,6 +162,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         1874.62s
         N.D.
         1148.1s
+        906.42s
         N.D.
@@ -147,6 +171,7 @@ The training times of our implementations compared to the ones of Stable Baselin
         2022.99s
         N.D.
         1378.01s
+        1589.30s
         N.D.
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 714c6657..e59b443d 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -17,6 +17,17 @@
 #     "algo.per_rank_batch_size=128"
 # ]
 
+# A2C Arguments
+# args = [
+#     os.path.join(ROOT_DIR, "__main__.py"),
+#     "exp=a2c_benchmarks",
+#     # Decomment below to run with 2 devices
+#     # "fabric.devices=2",
+#     # "env.num_envs=2",
+#     # "algo.per_rank_batch_size=10",
+#     # "algo.rollout_steps=20",
+# ]
+
 # SAC Arguments
 # args = [
 #     os.path.join(ROOT_DIR, "__main__.py"),
diff --git a/benchmarks/benchmark_sb3.py b/benchmarks/benchmark_sb3.py
index e7bfe251..5cf92b1c 100644
--- a/benchmarks/benchmark_sb3.py
+++ b/benchmarks/benchmark_sb3.py
@@ -1,6 +1,6 @@
 import gymnasium as gym
 import stable_baselines3 as sb3
-from stable_baselines3 import PPO, SAC  # noqa: F401
+from stable_baselines3 import A2C, PPO, SAC  # noqa: F401
 from torchmetrics import SumMetric
 
 from sheeprl.utils.timer import timer
@@ -15,6 +15,18 @@
     print(sb3.common.evaluation.evaluate_policy(model.policy, env))
 
 
+# Stable Baselines3 - A2C - CartPole-v1
+# Decomment below to run A2C benchmarks
+
+# if __name__ == "__main__":
+#     with timer("run_time", SumMetric, sync_on_compute=False):
+#         env = gym.make("CartPole-v1", render_mode="rgb_array")
+#         model = A2C("MlpPolicy", env, verbose=0, device="cpu", vf_coef=1.0)
+#         model.learn(total_timesteps=1024 * 64, log_interval=None)
+#     print(timer.compute())
+#     print(sb3.common.evaluation.evaluate_policy(model.policy, env))
+
+
 # Stable Baselines3 SAC - LunarLanderContinuous-v2
 # Decomment below to run SAC benchmarks
 
@@ -23,7 +35,7 @@
 #     env = sb3.common.vec_env.DummyVecEnv(
 #         [lambda: gym.make("LunarLanderContinuous-v2", render_mode="rgb_array") for _ in range(4)]
 #     )
-#     model = SAC("MlpPolicy", env, verbose=0, device="cpu", ent_coef=1.0)
+#     model = SAC("MlpPolicy", env, verbose=0, device="cpu")
 #     model.learn(total_timesteps=1024 * 64, log_interval=None)
 #     print(timer.compute())
 #     print(sb3.common.evaluation.evaluate_policy(model.policy, env.envs[0]))
diff --git a/examples/ratio.py b/examples/ratio.py
index 03712916..d005c9f5 100644
--- a/examples/ratio.py
+++ b/examples/ratio.py
@@ -1,55 +1,9 @@
-import warnings
-from typing import Any, Dict, Mapping
-
-
-class Ratio:
"""Directly taken from Hafner et al. (2023) implementation: - https://github.com/danijar/dreamerv3/blob/8fa35f83eee1ce7e10f3dee0b766587d0a713a60/dreamerv3/embodied/core/when.py#L26 - """ - - def __init__(self, ratio: float, pretrain_steps: int = 0): - if pretrain_steps < 0: - raise ValueError(f"'pretrain_steps' must be non-negative, got {pretrain_steps}") - if ratio < 0: - raise ValueError(f"'ratio' must be non-negative, got {ratio}") - self._pretrain_steps = pretrain_steps - self._ratio = ratio - self._prev = None - - def __call__(self, step: int) -> int: - if self._ratio == 0: - return 0 - if self._prev is None: - self._prev = step - repeats = 1 - if self._pretrain_steps > 0: - if step < self._pretrain_steps: - warnings.warn( - "The number of pretrain steps is greater than the number of current steps. This could lead to " - f"a higher ratio than the one specified ({self._ratio}). Setting the 'pretrain_steps' equal to " - "the number of current steps." - ) - self._pretrain_steps = step - repeats = round(self._pretrain_steps * self._ratio) - return repeats - repeats = round((step - self._prev) * self._ratio) - self._prev += repeats / self._ratio - return repeats - - def state_dict(self) -> Dict[str, Any]: - return {"_ratio": self._ratio, "_prev": self._prev, "_pretrain_steps": self._pretrain_steps} - - def load_state_dict(self, state_dict: Mapping[str, Any]): - self._ratio = state_dict["_ratio"] - self._prev = state_dict["_prev"] - self._pretrain_steps = state_dict["_pretrain_steps"] - return self - +from sheeprl.utils.utils import Ratio if __name__ == "__main__": num_envs = 1 world_size = 1 - replay_ratio = 0.5 + replay_ratio = 0.0625 per_rank_batch_size = 16 per_rank_sequence_length = 64 replayed_steps = world_size * per_rank_batch_size * per_rank_sequence_length @@ -62,7 +16,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any]): for i in range(0, total_policy_steps, policy_steps): if i >= 128: per_rank_repeats = r(i / world_size) - if per_rank_repeats > 0 and not printed: + if per_rank_repeats > 0: # and not printed: print( f"Training the agent with {per_rank_repeats} repeats on every rank " f"({per_rank_repeats * world_size} global repeats) at global iteration {i}" diff --git a/sheeprl/algos/a2c/a2c.py b/sheeprl/algos/a2c/a2c.py index 6d0cdfd4..07241c80 100644 --- a/sheeprl/algos/a2c/a2c.py +++ b/sheeprl/algos/a2c/a2c.py @@ -358,7 +358,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): if ( (cfg.checkpoint.every > 0 and policy_step - last_checkpoint >= cfg.checkpoint.every) or cfg.dry_run - or update == num_updates + or (update == num_updates and cfg.checkpoint.save_last) ): last_checkpoint = policy_step state = { @@ -370,7 +370,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]): fabric.call("on_checkpoint_coupled", fabric=fabric, ckpt_path=ckpt_path, state=state) envs.close() - if fabric.is_global_zero: + if fabric.is_global_zero and cfg.algo.run_test: test(player, fabric, cfg, log_dir) if not cfg.model_manager.disabled and fabric.is_global_zero: diff --git a/sheeprl/algos/a2c/agent.py b/sheeprl/algos/a2c/agent.py index 98ae1882..a63dd22d 100644 --- a/sheeprl/algos/a2c/agent.py +++ b/sheeprl/algos/a2c/agent.py @@ -30,7 +30,7 @@ def __init__( super().__init__() self.keys = keys self.input_dim = input_dim - self.output_dim = features_dim + self.output_dim = features_dim if features_dim else dense_units self.model = MLP( input_dim, features_dim, @@ -96,18 +96,22 @@ def __init__( ) # Actor - actor_backbone = MLP( - input_dims=features_dim, - output_dim=None, - 
-            hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
-            activation=hydra.utils.get_class(actor_cfg.dense_act),
-            flatten_dim=None,
-            norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
-            norm_args=(
-                [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
-                if actor_cfg.layer_norm
-                else None
-            ),
+        actor_backbone = (
+            MLP(
+                input_dims=features_dim,
+                output_dim=None,
+                hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
+                activation=hydra.utils.get_class(actor_cfg.dense_act),
+                flatten_dim=None,
+                norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
+                norm_args=(
+                    [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
+                    if actor_cfg.layer_norm
+                    else None
+                ),
+            )
+            if actor_cfg.mlp_layers > 0
+            else nn.Identity()
         )
         if is_continuous:
             # Output is a tuple of two elements: mean and log_std, one for every action
diff --git a/sheeprl/algos/sac/sac.py b/sheeprl/algos/sac/sac.py
index 4560c28d..774754fc 100644
--- a/sheeprl/algos/sac/sac.py
+++ b/sheeprl/algos/sac/sac.py
@@ -297,7 +297,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
 
         # Train the agent
         if update >= learning_starts:
-            per_rank_gradient_steps = ratio(policy_step / world_size)
+            per_rank_gradient_steps = ratio(policy_step / world_size) if not cfg.run_benchmarks else 1
             if per_rank_gradient_steps > 0:
                 # We sample one time to reduce the communications between processes
                 sample = rb.sample_tensors(
diff --git a/sheeprl/configs/exp/a2c_benchmarks.yaml b/sheeprl/configs/exp/a2c_benchmarks.yaml
new file mode 100644
index 00000000..48379234
--- /dev/null
+++ b/sheeprl/configs/exp/a2c_benchmarks.yaml
@@ -0,0 +1,59 @@
+# @package _global_
+
+defaults:
+  - override /algo: a2c
+  - override /env: gym
+  - _self_
+
+# Environment
+env:
+  capture_video: False
+  num_envs: 1
+  sync_env: True
+
+# Algorithm
+algo:
+  name: a2c
+  rollout_steps: 5
+  loss_reduction: mean
+  normalize_advantages: False
+  max_grad_norm: 0.5
+  encoder:
+    mlp_layers: 2
+    mlp_features_dim: null
+  actor:
+    mlp_layers: 0
+  critic:
+    mlp_layers: 0
+  optimizer:
+    lr: 7e-4
+    eps: 1e-5
+    alpha: 0.99
+  per_rank_batch_size: 5
+  # # If you want to run this benchmark with older versions,
+  # you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
+  run_test: False
+  # If you want to run this benchmark with older versions,
+  # you need to move the `total_steps` and the `mlp_keys` config from `algo` to the root.
+  total_steps: 65536
+  mlp_keys:
+    encoder: [state]
+
+# Buffer
+buffer:
+  share_data: False
+  size: ${algo.rollout_steps}
+  memmap: False
+
+fabric:
+  devices: 1
+  accelerator: cpu
+
+checkpoint:
+  every: 70000
+  save_last: False
+
+metric:
+  log_every: 70000
+  log_level: 0
+  disable_timer: True
\ No newline at end of file
diff --git a/sheeprl/configs/exp/default.yaml b/sheeprl/configs/exp/default.yaml
index e69de29b..e3e743da 100644
--- a/sheeprl/configs/exp/default.yaml
+++ b/sheeprl/configs/exp/default.yaml
@@ -0,0 +1 @@
+run_benchmarks: False
\ No newline at end of file
diff --git a/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
index 12f29b1f..d170b20f 100644
--- a/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
   - override /env: atari
   - _self_
 
-# Experiment
-seed: 5
-
 # Environment
 env:
   num_envs: 1
@@ -26,6 +23,7 @@ buffer:
 # Algorithm
 algo:
   learning_starts: 1024
+  replay_ratio: 0.0625
   dense_units: 8
   mlp_layers: 1
 
diff --git a/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
index cfa2977a..e5d237f3 100644
--- a/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
   - override /env: atari
   - _self_
 
-# Experiment
-seed: 5
-
 # Environment
 env:
   num_envs: 1
@@ -26,10 +23,11 @@ buffer:
 
 # Algorithm
 algo:
   learning_starts: 1024
-  per_rank_pretrain_steps: 1
+  per_rank_pretrain_steps: 0
+  replay_ratio: 0.0625
   dense_units: 8
-  mlp_layers:
+  mlp_layers: 1
   world_model:
     discrete_size: 4
     stochastic_size: 4
diff --git a/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
index e10dfd96..d787375b 100644
--- a/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
   - override /env: atari
   - _self_
 
-# Experiment
-seed: 5
-
 # Environment
 env:
   num_envs: 1
@@ -26,7 +23,7 @@ buffer:
 # Algorithm
 algo:
   learning_starts: 1024
-  replay_ratio: 1
+  replay_ratio: 0.0625
   dense_units: 8
   mlp_layers: 1
   world_model:
diff --git a/sheeprl/configs/exp/ppo_benchmarks.yaml b/sheeprl/configs/exp/ppo_benchmarks.yaml
index 9f3cf40e..be54afdd 100644
--- a/sheeprl/configs/exp/ppo_benchmarks.yaml
+++ b/sheeprl/configs/exp/ppo_benchmarks.yaml
@@ -14,9 +14,11 @@ env:
 # Algorithm
 algo:
   name: ppo
+  vf_coef: 0.5
+  clip_vloss: False
+  max_grad_norm: 0.5
   rollout_steps: 128
   normalize_advantages: True
-  max_grad_norm: 0.5
   encoder:
     mlp_features_dim: null
   actor:
@@ -24,7 +26,8 @@ algo:
   critic:
     mlp_layers: 0
   optimizer:
-    lr: 1e-3
+    lr: 3e-4
+    eps: 1e-5
   per_rank_batch_size: 64
   # # If you want to run this benchmark with older versions,
   # you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
diff --git a/sheeprl/configs/exp/sac_benchmarks.yaml b/sheeprl/configs/exp/sac_benchmarks.yaml
index b3ce9a7d..43e08945 100644
--- a/sheeprl/configs/exp/sac_benchmarks.yaml
+++ b/sheeprl/configs/exp/sac_benchmarks.yaml
@@ -1,28 +1,34 @@
 # @package _global_
 
 defaults:
+  - default
   - override /algo: sac
   - override /env: gym
   - _self_
 
+run_benchmarks: True
+
 # Environment
 env:
   id: LunarLanderContinuous-v2
   capture_video: False
-  num_envs: 8
+  num_envs: 4
 
 # Algorithm
 algo:
   name: sac
   learning_starts: 100
-  per_rank_batch_size: 512
+  per_rank_batch_size: 256
   # # If you want to run this benchmark with older versions,
   # you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
   run_test: False
   # If you want to run this benchmark with older versions,
   # you need to move the `total_steps` and the `mlp_keys` config from `algo` to the root.
   total_steps: 65536
+  optimizer:
+    lr: 3e-4
+    eps: 1e-5
   mlp_keys:
     encoder: [state]
 
@@ -34,7 +40,7 @@ buffer:
   size: 65537
 
 fabric:
-  devices: 2
+  devices: 1
   accelerator: cpu
 
 checkpoint:
diff --git a/sheeprl/utils/utils.py b/sheeprl/utils/utils.py
index a66f5d84..f8719708 100644
--- a/sheeprl/utils/utils.py
+++ b/sheeprl/utils/utils.py
@@ -286,9 +286,9 @@ def __call__(self, step: int) -> int:
                         "the number of current steps."
                     )
                     self._pretrain_steps = step
-                repeats = round(self._pretrain_steps * self._ratio)
+                repeats = int(self._pretrain_steps * self._ratio)
             return repeats
-        repeats = round((step - self._prev) * self._ratio)
+        repeats = int((step - self._prev) * self._ratio)
         self._prev += repeats / self._ratio
         return repeats
 
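
Below is a minimal usage sketch, not part of the patch itself: it exercises the Ratio scheduler from sheeprl.utils.utils (whose rounding this patch switches from round() to int()) with the replay_ratio of 0.0625 used by the updated benchmark configs. The per-iteration policy-step count and the loop bounds are illustrative assumptions; the 128-step threshold mirrors examples/ratio.py.

# Sketch only: count the gradient steps that the Ratio helper schedules
# for a given replay ratio. Values are illustrative, not from the patch.
from sheeprl.utils.utils import Ratio

if __name__ == "__main__":
    replay_ratio = 0.0625           # gradient steps per policy step (as in the benchmark configs)
    policy_steps_per_iteration = 4  # e.g. 4 environments stepped once per iteration (assumed)
    learning_starts = 128           # same threshold used in examples/ratio.py

    r = Ratio(ratio=replay_ratio, pretrain_steps=0)
    total_gradient_steps = 0
    for step in range(0, 1024, policy_steps_per_iteration):
        if step >= learning_starts:
            # With this patch, repeats is truncated with int() instead of round(),
            # so partial repeats accumulate until a full gradient step is due.
            total_gradient_steps += r(step)
    print(f"{total_gradient_steps} gradient steps scheduled over 1024 policy steps")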