diff --git a/README.md b/README.md
index de9eeaa4..37ed345f 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,7 @@ The training times of our implementations compared to the ones of Stable Baselin
SheepRL v0.4.0 |
SheepRL v0.4.9 |
SheepRL v0.5.2 (Numpy Buffers) |
+ SheepRL v0.5.5 (Numpy Buffers) |
StableBaselines31 |
@@ -101,6 +102,7 @@ The training times of our implementations compared to the ones of Stable Baselin
192.31s ± 1.11 |
138.3s ± 0.16 |
80.81s ± 0.68 |
+ 81.27s ± 0.47 |
77.21s ± 0.36 |
@@ -108,6 +110,24 @@ The training times of our implementations compared to the ones of Stable Baselin
85.42s ± 2.27 |
59.53s ± 0.78 |
46.09s ± 0.59 |
+ 36.88s ± 0.30 |
+ N.D. |
+
+
+ A2C |
+ 1 device |
+ N.D. |
+ N.D. |
+ N.D. |
+ 84.76s ± 0.37 |
+ 84.22s ± 0.99 |
+
+
+ 2 devices |
+ N.D. |
+ N.D. |
+ N.D. |
+ 28.95s ± 0.75 |
N.D. |
@@ -116,6 +136,7 @@ The training times of our implementations compared to the ones of Stable Baselin
421.37s ± 5.27 |
363.74s ± 3.44 |
318.06s ± 4.46 |
+ 320.21s ± 6.29 |
336.06s ± 12.26 |
@@ -123,6 +144,7 @@ The training times of our implementations compared to the ones of Stable Baselin
264.29s ± 1.81 |
238.88s ± 4.97 |
210.07s ± 27 |
+ 225.95s ± 3.65 |
N.D. |
@@ -131,6 +153,7 @@ The training times of our implementations compared to the ones of Stable Baselin
4201.23s |
N.D. |
2921.38s |
+ 2207.13s |
N.D. |
@@ -139,6 +162,7 @@ The training times of our implementations compared to the ones of Stable Baselin
1874.62s |
N.D. |
1148.1s |
+ 906.42s |
N.D. |
@@ -147,6 +171,7 @@ The training times of our implementations compared to the ones of Stable Baselin
2022.99s |
N.D. |
1378.01s |
+ 1589.30s |
N.D. |
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
index 714c6657..e59b443d 100644
--- a/benchmarks/benchmark.py
+++ b/benchmarks/benchmark.py
@@ -17,6 +17,17 @@
# "algo.per_rank_batch_size=128"
]
+ # A2C Arguments
+ # args = [
+ # os.path.join(ROOT_DIR, "__main__.py"),
+ # "exp=a2c_benchmarks",
+ # # Decomment below to run with 2 devices
+ # # "fabric.devices=2",
+ # # "env.num_envs=2",
+ # # "algo.per_rank_batch_size=10",
+ # # "algo.rollout_steps=20",
+ # ]
+
# SAC Arguments
# args = [
# os.path.join(ROOT_DIR, "__main__.py"),
diff --git a/benchmarks/benchmark_sb3.py b/benchmarks/benchmark_sb3.py
index e7bfe251..5cf92b1c 100644
--- a/benchmarks/benchmark_sb3.py
+++ b/benchmarks/benchmark_sb3.py
@@ -1,6 +1,6 @@
import gymnasium as gym
import stable_baselines3 as sb3
-from stable_baselines3 import PPO, SAC # noqa: F401
+from stable_baselines3 import A2C, PPO, SAC # noqa: F401
from torchmetrics import SumMetric
from sheeprl.utils.timer import timer
@@ -15,6 +15,18 @@
print(sb3.common.evaluation.evaluate_policy(model.policy, env))
+# Stable Baselines3 - A2C - CartPole-v1
+# Decomment below to run A2C benchmarks
+
+# if __name__ == "__main__":
+# with timer("run_time", SumMetric, sync_on_compute=False):
+# env = gym.make("CartPole-v1", render_mode="rgb_array")
+# model = A2C("MlpPolicy", env, verbose=0, device="cpu", vf_coef=1.0)
+# model.learn(total_timesteps=1024 * 64, log_interval=None)
+# print(timer.compute())
+# print(sb3.common.evaluation.evaluate_policy(model.policy, env))
+
+
# Stable Baselines3 SAC - LunarLanderContinuous-v2
# Decomment below to run SAC benchmarks
@@ -23,7 +35,7 @@
# env = sb3.common.vec_env.DummyVecEnv(
# [lambda: gym.make("LunarLanderContinuous-v2", render_mode="rgb_array") for _ in range(4)]
# )
-# model = SAC("MlpPolicy", env, verbose=0, device="cpu", ent_coef=1.0)
+# model = SAC("MlpPolicy", env, verbose=0, device="cpu")
# model.learn(total_timesteps=1024 * 64, log_interval=None)
# print(timer.compute())
# print(sb3.common.evaluation.evaluate_policy(model.policy, env.envs[0]))
diff --git a/examples/ratio.py b/examples/ratio.py
index 03712916..d005c9f5 100644
--- a/examples/ratio.py
+++ b/examples/ratio.py
@@ -1,55 +1,9 @@
-import warnings
-from typing import Any, Dict, Mapping
-
-
-class Ratio:
- """Directly taken from Hafner et al. (2023) implementation:
- https://github.com/danijar/dreamerv3/blob/8fa35f83eee1ce7e10f3dee0b766587d0a713a60/dreamerv3/embodied/core/when.py#L26
- """
-
- def __init__(self, ratio: float, pretrain_steps: int = 0):
- if pretrain_steps < 0:
- raise ValueError(f"'pretrain_steps' must be non-negative, got {pretrain_steps}")
- if ratio < 0:
- raise ValueError(f"'ratio' must be non-negative, got {ratio}")
- self._pretrain_steps = pretrain_steps
- self._ratio = ratio
- self._prev = None
-
- def __call__(self, step: int) -> int:
- if self._ratio == 0:
- return 0
- if self._prev is None:
- self._prev = step
- repeats = 1
- if self._pretrain_steps > 0:
- if step < self._pretrain_steps:
- warnings.warn(
- "The number of pretrain steps is greater than the number of current steps. This could lead to "
- f"a higher ratio than the one specified ({self._ratio}). Setting the 'pretrain_steps' equal to "
- "the number of current steps."
- )
- self._pretrain_steps = step
- repeats = round(self._pretrain_steps * self._ratio)
- return repeats
- repeats = round((step - self._prev) * self._ratio)
- self._prev += repeats / self._ratio
- return repeats
-
- def state_dict(self) -> Dict[str, Any]:
- return {"_ratio": self._ratio, "_prev": self._prev, "_pretrain_steps": self._pretrain_steps}
-
- def load_state_dict(self, state_dict: Mapping[str, Any]):
- self._ratio = state_dict["_ratio"]
- self._prev = state_dict["_prev"]
- self._pretrain_steps = state_dict["_pretrain_steps"]
- return self
-
+from sheeprl.utils.utils import Ratio
if __name__ == "__main__":
num_envs = 1
world_size = 1
- replay_ratio = 0.5
+ replay_ratio = 0.0625
per_rank_batch_size = 16
per_rank_sequence_length = 64
replayed_steps = world_size * per_rank_batch_size * per_rank_sequence_length
@@ -62,7 +16,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any]):
for i in range(0, total_policy_steps, policy_steps):
if i >= 128:
per_rank_repeats = r(i / world_size)
- if per_rank_repeats > 0 and not printed:
+ if per_rank_repeats > 0: # and not printed:
print(
f"Training the agent with {per_rank_repeats} repeats on every rank "
f"({per_rank_repeats * world_size} global repeats) at global iteration {i}"
diff --git a/sheeprl/algos/a2c/a2c.py b/sheeprl/algos/a2c/a2c.py
index 6d0cdfd4..07241c80 100644
--- a/sheeprl/algos/a2c/a2c.py
+++ b/sheeprl/algos/a2c/a2c.py
@@ -358,7 +358,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
if (
(cfg.checkpoint.every > 0 and policy_step - last_checkpoint >= cfg.checkpoint.every)
or cfg.dry_run
- or update == num_updates
+ or (update == num_updates and cfg.checkpoint.save_last)
):
last_checkpoint = policy_step
state = {
@@ -370,7 +370,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
fabric.call("on_checkpoint_coupled", fabric=fabric, ckpt_path=ckpt_path, state=state)
envs.close()
- if fabric.is_global_zero:
+ if fabric.is_global_zero and cfg.algo.run_test:
test(player, fabric, cfg, log_dir)
if not cfg.model_manager.disabled and fabric.is_global_zero:
diff --git a/sheeprl/algos/a2c/agent.py b/sheeprl/algos/a2c/agent.py
index 98ae1882..a63dd22d 100644
--- a/sheeprl/algos/a2c/agent.py
+++ b/sheeprl/algos/a2c/agent.py
@@ -30,7 +30,7 @@ def __init__(
super().__init__()
self.keys = keys
self.input_dim = input_dim
- self.output_dim = features_dim
+ self.output_dim = features_dim if features_dim else dense_units
self.model = MLP(
input_dim,
features_dim,
@@ -96,18 +96,22 @@ def __init__(
)
# Actor
- actor_backbone = MLP(
- input_dims=features_dim,
- output_dim=None,
- hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
- activation=hydra.utils.get_class(actor_cfg.dense_act),
- flatten_dim=None,
- norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
- norm_args=(
- [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
- if actor_cfg.layer_norm
- else None
- ),
+ actor_backbone = (
+ MLP(
+ input_dims=features_dim,
+ output_dim=None,
+ hidden_sizes=[actor_cfg.dense_units] * actor_cfg.mlp_layers,
+ activation=hydra.utils.get_class(actor_cfg.dense_act),
+ flatten_dim=None,
+ norm_layer=[nn.LayerNorm] * actor_cfg.mlp_layers if actor_cfg.layer_norm else None,
+ norm_args=(
+ [{"normalized_shape": actor_cfg.dense_units} for _ in range(actor_cfg.mlp_layers)]
+ if actor_cfg.layer_norm
+ else None
+ ),
+ )
+ if actor_cfg.mlp_layers > 0
+ else nn.Identity()
)
if is_continuous:
# Output is a tuple of two elements: mean and log_std, one for every action
diff --git a/sheeprl/algos/sac/sac.py b/sheeprl/algos/sac/sac.py
index 4560c28d..774754fc 100644
--- a/sheeprl/algos/sac/sac.py
+++ b/sheeprl/algos/sac/sac.py
@@ -297,7 +297,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
# Train the agent
if update >= learning_starts:
- per_rank_gradient_steps = ratio(policy_step / world_size)
+ per_rank_gradient_steps = ratio(policy_step / world_size) if not cfg.run_benchmarks else 1
if per_rank_gradient_steps > 0:
# We sample one time to reduce the communications between processes
sample = rb.sample_tensors(
diff --git a/sheeprl/configs/exp/a2c_benchmarks.yaml b/sheeprl/configs/exp/a2c_benchmarks.yaml
new file mode 100644
index 00000000..48379234
--- /dev/null
+++ b/sheeprl/configs/exp/a2c_benchmarks.yaml
@@ -0,0 +1,59 @@
+# @package _global_
+
+defaults:
+ - override /algo: a2c
+ - override /env: gym
+ - _self_
+
+# Environment
+env:
+ capture_video: False
+ num_envs: 1
+ sync_env: True
+
+# Algorithm
+algo:
+ name: a2c
+ rollout_steps: 5
+ loss_reduction: mean
+ normalize_advantages: False
+ max_grad_norm: 0.5
+ encoder:
+ mlp_layers: 2
+ mlp_features_dim: null
+ actor:
+ mlp_layers: 0
+ critic:
+ mlp_layers: 0
+ optimizer:
+ lr: 7e-4
+ eps: 1e-5
+ alpha: 0.99
+ per_rank_batch_size: 5
+ # If you want to run this benchmark with older versions,
# you need to comment the test function in the `./sheeprl/algos/a2c/a2c.py` file.
+ run_test: False
+ # If you want to run this benchmark with older versions,
+ # you need to move the `total_steps` and the `mlp_keys` config from `algo` to the root.
+ total_steps: 65536
+ mlp_keys:
+ encoder: [state]
+
+# Buffer
+buffer:
+ share_data: False
+ size: ${algo.rollout_steps}
+ memmap: False
+
+fabric:
+ devices: 1
+ accelerator: cpu
+
+checkpoint:
+ every: 70000
+ save_last: False
+
+metric:
+ log_every: 70000
+ log_level: 0
+ disable_timer: True
\ No newline at end of file
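A possible way to launch this configuration outside of `benchmarks/benchmark.py` (usage sketch only; it assumes a local checkout where `python -m sheeprl` resolves to `sheeprl/__main__.py`, and it mirrors the Hydra overrides listed in the benchmark script above):

```python
# Usage sketch, not part of the patch: launch the A2C benchmark with the same
# Hydra overrides added to benchmarks/benchmark.py.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "-m", "sheeprl",
        "exp=a2c_benchmarks",
        # Add the overrides below to reproduce the 2-device run
        # "fabric.devices=2", "env.num_envs=2",
        # "algo.per_rank_batch_size=10", "algo.rollout_steps=20",
    ],
    check=True,
)
```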
diff --git a/sheeprl/configs/exp/default.yaml b/sheeprl/configs/exp/default.yaml
index e69de29b..e3e743da 100644
--- a/sheeprl/configs/exp/default.yaml
+++ b/sheeprl/configs/exp/default.yaml
@@ -0,0 +1 @@
+run_benchmarks: False
\ No newline at end of file
diff --git a/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
index 12f29b1f..d170b20f 100644
--- a/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v1_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
- override /env: atari
- _self_
-# Experiment
-seed: 5
-
# Environment
env:
num_envs: 1
@@ -26,6 +23,7 @@ buffer:
# Algorithm
algo:
learning_starts: 1024
+ replay_ratio: 0.0625
dense_units: 8
mlp_layers: 1
diff --git a/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
index cfa2977a..e5d237f3 100644
--- a/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v2_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
- override /env: atari
- _self_
-# Experiment
-seed: 5
-
# Environment
env:
num_envs: 1
@@ -26,10 +23,11 @@ buffer:
# Algorithm
algo:
learning_starts: 1024
- per_rank_pretrain_steps: 1
+ per_rank_pretrain_steps: 0
+ replay_ratio: 0.0625
dense_units: 8
- mlp_layers:
+ mlp_layers: 1
world_model:
discrete_size: 4
stochastic_size: 4
diff --git a/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml b/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
index e10dfd96..d787375b 100644
--- a/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
+++ b/sheeprl/configs/exp/dreamer_v3_benchmarks.yaml
@@ -5,9 +5,6 @@ defaults:
- override /env: atari
- _self_
-# Experiment
-seed: 5
-
# Environment
env:
num_envs: 1
@@ -26,7 +23,7 @@ buffer:
# Algorithm
algo:
learning_starts: 1024
- replay_ratio: 1
+ replay_ratio: 0.0625
dense_units: 8
mlp_layers: 1
world_model:
diff --git a/sheeprl/configs/exp/ppo_benchmarks.yaml b/sheeprl/configs/exp/ppo_benchmarks.yaml
index 9f3cf40e..be54afdd 100644
--- a/sheeprl/configs/exp/ppo_benchmarks.yaml
+++ b/sheeprl/configs/exp/ppo_benchmarks.yaml
@@ -14,9 +14,11 @@ env:
# Algorithm
algo:
name: ppo
+ vf_coef: 0.5
+ clip_vloss: False
+ max_grad_norm: 0.5
rollout_steps: 128
normalize_advantages: True
- max_grad_norm: 0.5
encoder:
mlp_features_dim: null
actor:
@@ -24,7 +26,8 @@ algo:
critic:
mlp_layers: 0
optimizer:
- lr: 1e-3
+ lr: 3e-4
+ eps: 1e-5
per_rank_batch_size: 64
# # If you want to run this benchmark with older versions,
# you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
diff --git a/sheeprl/configs/exp/sac_benchmarks.yaml b/sheeprl/configs/exp/sac_benchmarks.yaml
index b3ce9a7d..43e08945 100644
--- a/sheeprl/configs/exp/sac_benchmarks.yaml
+++ b/sheeprl/configs/exp/sac_benchmarks.yaml
@@ -1,28 +1,34 @@
# @package _global_
defaults:
+ - default
- override /algo: sac
- override /env: gym
- _self_
+run_benchmarks: True
+
# Environment
env:
id: LunarLanderContinuous-v2
capture_video: False
- num_envs: 8
+ num_envs: 4
# Algorithm
algo:
name: sac
learning_starts: 100
- per_rank_batch_size: 512
+ per_rank_batch_size: 256
# # If you want to run this benchmark with older versions,
# you need to comment the test function in the `./sheeprl/algos/ppo/ppo.py` file.
run_test: False
# If you want to run this benchmark with older versions,
# you need to move the `total_steps` and the `mlp_keys` config from `algo` to the root.
total_steps: 65536
+ optimizer:
+ lr: 3e-4
+ eps: 1e-5
mlp_keys:
encoder: [state]
@@ -34,7 +40,7 @@ buffer:
size: 65537
fabric:
- devices: 2
+ devices: 1
accelerator: cpu
checkpoint:
diff --git a/sheeprl/utils/utils.py b/sheeprl/utils/utils.py
index a66f5d84..f8719708 100644
--- a/sheeprl/utils/utils.py
+++ b/sheeprl/utils/utils.py
@@ -286,9 +286,9 @@ def __call__(self, step: int) -> int:
"the number of current steps."
)
self._pretrain_steps = step
- repeats = round(self._pretrain_steps * self._ratio)
+ repeats = int(self._pretrain_steps * self._ratio)
return repeats
- repeats = round((step - self._prev) * self._ratio)
+ repeats = int((step - self._prev) * self._ratio)
self._prev += repeats / self._ratio
return repeats
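The switch from `round` to `int` above only changes how fractional repeat counts are handled; a small standalone illustration with assumed values (not part of the patch):

```python
# Truncating with int() never schedules more repeats than ratio * new_steps,
# while round() can overshoot by one on half-way values.
ratio = 0.0625
for new_steps in (8, 16, 24, 56):
    print(new_steps, round(new_steps * ratio), int(new_steps * ratio))
# new_steps=24 -> round() gives 2, int() gives 1; new_steps=56 -> 4 vs 3
```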