Add action, reward and obs wrappers #311

Draft: wants to merge 4 commits into base branch main
2 changes: 1 addition & 1 deletion sheeprl/algos/a2c/a2c.py
@@ -373,7 +373,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.ppo.utils import log_models
6 changes: 4 additions & 2 deletions sheeprl/algos/a2c/utils.py
@@ -2,6 +2,7 @@

from typing import Any, Dict, Sequence

+import gymnasium as gym
import numpy as np
import torch
from lightning import Fabric
@@ -21,8 +22,9 @@ def prepare_obs(


@torch.no_grad()
-def test(agent: PPOPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str):
-    env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
+def test(agent: PPOPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str, env: gym.Env | None = None):
+    if env is None:
+        env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
    agent.eval()
    done = False
    cumulative_rew = 0
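Why pass the training env into test()? The diff itself does not say, but given the normalization wrappers added later in this PR, a plausible reason is that running statistics live on the env's wrappers, so a freshly built test env would start from empty statistics. Below is a self-contained sketch of that effect using plain gymnasium; Pendulum-v1 is only an illustrative choice and nothing in the snippet is sheeprl code.

import gymnasium as gym

# Train-time env: its NormalizeObservation wrapper accumulates running statistics on every step.
train_env = gym.wrappers.NormalizeObservation(gym.make("Pendulum-v1"))
train_env.reset(seed=0)
for _ in range(1000):
    _, _, terminated, truncated, _ = train_env.step(train_env.action_space.sample())
    if terminated or truncated:
        train_env.reset()

# A freshly built test env starts with empty statistics, so the same raw
# observation would be normalized differently than during training.
fresh_env = gym.wrappers.NormalizeObservation(gym.make("Pendulum-v1"))
fresh_env.reset(seed=0)
print(train_env.obs_rms.mean, fresh_env.obs_rms.mean)  # the two running means differ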
2 changes: 1 addition & 1 deletion sheeprl/algos/dreamer_v1/dreamer_v1.py
@@ -740,7 +740,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/dreamer_v2/dreamer_v2.py
@@ -782,7 +782,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
6 changes: 5 additions & 1 deletion sheeprl/algos/dreamer_v2/utils.py
@@ -124,6 +124,7 @@ def test(
    log_dir: str,
    test_name: str = "",
    greedy: bool = True,
+    env: gym.Env | gym.Wrapper | None = None,
):
    """Test the model on the environment with the frozen model.

@@ -136,8 +137,11 @@
            Default to "".
        greedy (bool): whether or not to sample actions.
            Default to True.
+        env (gym.Env | gym.Wrapper): the environment to test on.
+            Default to None.
    """
-    env: gym.Env = make_env(cfg, cfg.seed, 0, log_dir, "test" + (f"_{test_name}" if test_name != "" else ""))()
+    if env is None:
+        env: gym.Env = make_env(cfg, cfg.seed, 0, log_dir, "test" + (f"_{test_name}" if test_name != "" else ""))()
    done = False
    cumulative_rew = 0
    obs = env.reset(seed=cfg.seed)[0]
2 changes: 1 addition & 1 deletion sheeprl/algos/dreamer_v3/dreamer_v3.py
@@ -764,7 +764,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir, greedy=False)
+        test(player, fabric, cfg, log_dir, greedy=False, env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
6 changes: 5 additions & 1 deletion sheeprl/algos/dreamer_v3/utils.py
@@ -99,6 +99,7 @@ def test(
    log_dir: str,
    test_name: str = "",
    greedy: bool = True,
+    env: gym.Env | None = None,
):
    """Test the model on the environment with the frozen model.

@@ -111,8 +112,11 @@
            Default to "".
        greedy (bool): whether or not to sample the actions.
            Default to True.
+        env (gym.Env | gym.Wrapper): the environment to test on.
+            Default to None.
    """
-    env: gym.Env = make_env(cfg, cfg.seed, 0, log_dir, "test" + (f"_{test_name}" if test_name != "" else ""))()
+    if env is None:
+        env: gym.Env = make_env(cfg, cfg.seed, 0, log_dir, "test" + (f"_{test_name}" if test_name != "" else ""))()
    done = False
    cumulative_rew = 0
    obs = env.reset(seed=cfg.seed)[0]
2 changes: 1 addition & 1 deletion sheeprl/algos/droq/droq.py
@@ -426,7 +426,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.sac.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv1/p2e_dv1_exploration.py
@@ -784,7 +784,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
        player.actor_type = "task"
        fabric_player = get_single_device_fabric(fabric)
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "zero-shot")
+        test(player, fabric, cfg, log_dir, "zero-shot", env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv1/p2e_dv1_finetuning.py
@@ -431,7 +431,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any], exploration_cfg: Dict[str, Any]):
    if fabric.is_global_zero and cfg.algo.run_test:
        player.actor_type = "task"
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "few-shot")
+        test(player, fabric, cfg, log_dir, "few-shot", env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv2/p2e_dv2_exploration.py
@@ -939,7 +939,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
        player.actor_type = "task"
        fabric_player = get_single_device_fabric(fabric)
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "zero-shot")
+        test(player, fabric, cfg, log_dir, "zero-shot", env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv2/p2e_dv2_finetuning.py
@@ -459,7 +459,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any], exploration_cfg: Dict[str, Any]):
    if fabric.is_global_zero and cfg.algo.run_test:
        player.actor_type = "task"
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "few-shot")
+        test(player, fabric, cfg, log_dir, "few-shot", env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv3/p2e_dv3_exploration.py
@@ -1032,7 +1032,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):
        player.actor_type = "task"
        fabric_player = get_single_device_fabric(fabric)
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "zero-shot", greedy=False)
+        test(player, fabric, cfg, log_dir, "zero-shot", greedy=False, env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/p2e_dv3/p2e_dv3_finetuning.py
@@ -461,7 +461,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any], exploration_cfg: Dict[str, Any]):
    if fabric.is_global_zero and cfg.algo.run_test:
        player.actor_type = "task"
        player.actor = fabric_player.setup_module(unwrap_fabric(actor_task))
-        test(player, fabric, cfg, log_dir, "few-shot", greedy=False)
+        test(player, fabric, cfg, log_dir, "few-shot", greedy=False, env=envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.dreamer_v1.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/ppo/ppo.py
@@ -442,7 +442,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.ppo.utils import log_models
2 changes: 1 addition & 1 deletion sheeprl/algos/ppo/ppo_decoupled.py
@@ -355,7 +355,7 @@ def player(

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(agent, fabric, cfg, log_dir)
+        test(agent, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.ppo.utils import log_models
5 changes: 3 additions & 2 deletions sheeprl/algos/ppo/utils.py
@@ -36,8 +36,9 @@ def prepare_obs(


@torch.no_grad()
-def test(agent: PPOPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str):
-    env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
+def test(agent: PPOPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str, env: gym.Env | None = None):
+    if env is None:
+        env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
    agent.eval()
    done = False
    cumulative_rew = 0
2 changes: 1 addition & 1 deletion sheeprl/algos/ppo_recurrent/ppo_recurrent.py
@@ -515,7 +515,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.ppo.utils import log_models
5 changes: 3 additions & 2 deletions sheeprl/algos/ppo_recurrent/utils.py
@@ -39,8 +39,9 @@ def prepare_obs(


@torch.no_grad()
-def test(agent: "RecurrentPPOPlayer", fabric: Fabric, cfg: Dict[str, Any], log_dir: str):
-    env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
+def test(agent: "RecurrentPPOPlayer", fabric: Fabric, cfg: Dict[str, Any], log_dir: str, env: gym.Env | None = None):
+    if env is None:
+        env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
    agent.eval()
    done = False
    cumulative_rew = 0
5 changes: 3 additions & 2 deletions sheeprl/algos/sac/utils.py
@@ -37,8 +37,9 @@ def prepare_obs(


@torch.no_grad()
-def test(actor: SACPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str):
-    env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
+def test(actor: SACPlayer, fabric: Fabric, cfg: Dict[str, Any], log_dir: str, env: gym.Env | None = None):
+    if env is None:
+        env = make_env(cfg, None, 0, log_dir, "test", vector_env_idx=0)()
    actor.eval()
    done = False
    cumulative_rew = 0
2 changes: 1 addition & 1 deletion sheeprl/algos/sac_ae/sac_ae.py
@@ -492,7 +492,7 @@ def main(fabric: Fabric, cfg: Dict[str, Any]):

    envs.close()
    if fabric.is_global_zero and cfg.algo.run_test:
-        test(player, fabric, cfg, log_dir)
+        test(player, fabric, cfg, log_dir, envs.envs[0])

    if not cfg.model_manager.disabled and fabric.is_global_zero:
        from sheeprl.algos.sac_ae.utils import log_models
5 changes: 3 additions & 2 deletions sheeprl/algos/sac_ae/utils.py
@@ -40,8 +40,9 @@ def prepare_obs(


@torch.no_grad()
-def test(actor: "SACAEPlayer", fabric: Fabric, cfg: Dict[str, Any], log_dir: str):
-    env = make_env(cfg, cfg.seed, 0, log_dir, "test", vector_env_idx=0)()
+def test(actor: "SACAEPlayer", fabric: Fabric, cfg: Dict[str, Any], log_dir: str, env: gym.Env | None = None):
+    if env is None:
+        env = make_env(cfg, cfg.seed, 0, log_dir, "test", vector_env_idx=0)()
    actor.eval()
    done = False
    cumulative_rew = 0
5 changes: 5 additions & 0 deletions sheeprl/configs/env/default.yaml
@@ -5,8 +5,13 @@ sync_env: False
screen_size: 64
action_repeat: 1
grayscale: False
clip_actions: False
clip_rewards: False
clip_obs: False
clip_obs_range: null
capture_video: True
normalize_obs: False
normalize_rewards: False
frame_stack_dilation: 1
actions_as_observation:
  num_stack: -1
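The defaults above leave every new flag off. As a sketch only (the key names come from this diff; the values and the use of OmegaConf, which backs sheeprl's Hydra configs, are illustrative assumptions), an override enabling them might look like this:

from omegaconf import OmegaConf

# Illustrative values; each key maps to a wrapper applied in sheeprl/utils/env.py below.
env_overrides = OmegaConf.create(
    {
        "clip_actions": True,             # gym.wrappers.ClipAction
        "normalize_obs": True,            # NormalizeObservationWrapper
        "clip_obs": True,                 # element-wise np.clip on the encoder keys ...
        "clip_obs_range": [-10.0, 10.0],  # ... within this [low, high] range
        "normalize_rewards": True,        # gym.wrappers.NormalizeReward(gamma=cfg.algo.gamma)
    }
)
print(OmegaConf.to_yaml(env_overrides))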
51 changes: 51 additions & 0 deletions sheeprl/envs/wrappers.py
@@ -8,6 +8,7 @@
import gymnasium as gym
import numpy as np
from gymnasium.core import Env, RenderFrame
+from gymnasium.wrappers.normalize import NormalizeObservation, RunningMeanStd


class MaskVelocityWrapper(gym.ObservationWrapper):
@@ -340,3 +341,53 @@ def _get_actions_stack(self) -> np.ndarray:
        actions_stack = list(self._actions)[self._dilation - 1 :: self._dilation]
        actions = np.concatenate(actions_stack, axis=-1)
        return actions.astype(np.float32)
+
+
+class NormalizeObservationWrapper(NormalizeObservation):
+    """This wrapper will normalize observations s.t. each coordinate is centered with unit variance.
+
+    Note:
+        The normalization depends on past trajectories and observations
+        will not be normalized correctly if the wrapper was
+        newly instantiated or the policy was changed recently.
+    """
+
+    def __init__(self, env: gym.Env, epsilon: float = 1e-8):
+        """This wrapper will normalize observations s.t. each coordinate is centered with unit variance.
+
+        Args:
+            env (Env): The environment to apply the wrapper
+            epsilon: A stability parameter that is used when scaling the observations.
+        """
+        super().__init__(env, epsilon=epsilon)
+        self._is_dict_space = False
+        if isinstance(env.observation_space, gym.spaces.Dict):
+            self._is_dict_space = True
+            self.obs_rms = {
+                k: RunningMeanStd(shape=self.observation_space[k].shape) for k in self.observation_space.keys()
+            }
+
+    def step(self, action):
+        """Steps through the environment and normalizes the observation."""
+        if not self._is_dict_space:
+            return super().step(action)
+        obs, rews, terminateds, truncateds, infos = self.env.step(action)
+        obs = self.normalize(obs)
+        return obs, rews, terminateds, truncateds, infos
+
+    def reset(self, **kwargs):
+        """Resets the environment and normalizes the observation."""
+        if not self._is_dict_space:
+            return super().reset(**kwargs)
+        obs, info = self.env.reset(**kwargs)
+        return self.normalize(obs), info
+
+    def normalize(self, obs):
+        """Normalises the observation using the running mean and variance of the observations."""
+        if not self._is_dict_space:
+            return super().normalize(obs)
+        new_obs = {}
+        for k in self.observation_space.keys():
+            self.obs_rms[k].update(obs[k][np.newaxis])
+            new_obs[k] = ((obs[k][np.newaxis] - self.obs_rms[k].mean) / np.sqrt(self.obs_rms[k].var + self.epsilon))[0]
+        return new_obs
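For intuition, the dict branch of normalize() above boils down to a per-key standardization with running statistics. A standalone sketch of just that formula, with made-up values (not sheeprl code):

import numpy as np

def normalize_key(x: np.ndarray, mean: np.ndarray, var: np.ndarray, epsilon: float = 1e-8) -> np.ndarray:
    # Same expression as NormalizeObservationWrapper.normalize applies per key,
    # after RunningMeanStd has been updated with the new observation.
    return (x - mean) / np.sqrt(var + epsilon)

x = np.array([1.0, 2.0, 3.0])
print(normalize_key(x, mean=np.full(3, 2.0), var=np.ones(3)))  # approximately [-1.  0.  1.]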
30 changes: 29 additions & 1 deletion sheeprl/utils/env.py
@@ -1,6 +1,6 @@
import os
import warnings
-from typing import Any, Callable, Dict, Optional
+from typing import Any, Callable, Dict, Optional, Sequence

import cv2
import gymnasium as gym
@@ -13,6 +13,7 @@
    FrameStack,
    GrayscaleRenderWrapper,
    MaskVelocityWrapper,
+    NormalizeObservationWrapper,
    RewardAsObservationWrapper,
)
from sheeprl.utils.imports import _IS_DIAMBRA_ARENA_AVAILABLE, _IS_DIAMBRA_AVAILABLE, _IS_DMC_AVAILABLE
@@ -211,6 +212,33 @@ def transform_obs(obs: Dict[str, Any]):
        if cfg.env.actions_as_observation.num_stack > 0 and "diambra" not in cfg.env.wrapper._target_:
            env = ActionsAsObservationWrapper(env, **cfg.env.actions_as_observation)

+        if cfg.env.normalize_obs:
+            env = NormalizeObservationWrapper(env)
+
+        if cfg.env.clip_obs:
+            if (
+                isinstance(cfg.env.clip_obs_range, Sequence)
+                and not isinstance(cfg.env.clip_obs_range, str)
+                and len(cfg.env.clip_obs_range) != 2
+            ):
+                raise ValueError(
+                    f"clip_obs_range must be a sequence of length 2, got: {cfg.env.clip_obs_range} of type "
+                    f"{type(cfg.env.clip_obs_range)}"
+                )
+            env = gym.wrappers.TransformObservation(
+                env,
+                lambda obs: {
+                    k: np.clip(obs[k], cfg.env.clip_obs_range[0], cfg.env.clip_obs_range[1]) if k in obs else obs[k]
+                    for k in cfg.algo.mlp_keys.encoder + cfg.algo.cnn_keys.encoder
+                },
+            )
+
+        if cfg.env.clip_actions:
+            env = gym.wrappers.ClipAction(env)
+
+        if cfg.env.normalize_rewards:
+            env = gym.wrappers.NormalizeReward(env, gamma=cfg.algo.gamma)
+
        if cfg.env.reward_as_observation:
            env = RewardAsObservationWrapper(env)

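Taken together, for a flat Box-observation env with every flag enabled, the chain added above behaves roughly like the plain-gymnasium sketch below. Assumptions to note: sheeprl actually operates on dict observations keyed by cfg.algo.mlp_keys/cnn_keys, Pendulum-v1, the clip range and gamma are illustrative choices, and the two-argument TransformObservation call matches the pre-1.0 gymnasium signature used in this diff.

import gymnasium as gym
import numpy as np

env = gym.make("Pendulum-v1")
env = gym.wrappers.NormalizeObservation(env)            # cfg.env.normalize_obs
env = gym.wrappers.TransformObservation(                # cfg.env.clip_obs with cfg.env.clip_obs_range
    env, lambda obs: np.clip(obs, -10.0, 10.0)
)
env = gym.wrappers.ClipAction(env)                      # cfg.env.clip_actions
env = gym.wrappers.NormalizeReward(env, gamma=0.99)     # cfg.env.normalize_rewards

obs, info = env.reset(seed=42)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())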