From d5986da0353a282892edc6f594090d3373d234a2 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Thu, 16 Jul 2020 22:02:00 +0300
Subject: [PATCH 1/9] Fix storing correct episode dones

---
 stable_baselines3/common/base_class.py          | 2 ++
 stable_baselines3/common/on_policy_algorithm.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py
index 1ba7cc0ce..58e5cdb02 100644
--- a/stable_baselines3/common/base_class.py
+++ b/stable_baselines3/common/base_class.py
@@ -123,6 +123,7 @@ def __init__(
         self.tensorboard_log = tensorboard_log
         self.lr_schedule = None  # type: Optional[Callable]
         self._last_obs = None  # type: Optional[np.ndarray]
+        self._last_dones = None  # type: Optional[np.ndarray]
         # When using VecNormalize:
         self._last_original_obs = None  # type: Optional[np.ndarray]
         self._episode_num = 0
@@ -474,6 +475,7 @@ def _setup_learn(
         # Avoid resetting the environment when calling ``.learn()`` consecutive times
         if reset_num_timesteps or self._last_obs is None:
             self._last_obs = self.env.reset()
+            self._last_dones = np.zeros((self._last_obs.shape[0],), dtype=np.bool)
             # Retrieve unnormalized observation for saving into the buffer
             if self._vec_normalize_env is not None:
                 self._last_original_obs = self._vec_normalize_env.get_original_obs()
diff --git a/stable_baselines3/common/on_policy_algorithm.py b/stable_baselines3/common/on_policy_algorithm.py
index f84d18f34..8c9cb8b7e 100644
--- a/stable_baselines3/common/on_policy_algorithm.py
+++ b/stable_baselines3/common/on_policy_algorithm.py
@@ -173,8 +173,9 @@ def collect_rollouts(
             if isinstance(self.action_space, gym.spaces.Discrete):
                 # Reshape in case of discrete action
                 actions = actions.reshape(-1, 1)
-            rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
+            rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
             self._last_obs = new_obs
+            self._last_dones = dones

         rollout_buffer.compute_returns_and_advantage(values, dones=dones)


From a58fb991e4c34b18ccbef42d11c909e42a966662 Mon Sep 17 00:00:00 2001
From: Anssi 'Miffyli' Kanervisto
Date: Fri, 17 Jul 2020 01:38:43 +0300
Subject: [PATCH 2/9] Fix number of filters in NatureCNN network

---
 stable_baselines3/common/torch_layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable_baselines3/common/torch_layers.py b/stable_baselines3/common/torch_layers.py
index 9c74017cb..9429a86eb 100644
--- a/stable_baselines3/common/torch_layers.py
+++ b/stable_baselines3/common/torch_layers.py
@@ -74,7 +74,7 @@ def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
             nn.ReLU(),
             nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
             nn.ReLU(),
-            nn.Conv2d(64, 32, kernel_size=3, stride=1, padding=0),
+            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
             nn.ReLU(),
             nn.Flatten(),
         )

From bf3137325e0883d8d2af22fa76b4e3c56a470cee Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Sat, 25 Jul 2020 13:55:42 +0300
Subject: [PATCH 3/9] Add TF-like RMSprop for matching performance with sb2

---
 stable_baselines3/a2c/a2c.py              |  11 ++
 .../common/sb2_compat/__init__.py         |   0
 .../common/sb2_compat/rmsprop_tf_like.py  | 126 ++++++++++++++++++
 3 files changed, 137 insertions(+)
 create mode 100644 stable_baselines3/common/sb2_compat/__init__.py
 create mode 100644 stable_baselines3/common/sb2_compat/rmsprop_tf_like.py

diff --git a/stable_baselines3/a2c/a2c.py b/stable_baselines3/a2c/a2c.py
index c2c7b34e1..e438690f5 100644
--- a/stable_baselines3/a2c/a2c.py
+++ b/stable_baselines3/a2c/a2c.py
@@ -116,6 +116,7 @@ def train(self) -> None:
         # Update optimizer learning rate
         self._update_learning_rate(self.policy.optimizer)

+        # This will only loop once (get all data in one go)
         for rollout_data in self.rollout_buffer.get(batch_size=None):

             actions = rollout_data.actions
@@ -151,8 +152,17 @@ def train(self) -> None:
             self.policy.optimizer.zero_grad()
             loss.backward()

+            # Check gradient norm
+            grad_norm = 0
+            for p in self.policy.parameters():
+                param_norm = p.grad.data.norm(2)
+                grad_norm += param_norm.item() ** 2
+            import math as m
+            grad_norm = m.sqrt(grad_norm)
+
             # Clip grad norm
             th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
+
             self.policy.optimizer.step()

         explained_var = explained_variance(self.rollout_buffer.returns.flatten(), self.rollout_buffer.values.flatten())
@@ -163,6 +173,7 @@ def train(self) -> None:
         logger.record("train/entropy_loss", entropy_loss.item())
         logger.record("train/policy_loss", policy_loss.item())
         logger.record("train/value_loss", value_loss.item())
+        logger.record("train/grad_norm", grad_norm)
         if hasattr(self.policy, "log_std"):
             logger.record("train/std", th.exp(self.policy.log_std).mean().item())

diff --git a/stable_baselines3/common/sb2_compat/__init__.py b/stable_baselines3/common/sb2_compat/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py b/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py
new file mode 100644
index 000000000..2ae8890c8
--- /dev/null
+++ b/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py
@@ -0,0 +1,126 @@
+import torch
+from torch.optim import Optimizer
+
+
+class RMSpropTFLike(Optimizer):
+    r"""Implements RMSprop algorithm with closer match to Tensorflow version.
+
+    For reproducibility with original stable-baselines. Use this
+    version with e.g. A2C for stabler learning than with the PyTorch
+    RMSProp. Based on the PyTorch v1.5.0 implementation of RMSprop.
+
+    See a more throughout conversion in pytorch-image-models repository:
+    https://github.com/rwightman/pytorch-image-models/blob/master/timm/optim/rmsprop_tf.py
+
+    Changes to the original RMSprop:
+        - Move epsilon inside square root
+        - Initialize squared gradient to ones rather than zeros
+
+    Proposed by G. Hinton in his
+    `course `_.
+
+    The centered version first appears in `Generating Sequences
+    With Recurrent Neural Networks `_.
+
+    The implementation here takes the square root of the gradient average before
+    adding epsilon (note that TensorFlow interchanges these two operations). The effective
+    learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha`
+    is the scheduled learning rate and :math:`v` is the weighted moving average
+    of the squared gradient.
+
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-2)
+        momentum (float, optional): momentum factor (default: 0)
+        alpha (float, optional): smoothing constant (default: 0.99)
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-8)
+        centered (bool, optional) : if ``True``, compute the centered RMSProp,
+            the gradient is normalized by an estimation of its variance
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+
+    """
+
+    def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= momentum:
+            raise ValueError("Invalid momentum value: {}".format(momentum))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0.0 <= alpha:
+            raise ValueError("Invalid alpha value: {}".format(alpha))
+
+        defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay)
+        super(RMSpropTFLike, self).__init__(params, defaults)
+
+    def __setstate__(self, state):
+        super(RMSpropTFLike, self).__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('momentum', 0)
+            group.setdefault('centered', False)
+
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad
+                if grad.is_sparse:
+                    raise RuntimeError('RMSpropTF does not support sparse gradients')
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # PyTorch initialized to zeros here
+                    state['square_avg'] = torch.ones_like(p, memory_format=torch.preserve_format)
+                    if group['momentum'] > 0:
+                        state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if group['centered']:
+                        state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+
+                square_avg = state['square_avg']
+                alpha = group['alpha']
+
+                state['step'] += 1
+
+                if group['weight_decay'] != 0:
+                    grad = grad.add(p, alpha=group['weight_decay'])
+
+                square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)
+
+                if group['centered']:
+                    grad_avg = state['grad_avg']
+                    grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
+                    # PyTorch added epsilon after square root
+                    # avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(group['eps'])
+                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add_(group['eps']).sqrt_()
+                else:
+                    # PyTorch added epsilon after square root
+                    # avg = square_avg.sqrt().add_(group['eps'])
+                    avg = square_avg.add(group['eps']).sqrt_()
+
+                if group['momentum'] > 0:
+                    buf = state['momentum_buffer']
+                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
+                    p.add_(buf, alpha=-group['lr'])
+                else:
+                    p.addcdiv_(grad, avg, value=-group['lr'])
+
+        return loss

From be19c7121106e2584a674d6a7992878b516ca504 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Sat, 25 Jul 2020 14:13:37 +0300
Subject: [PATCH 4/9] Remove stuff that was accidentally included

---
 stable_baselines3/a2c/a2c.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/stable_baselines3/a2c/a2c.py b/stable_baselines3/a2c/a2c.py
index e438690f5..ecf51f700 100644
--- a/stable_baselines3/a2c/a2c.py
+++ b/stable_baselines3/a2c/a2c.py
@@ -152,17 +152,8 @@ def train(self) -> None:
             self.policy.optimizer.zero_grad()
             loss.backward()

-            # Check gradient norm
-            grad_norm = 0
-            for p in self.policy.parameters():
-                param_norm = p.grad.data.norm(2)
-                grad_norm += param_norm.item() ** 2
-            import math as m
-            grad_norm = m.sqrt(grad_norm)
-
             # Clip grad norm
             th.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
-
             self.policy.optimizer.step()

         explained_var = explained_variance(self.rollout_buffer.returns.flatten(), self.rollout_buffer.values.flatten())
@@ -173,7 +164,6 @@ def train(self) -> None:
         logger.record("train/entropy_loss", entropy_loss.item())
         logger.record("train/policy_loss", policy_loss.item())
         logger.record("train/value_loss", value_loss.item())
-        logger.record("train/grad_norm", grad_norm)
         if hasattr(self.policy, "log_std"):
             logger.record("train/std", th.exp(self.policy.log_std).mean().item())


From e078c43313eb1a24809405c0c14594d61b68abf0 Mon Sep 17 00:00:00 2001
From: Antonin RAFFIN
Date: Thu, 30 Jul 2020 10:03:55 +0200
Subject: [PATCH 5/9] Reformat

---
 .../common/sb2_compat/rmsprop_tf_like.py | 48 +++++++++----------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py b/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py
index 2ae8890c8..46ef4b06f 100644
--- a/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py
+++ b/stable_baselines3/common/sb2_compat/rmsprop_tf_like.py
@@ -60,8 +60,8 @@ def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, moment
     def __setstate__(self, state):
         super(RMSpropTFLike, self).__setstate__(state)
         for group in self.param_groups:
-            group.setdefault('momentum', 0)
-            group.setdefault('centered', False)
+            group.setdefault("momentum", 0)
+            group.setdefault("centered", False)

     @torch.no_grad()
     def step(self, closure=None):
@@ -77,50 +77,50 @@ def step(self, closure=None):
                 loss = closure()

         for group in self.param_groups:
-            for p in group['params']:
+            for p in group["params"]:
                 if p.grad is None:
                     continue
                 grad = p.grad
                 if grad.is_sparse:
-                    raise RuntimeError('RMSpropTF does not support sparse gradients')
+                    raise RuntimeError("RMSpropTF does not support sparse gradients")
                 state = self.state[p]

                 # State initialization
                 if len(state) == 0:
-                    state['step'] = 0
+                    state["step"] = 0
                     # PyTorch initialized to zeros here
-                    state['square_avg'] = torch.ones_like(p, memory_format=torch.preserve_format)
-                    if group['momentum'] > 0:
-                        state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                    if group['centered']:
-                        state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    state["square_avg"] = torch.ones_like(p, memory_format=torch.preserve_format)
+                    if group["momentum"] > 0:
+                        state["momentum_buffer"] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if group["centered"]:
+                        state["grad_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format)

-                square_avg = state['square_avg']
-                alpha = group['alpha']
+                square_avg = state["square_avg"]
+                alpha = group["alpha"]

-                state['step'] += 1
+                state["step"] += 1

-                if group['weight_decay'] != 0:
-                    grad = grad.add(p, alpha=group['weight_decay'])
+                if group["weight_decay"] != 0:
+                    grad = grad.add(p, alpha=group["weight_decay"])

                 square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha)

-                if group['centered']:
-                    grad_avg = state['grad_avg']
+                if group["centered"]:
+                    grad_avg = state["grad_avg"]
                     grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha)
                     # PyTorch added epsilon after square root
                     # avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(group['eps'])
-                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add_(group['eps']).sqrt_()
+                    avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add_(group["eps"]).sqrt_()
                 else:
                     # PyTorch added epsilon after square root
                     # avg = square_avg.sqrt().add_(group['eps'])
-                    avg = square_avg.add(group['eps']).sqrt_()
+                    avg = square_avg.add(group["eps"]).sqrt_()

-                if group['momentum'] > 0:
-                    buf = state['momentum_buffer']
-                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
-                    p.add_(buf, alpha=-group['lr'])
+                if group["momentum"] > 0:
+                    buf = state["momentum_buffer"]
+                    buf.mul_(group["momentum"]).addcdiv_(grad, avg)
+                    p.add_(buf, alpha=-group["lr"])
                 else:
-                    p.addcdiv_(grad, avg, value=-group['lr'])
+                    p.addcdiv_(grad, avg, value=-group["lr"])

         return loss

From 407096f3f88d0675d5fe8ae86c52d8044b137c97 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Mon, 3 Aug 2020 20:05:55 +0300
Subject: [PATCH 6/9] Clarify variable naming

---
 stable_baselines3/common/base_class.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable_baselines3/common/base_class.py b/stable_baselines3/common/base_class.py
index 58e5cdb02..a3de8cef8 100644
--- a/stable_baselines3/common/base_class.py
+++ b/stable_baselines3/common/base_class.py
@@ -475,7 +475,7 @@ def _setup_learn(
         # Avoid resetting the environment when calling ``.learn()`` consecutive times
         if reset_num_timesteps or self._last_obs is None:
             self._last_obs = self.env.reset()
-            self._last_dones = np.zeros((self._last_obs.shape[0],), dtype=np.bool)
+            self._last_dones = np.zeros((self.env.num_envs,), dtype=np.bool)
             # Retrieve unnormalized observation for saving into the buffer
             if self._vec_normalize_env is not None:
                 self._last_original_obs = self._vec_normalize_env.get_original_obs()

From b28da6ed21742718999cdf0167d472c577304af2 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Mon, 3 Aug 2020 20:10:54 +0300
Subject: [PATCH 7/9] Update changelog

---
 docs/misc/changelog.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
index 16df2ab1a..a8f2619b0 100644
--- a/docs/misc/changelog.rst
+++ b/docs/misc/changelog.rst
@@ -34,6 +34,8 @@ Bug Fixes:
 - Fixed a bug with orthogonal initialization when `bias=False` in custom policy (@rk37)
 - Fixed approximate entropy calculation in PPO and A2C. (@andyshih12)
 - Fixed DQN target network sharing feature extractor with the main network.
+- Fixed storing correct ``dones`` in on-policy algorithm rollout collection. (@andyshih12)
+- Fixed number of filters in final convolutional layer in NatureCNN to match original implementation.

 Deprecations:
 ^^^^^^^^^^^^^
@@ -49,6 +51,7 @@ Others:
 - Ignored errors from newer pytype version
 - Added a check when using ``gSDE``
 - Removed codacy dependency from Dockerfile
+- Added ``common.sb2_compat.RMSpropTFLike`` optimizer, which corresponds closer to the implementation of RMSprop from Tensorflow.

 Documentation:
 ^^^^^^^^^^^^^^

From 3f3916cbea0486792ce951ad7f0383229250be60 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Mon, 3 Aug 2020 22:53:54 +0300
Subject: [PATCH 8/9] Add comment on RMSprop implementations to A2C

---
 docs/modules/a2c.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/modules/a2c.rst b/docs/modules/a2c.rst
index 141fa0bb1..460d1a6e3 100644
--- a/docs/modules/a2c.rst
+++ b/docs/modules/a2c.rst
@@ -10,6 +10,14 @@ A synchronous, deterministic variant of `Asynchronous Advantage Actor Critic (A3
 It uses multiple workers to avoid the use of a replay buffer.


+.. warning::
+
+  If you find training unstable or want to match performance of stable-baselines A2C, consider using
+  ``RMSpropTFLike`` optimizer from ``stable_baselines3.common.sb2_compat.rmsprop_tf_like``.
+  You can change optimizer with ``A2C(policy_kwargs=dict(optimizer_class=RMSpropTFLike))``.
+  Read more `here `_.
+
+
 Notes
 -----


From 85f96aa4b74a424cf7bc6e0cdf857f42a73e2470 Mon Sep 17 00:00:00 2001
From: "Anssi \"Miffyli\" Kanervisto"
Date: Mon, 3 Aug 2020 22:58:21 +0300
Subject: [PATCH 9/9] Add test for RMSpropTFLike

---
 tests/test_custom_policy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_custom_policy.py b/tests/test_custom_policy.py
index cd379126c..c1e08dfac 100644
--- a/tests/test_custom_policy.py
+++ b/tests/test_custom_policy.py
@@ -2,6 +2,7 @@
 import torch as th

 from stable_baselines3 import A2C, PPO, SAC, TD3
+from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike


 @pytest.mark.parametrize(
@@ -32,3 +33,8 @@ def test_custom_offpolicy(model_class, net_arch):
 def test_custom_optimizer(model_class, optimizer_kwargs):
     policy_kwargs = dict(optimizer_class=th.optim.AdamW, optimizer_kwargs=optimizer_kwargs, net_arch=[32])
     _ = model_class("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs).learn(1000)
+
+
+def test_tf_like_rmsprop_optimizer():
+    policy_kwargs = dict(optimizer_class=RMSpropTFLike, net_arch=[32])
+    _ = A2C("MlpPolicy", "Pendulum-v0", policy_kwargs=policy_kwargs).learn(1000)