From d53832aa8f407236da61f03eef6522d694fbf60e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 07:08:50 +0200 Subject: [PATCH 01/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 54 ++++++++------------- rllib/algorithms/impala/impala.py | 21 +++----- rllib/algorithms/impala/impala_tf_policy.py | 40 +++++---------- rllib/evaluation/postprocessing.py | 38 ++++++++++----- rllib/policy/sample_batch.py | 9 ++-- 5 files changed, 71 insertions(+), 91 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 8441f8032ede..1869388f6795 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -144,7 +145,6 @@ def loss( is_multidiscrete = False output_hidden_shape = 1 - # TODO: (sven) deprecate this when trajectory view API gets activated. def make_time_major(*args, **kw): return _make_time_major( self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw @@ -159,12 +159,14 @@ def make_time_major(*args, **kw): prev_action_dist = dist_class(behaviour_logits, self.model) values = self.model.value_function() values_time_major = make_time_major(values) + bootstrap_value = train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + bootstrap_value = make_time_major(bootstrap_value)[-1] if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) mask = tf.reshape(mask, [-1]) - mask = make_time_major(mask, drop_last=self.config["vtrace"]) + mask = make_time_major(mask) def reduce_mean_valid(t): return tf.reduce_mean(tf.boolean_mask(t, mask)) @@ -173,11 +175,7 @@ def reduce_mean_valid(t): reduce_mean_valid = tf.reduce_mean if self.config["vtrace"]: - drop_last = self.config["vtrace_drop_last_ts"] - logger.debug( - "Using V-Trace surrogate loss (vtrace=True; " - f"drop_last={drop_last})" - ) + logger.debug("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss. 
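            # (For a simple Discrete action space, an extra axis is added here so
            # the actions tensor has the same rank as in the multi-discrete case
            # and can be unstacked uniformly further below.)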
loss_actions = ( @@ -189,7 +187,7 @@ def reduce_mean_valid(t): # Prepare KL for Loss mean_kl = make_time_major( - old_policy_action_dist.multi_kl(action_dist), drop_last=drop_last + old_policy_action_dist.multi_kl(action_dist) ) unpacked_behaviour_logits = tf.split( @@ -203,26 +201,22 @@ def reduce_mean_valid(t): with tf.device("/cpu:0"): vtrace_returns = vtrace.multi_from_logits( behaviour_policy_logits=make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), target_policy_logits=make_time_major( - unpacked_old_policy_behaviour_logits, drop_last=drop_last + unpacked_old_policy_behaviour_logits ), actions=tf.unstack( - make_time_major(loss_actions, drop_last=drop_last), axis=2 + make_time_major(loss_actions), axis=2 ), discounts=tf.cast( - ~make_time_major( - tf.cast(dones, tf.bool), drop_last=drop_last - ), + ~make_time_major(tf.cast(dones, tf.bool)), tf.float32, ) * self.config["gamma"], - rewards=make_time_major(rewards, drop_last=drop_last), - values=values_time_major[:-1] - if drop_last - else values_time_major, - bootstrap_value=values_time_major[-1], + rewards=make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -233,14 +227,10 @@ def reduce_mean_valid(t): ), ) - actions_logp = make_time_major( - action_dist.logp(actions), drop_last=drop_last - ) - prev_actions_logp = make_time_major( - prev_action_dist.logp(actions), drop_last=drop_last - ) + actions_logp = make_time_major(action_dist.logp(actions)) + prev_actions_logp = make_time_major(prev_action_dist.logp(actions)) old_policy_actions_logp = make_time_major( - old_policy_action_dist.logp(actions), drop_last=drop_last + old_policy_action_dist.logp(actions) ) is_ratio = tf.clip_by_value( @@ -267,17 +257,12 @@ def reduce_mean_valid(t): mean_policy_loss = -reduce_mean_valid(surrogate_loss) # The value function loss. - if drop_last: - delta = values_time_major[:-1] - vtrace_returns.vs - else: - delta = values_time_major - vtrace_returns.vs + delta = values_time_major - vtrace_returns.vs value_targets = vtrace_returns.vs mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. - actions_entropy = make_time_major( - action_dist.multi_entropy(), drop_last=True - ) + actions_entropy = make_time_major(action_dist.multi_entropy()) mean_entropy = reduce_mean_valid(actions_entropy) else: @@ -353,7 +338,6 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: self, train_batch.get(SampleBatch.SEQ_LENS), self.model.value_function(), - drop_last=self.config["vtrace"] and self.config["vtrace_drop_last_ts"], ) stats_dict = { @@ -392,6 +376,8 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 4403936aab15..3990d36b20ad 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -112,11 +112,6 @@ def __init__(self, algo_class=None): self.vtrace = True self.vtrace_clip_rho_threshold = 1.0 self.vtrace_clip_pg_rho_threshold = 1.0 - # TODO (sven): Deprecate this setting. It makes no sense to drop the last ts. - # It's actually dangerous if there are important rewards "hiding" in that ts. 
- # This setting is already ignored (always False) on the new Learner API - # (if _enable_learner_api=True). - self.vtrace_drop_last_ts = False self.num_multi_gpu_tower_stacks = 1 self.minibatch_buffer_size = 1 self.num_sgd_iter = 1 @@ -171,6 +166,7 @@ def __init__(self, algo_class=None): # Deprecated value. self.num_data_loader_buffers = DEPRECATED_VALUE + self.vtrace_drop_last_ts = DEPRECATED_VALUE @override(AlgorithmConfig) def training( @@ -179,7 +175,6 @@ def training( vtrace: Optional[bool] = NotProvided, vtrace_clip_rho_threshold: Optional[float] = NotProvided, vtrace_clip_pg_rho_threshold: Optional[float] = NotProvided, - vtrace_drop_last_ts: Optional[bool] = NotProvided, gamma: Optional[float] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, @@ -206,6 +201,8 @@ def training( _separate_vf_optimizer: Optional[bool] = NotProvided, _lr_vf: Optional[float] = NotProvided, after_train_step: Optional[Callable[[dict], None]] = NotProvided, + # deprecated. + vtrace_drop_last_ts=None, **kwargs, ) -> "ImpalaConfig": """Sets the training related configuration. @@ -214,13 +211,6 @@ def training( vtrace: V-trace params (see vtrace_tf/torch.py). vtrace_clip_rho_threshold: vtrace_clip_pg_rho_threshold: - vtrace_drop_last_ts: If True, drop the last timestep for the vtrace - calculations, such that all data goes into the calculations as [B x T-1] - (+ the bootstrap value). This is the default and legacy RLlib behavior, - however, could potentially have a destabilizing effect on learning, - especially in sparse reward or reward-at-goal environments. - False for not dropping the last timestep. - System params. gamma: Float specifying the discount factor of the Markov Decision process. num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many slots should we reserve for parallel data loading? Set this to >1 to @@ -293,6 +283,9 @@ def training( Returns: This updated AlgorithmConfig object. """ + if vtrace_drop_last_ts is not None: + deprecation_warning(old="vtrace_drop_last_ts", error=False) + # Pass kwargs onto super's `training()` method. super().training(**kwargs) @@ -302,8 +295,6 @@ def training( self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold if vtrace_clip_pg_rho_threshold is not NotProvided: self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold - if vtrace_drop_last_ts is not NotProvided: - self.vtrace_drop_last_ts = vtrace_drop_last_ts if num_multi_gpu_tower_stacks is not NotProvided: self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks if minibatch_buffer_size is not NotProvided: diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index d8b830ef7653..74cb92f80149 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -131,14 +131,13 @@ def __init__( self.total_loss += self.vf_loss * vf_loss_coeff -def _make_time_major(policy, seq_lens, tensor, drop_last=False): +def _make_time_major(policy, seq_lens, tensor): """Swaps batch and trajectory axis. Args: policy: Policy reference seq_lens: Sequence lengths if recurrent or None tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last trajectory item. Returns: @@ -146,7 +145,7 @@ def _make_time_major(policy, seq_lens, tensor, drop_last=False): swapped axes. 
""" if isinstance(tensor, list): - return [_make_time_major(policy, seq_lens, t, drop_last) for t in tensor] + return [_make_time_major(policy, seq_lens, t) for t in tensor] if policy.is_recurrent(): B = tf.shape(seq_lens)[0] @@ -163,8 +162,6 @@ def _make_time_major(policy, seq_lens, tensor, drop_last=False): # swap B and T axes res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] return res @@ -363,30 +360,21 @@ def make_time_major(*args, **kw): ) # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. - drop_last = self.config["vtrace_drop_last_ts"] self.vtrace_loss = VTraceLoss( - actions=make_time_major(loss_actions, drop_last=drop_last), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=drop_last - ), - actions_entropy=make_time_major( - action_dist.multi_entropy(), drop_last=drop_last - ), - dones=make_time_major(dones, drop_last=drop_last), - behaviour_action_logp=make_time_major( - behaviour_action_logp, drop_last=drop_last - ), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=drop_last - ), - target_logits=make_time_major(unpacked_outputs, drop_last=drop_last), + actions=make_time_major(loss_actions), + actions_logp=make_time_major(action_dist.logp(actions)), + actions_entropy=make_time_major(action_dist.multi_entropy()), + dones=make_time_major(dones), + behaviour_action_logp=make_time_major(behaviour_action_logp), + behaviour_logits=make_time_major(unpacked_behaviour_logits), + target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], - rewards=make_time_major(rewards, drop_last=drop_last), - values=make_time_major(values, drop_last=drop_last), - bootstrap_value=make_time_major(values)[-1], + rewards=make_time_major(rewards), + values=make_time_major(values), + bootstrap_value=make_time_major(values)[-1],#TODO dist_class=Categorical if is_multidiscrete else dist_class, model=model, - valid_mask=make_time_major(mask, drop_last=drop_last), + valid_mask=make_time_major(mask), config=self.config, vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.entropy_coeff, @@ -401,12 +389,10 @@ def make_time_major(*args, **kw): @override(base) def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"] values_batched = _make_time_major( self, train_batch.get(SampleBatch.SEQ_LENS), self.model.value_function(), - drop_last=drop_last, ) return { diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 21355cefd455..09ee8cf13e3c 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -179,10 +179,29 @@ def compute_gae_for_sample_batch( Returns: The postprocessed, modified SampleBatch (or a new one). """ + sample_batch = compute_bootstrap_value(sample_batch, policy) + # Adds the policy logits, VF preds, and advantages to the batch, + # using GAE ("generalized advantage estimation") or not. + batch = compute_advantages( + rollout=sample_batch, + last_r=sample_batch[SampleBatch.VALUES_BOOTSTRAPPED][-1], + gamma=policy.config["gamma"], + lambda_=policy.config["lambda"], + use_gae=policy.config["use_gae"], + use_critic=policy.config.get("use_critic", True), + ) + + return batch + + +@DeveloperAPI +def compute_bootstrap_value(sample_batch, policy): + """TODO (sven): docstr """ # Trajectory is actually complete -> last r=0.0. 
if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 + # Trajectory has been truncated -> last r=VF estimate of last obs. else: # Input dict is provided to us automatically via the Model's @@ -192,7 +211,6 @@ def compute_gae_for_sample_batch( input_dict = sample_batch.get_single_step_input_dict( policy.model.view_requirements, index="last" ) - if policy.config.get("_enable_rl_module_api"): # Note: During sampling you are using the parameters at the beginning of # the sampling process. If I'll be using this advantages during training @@ -217,18 +235,14 @@ def compute_gae_for_sample_batch( else: last_r = policy._value(**input_dict) - # Adds the policy logits, VF preds, and advantages to the batch, - # using GAE ("generalized advantage estimation") or not. - batch = compute_advantages( - sample_batch, - last_r, - policy.config["gamma"], - policy.config["lambda"], - use_gae=policy.config["use_gae"], - use_critic=policy.config.get("use_critic", True), - ) + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all zeros, except for the + # very last timestep (where this bootstrapping value is actually needed), which + # we set to the computed `last_r`. + values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) + values_bootstrapped[-1] = last_r + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped - return batch + return sample_batch @DeveloperAPI diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index ccc0ca0147e7..3d7e5b3af18a 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -150,6 +150,10 @@ class SampleBatch(dict): # Value function predictions emitted by the behaviour policy. VF_PREDS = "vf_preds" + # Values one ts beyond the last ts taken. These are usually calculated via the value + # function network using the final observation (and in case of an RNN: the last + # returned internal state). + VALUES_BOOTSTRAPPED = "values_bootstrapped" # RE 3 # This is only computed and used when RE3 exploration strategy is enabled. @@ -161,9 +165,8 @@ class SampleBatch(dict): # Deprecated keys: - # SampleBatches must already not be constructed anymore by setting this key - # directly. Instead, the values under this key are auto-computed via the values of - # the new TERMINATEDS and TRUNCATEDS keys. + # Do not set this key directly. Instead, the values under this key are + # auto-computed via the values of the TERMINATEDS and TRUNCATEDS keys. DONES = "dones" # Use SampleBatch.OBS instead. 
CUR_OBS = "obs" From faab512f5831025143ce2f9865d3be7a4f0588fb Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 10:03:41 +0200 Subject: [PATCH 02/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_torch_policy.py | 41 ++++++++----------- .../algorithms/impala/impala_torch_policy.py | 32 ++++++--------- rllib/algorithms/impala/tf/vtrace_tf_v2.py | 7 +--- .../impala/torch/vtrace_torch_v2.py | 7 +--- 4 files changed, 31 insertions(+), 56 deletions(-) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 4a7754830f32..6f30d32edfce 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -158,13 +158,11 @@ def _make_time_major(*args, **kwargs): values = model.value_function() values_time_major = _make_time_major(values) - drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"] - if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) mask = torch.reshape(mask, [-1]) - mask = _make_time_major(mask, drop_last=drop_last) + mask = _make_time_major(mask) num_valid = torch.sum(mask) def reduce_mean_valid(t): @@ -175,7 +173,7 @@ def reduce_mean_valid(t): if self.config["vtrace"]: logger.debug( - "Using V-Trace surrogate loss (vtrace=True; " f"drop_last={drop_last})" + "Using V-Trace surrogate loss (vtrace=True)" ) old_policy_behaviour_logits = target_model_out.detach() @@ -202,26 +200,24 @@ def reduce_mean_valid(t): ) # Prepare KL for loss. - action_kl = _make_time_major( - old_policy_action_dist.kl(action_dist), drop_last=drop_last - ) + action_kl = _make_time_major(old_policy_action_dist.kl(action_dist)) # Compute vtrace on the CPU for better perf. vtrace_returns = vtrace.multi_from_logits( behaviour_policy_logits=_make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), target_policy_logits=_make_time_major( - unpacked_old_policy_behaviour_logits, drop_last=drop_last + unpacked_old_policy_behaviour_logits ), actions=torch.unbind( - _make_time_major(loss_actions, drop_last=drop_last), dim=2 + _make_time_major(loss_actions), dim=2 ), - discounts=(1.0 - _make_time_major(dones, drop_last=drop_last).float()) + discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], - rewards=_make_time_major(rewards, drop_last=drop_last), - values=values_time_major[:-1] if drop_last else values_time_major, - bootstrap_value=values_time_major[-1], + rewards=_make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_values[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -229,13 +225,13 @@ def reduce_mean_valid(t): ) actions_logp = _make_time_major( - action_dist.logp(actions), drop_last=drop_last + action_dist.logp(actions) ) prev_actions_logp = _make_time_major( - prev_action_dist.logp(actions), drop_last=drop_last + prev_action_dist.logp(actions) ) old_policy_actions_logp = _make_time_major( - old_policy_action_dist.logp(actions), drop_last=drop_last + old_policy_action_dist.logp(actions) ) is_ratio = torch.clamp( torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 @@ -259,15 +255,12 @@ def reduce_mean_valid(t): # The value function loss. 
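            # (I.e., 0.5 x the mean squared difference between the value
            # function outputs and the v-trace value targets `vs`; for
            # recurrent policies, padded timesteps are masked out of the mean.)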
value_targets = vtrace_returns.vs.to(values_time_major.device) - if drop_last: - delta = values_time_major[:-1] - value_targets - else: - delta = values_time_major - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. mean_entropy = reduce_mean_valid( - _make_time_major(action_dist.entropy(), drop_last=drop_last) + _make_time_major(action_dist.entropy()) ) else: @@ -323,9 +316,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape( - values_time_major[:-1] if drop_last else values_time_major, [-1] - ), + torch.reshape(values_time_major, [-1]), ) return total_loss diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 71aed0320601..3cc6fefeffc0 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -126,22 +126,20 @@ def __init__( ) -def make_time_major(policy, seq_lens, tensor, drop_last=False): +def make_time_major(policy, seq_lens, tensor): """Swaps batch and trajectory axis. Args: policy: Policy reference seq_lens: Sequence lengths if recurrent or None tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. """ if isinstance(tensor, (list, tuple)): - return [make_time_major(policy, seq_lens, t, drop_last) for t in tensor] + return [make_time_major(policy, seq_lens, t) for t in tensor] if policy.is_recurrent(): B = seq_lens.shape[0] @@ -158,8 +156,6 @@ def make_time_major(policy, seq_lens, tensor, drop_last=False): # Swap B and T axes. res = torch.transpose(rs, 1, 0) - if drop_last: - return res[:-1] return res @@ -277,30 +273,29 @@ def _make_time_major(*args, **kw): loss_actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. 
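        # Illustrative example (hypothetical values, non-recurrent case): with
        # B=2 rollouts of length T=3, the flat [B * T] input
        #   [b0_t0, b0_t1, b0_t2, b1_t0, b1_t1, b1_t2]
        # is reshaped to [B, T] and then transposed to time-major [T, B]:
        #   [[b0_t0, b1_t0], [b0_t1, b1_t1], [b0_t2, b1_t2]].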
- drop_last = self.config["vtrace_drop_last_ts"] loss = VTraceLoss( - actions=_make_time_major(loss_actions, drop_last=drop_last), + actions=_make_time_major(loss_actions), actions_logp=_make_time_major( - action_dist.logp(actions), drop_last=drop_last + action_dist.logp(actions) ), actions_entropy=_make_time_major( - action_dist.entropy(), drop_last=drop_last + action_dist.entropy() ), - dones=_make_time_major(dones, drop_last=drop_last), + dones=_make_time_major(dones), behaviour_action_logp=_make_time_major( - behaviour_action_logp, drop_last=drop_last + behaviour_action_logp ), behaviour_logits=_make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), - target_logits=_make_time_major(unpacked_outputs, drop_last=drop_last), + target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], - rewards=_make_time_major(rewards, drop_last=drop_last), - values=_make_time_major(values, drop_last=drop_last), - bootstrap_value=_make_time_major(values)[-1], + rewards=_make_time_major(rewards), + values=_make_time_major(values), + bootstrap_value=_make_time_major(bootstrap_values)[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, - valid_mask=_make_time_major(mask, drop_last=drop_last), + valid_mask=_make_time_major(mask), config=self.config, vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.entropy_coeff, @@ -320,7 +315,6 @@ def _make_time_major(*args, **kw): self, train_batch.get(SampleBatch.SEQ_LENS), values, - drop_last=self.config["vtrace"] and drop_last, ) model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(loss.value_targets, [-1]), torch.reshape(values_batched, [-1]) diff --git a/rllib/algorithms/impala/tf/vtrace_tf_v2.py b/rllib/algorithms/impala/tf/vtrace_tf_v2.py index 5f878ddbc1c1..4aecc6c65549 100644 --- a/rllib/algorithms/impala/tf/vtrace_tf_v2.py +++ b/rllib/algorithms/impala/tf/vtrace_tf_v2.py @@ -9,7 +9,6 @@ def make_time_major( *, trajectory_len: int = None, recurrent_seq_len: int = None, - drop_last: bool = False, ): """Swaps batch and trajectory axis. @@ -21,8 +20,6 @@ def make_time_major( If None then `recurrent_seq_len` must be set. recurrent_seq_len: Sequence lengths if recurrent. If None then `trajectory_len` must be set. - drop_last: A bool indicating whether to drop the last - trajectory item. Note: Either `trajectory_len` or `recurrent_seq_len` must be set. `trajectory_len` should be used in cases where tensor is not produced from a @@ -33,7 +30,7 @@ def make_time_major( """ if isinstance(tensor, list): return [ - make_time_major(_tensor, trajectory_len, recurrent_seq_len, drop_last) + make_time_major(_tensor, trajectory_len, recurrent_seq_len) for _tensor in tensor ] @@ -52,8 +49,6 @@ def make_time_major( # swap B and T axes res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] return res diff --git a/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/rllib/algorithms/impala/torch/vtrace_torch_v2.py index 1f4b6f9411fa..83ba8879d955 100644 --- a/rllib/algorithms/impala/torch/vtrace_torch_v2.py +++ b/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -9,7 +9,6 @@ def make_time_major( *, trajectory_len: int = None, recurrent_seq_len: int = None, - drop_last: bool = False, ): """Swaps batch and trajectory axis. @@ -21,8 +20,6 @@ def make_time_major( If None then `recurrent_seq_len` must be set. recurrent_seq_len: Sequence lengths if recurrent. If None then `trajectory_len` must be set. 
- drop_last: A bool indicating whether to drop the last - trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with @@ -30,7 +27,7 @@ def make_time_major( """ if isinstance(tensor, (list, tuple)): return [ - make_time_major(_tensor, trajectory_len, recurrent_seq_len, drop_last) + make_time_major(_tensor, trajectory_len, recurrent_seq_len) for _tensor in tensor ] @@ -53,8 +50,6 @@ def make_time_major( # Swap B and T axes. res = torch.transpose(rs, 1, 0) - if drop_last: - return res[:-1] return res From 935b7f77d2cecdcbc92f9ee982ed055bdf4344a7 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 12:54:04 +0200 Subject: [PATCH 03/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 32 +++++++++++++++++++++---- rllib/evaluation/postprocessing.py | 2 +- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 1869388f6795..9c12654b8877 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -159,8 +159,30 @@ def make_time_major(*args, **kw): prev_action_dist = dist_class(behaviour_logits, self.model) values = self.model.value_function() values_time_major = make_time_major(values) - bootstrap_value = train_batch[SampleBatch.VALUES_BOOTSTRAPPED] - bootstrap_value = make_time_major(bootstrap_value)[-1] + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + shape = tf.shape(values_time_major) + T, B = shape[0], shape[1] + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -215,8 +237,8 @@ def reduce_mean_valid(t): ) * self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -257,8 +279,8 @@ def reduce_mean_valid(t): mean_policy_loss = -reduce_mean_valid(surrogate_loss) # The value function loss. - delta = values_time_major - vtrace_returns.vs value_targets = vtrace_returns.vs + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. 
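A minimal, illustrative NumPy sketch (made-up numbers, B=1, T=3) of the value/bootstrap-value merge performed in the hunk above:

import numpy as np

values_tm = np.array([[0.5], [0.6], [0.7]])     # value preds, time-major [T, B]
bootstrap_tm = np.array([[0.0], [0.0], [0.9]])  # VALUES_BOOTSTRAPPED, only last ts non-zero

B = values_tm.shape[1]
# Append a zero row to the values, prepend a zero row to the bootstrap values.
values_tm = np.concatenate([values_tm, np.zeros((1, B))], axis=0)        # [T+1, B]
bootstrap_tm = np.concatenate([np.zeros((1, B)), bootstrap_tm], axis=0)  # [T+1, B]

merged = values_tm + bootstrap_tm
assert (merged[:-1] == [[0.5], [0.6], [0.7]]).all()  # `values` arg for v-trace
assert merged[-1] == [0.9]                           # `bootstrap_value` arg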
diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 09ee8cf13e3c..754cb256e39a 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -235,7 +235,7 @@ def compute_bootstrap_value(sample_batch, policy): else: last_r = policy._value(**input_dict) - # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all zeros, except for the + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) From 4a44f70a8d676de15e3d187e20d5f11fa47eedcd Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 13:20:40 +0200 Subject: [PATCH 04/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 2 +- rllib/algorithms/appo/appo_torch_policy.py | 34 ++++++++++++++--- rllib/algorithms/appo/tf/appo_tf_learner.py | 38 +++++++++++++++---- .../appo/torch/appo_torch_learner.py | 37 ++++++++++++++---- 4 files changed, 89 insertions(+), 22 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 9c12654b8877..064ff0b9cf7f 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -319,7 +319,7 @@ def reduce_mean_valid(t): value_targets = make_time_major( train_batch[Postprocessing.VALUE_TARGETS] ) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6f30d32edfce..fe4a092045fa 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -157,6 +158,29 @@ def _make_time_major(*args, **kwargs): prev_action_dist = dist_class(behaviour_logits, model) values = model.value_function() values_time_major = _make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + T, B = values_time_major.shape + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. 
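+        # E.g. (illustrative values), for a single sequence of length T=3:
+        #   values (zero-padded at the end):     [v0, v1, v2, 0.0]
+        #   bootstrap values (shifted by one):   [0.0, 0.0, 0.0, v_boot]
+        #   sum:                                 [v0, v1, v2, v_boot]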
+ values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -216,8 +240,8 @@ def reduce_mean_valid(t): discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major, - bootstrap_value=bootstrap_values[-1], + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -255,7 +279,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = vtrace_returns.vs.to(values_time_major.device) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -289,7 +313,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -316,7 +340,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape(values_time_major, [-1]), + torch.reshape(values_time_major[:-1], [-1]), ) return total_loss diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 84fc8f4714cc..c9959c670db7 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -61,17 +61,39 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - values_time_major = make_time_major( - values, + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + values_time_major = make_time_major( + values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + shape = tf.shape(values_time_major) + T, B = shape[0], shape[1] + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. 
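+        # (Shape note, illustrative: after the two concats, both tensors are
+        # [T+1, B]; the sum's [:-1] slice is the [T, B] `values` input for
+        # v-trace and its [-1] row is the [B]-shaped `bootstrap_value`.)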
+ values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -93,8 +115,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -127,7 +149,7 @@ def compute_loss_for_module( mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values + delta = values_time_major[:-1] - vtrace_adjusted_target_values mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) # The entropy loss. diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index e1f170a092dc..80fa01ffc3db 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -74,17 +74,38 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - values_time_major = make_time_major( - values, + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + values_time_major = make_time_major( + values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + T, B = values_time_major.shape + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. + values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. 
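        # (Illustratively, this amounts to something like
        #   discounts = (1.0 - terminateds.float()) * gamma
        # so that bootstrapping is cut off exactly at true episode ends;
        # names here are schematic, not the actual variable names.)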
@@ -103,8 +124,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -133,7 +154,7 @@ def compute_loss_for_module( mean_pi_loss = -torch.mean(surrogate_loss) # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values + delta = values_time_major[:-1] - vtrace_adjusted_target_values mean_vf_loss = 0.5 * torch.mean(delta**2) # The entropy loss. From 7b3d7b5b1b155e6c42f5d9a8fb753bcf5591263d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 15:27:07 +0200 Subject: [PATCH 05/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 38 +++++------- rllib/algorithms/appo/appo_torch_policy.py | 49 +++++----------- rllib/algorithms/appo/tf/appo_tf_learner.py | 2 +- rllib/algorithms/impala/impala_tf_policy.py | 43 +++++++++++++- .../algorithms/impala/impala_torch_policy.py | 58 ++++++++++++++----- rllib/evaluation/postprocessing.py | 2 +- 6 files changed, 114 insertions(+), 78 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 064ff0b9cf7f..de1713c366d6 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,7 +19,6 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( - compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -162,26 +161,18 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. shape = tf.shape(values_time_major) - T, B = shape[0], shape[1] - # Augment `values_time_major` by one timestep at the end (all zeros). + B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). bootstrap_values_time_major = tf.concat( [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 ) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
values_time_major += bootstrap_values_time_major if self.is_recurrent(): @@ -208,9 +199,7 @@ def reduce_mean_valid(t): old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) # Prepare KL for Loss - mean_kl = make_time_major( - old_policy_action_dist.multi_kl(action_dist) - ) + mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist)) unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1 @@ -228,9 +217,7 @@ def reduce_mean_valid(t): target_policy_logits=make_time_major( unpacked_old_policy_behaviour_logits ), - actions=tf.unstack( - make_time_major(loss_actions), axis=2 - ), + actions=tf.unstack(make_time_major(loss_actions), axis=2), discounts=tf.cast( ~make_time_major(tf.cast(dones, tf.bool)), tf.float32, @@ -394,12 +381,15 @@ def postprocess_trajectory( other_agent_batches: Optional[SampleBatch] = None, episode: Optional["Episode"] = None, ): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) - else: - sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index fe4a092045fa..167c4070e134 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,7 +19,6 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( - compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -158,28 +157,21 @@ def _make_time_major(*args, **kwargs): prev_action_dist = dist_class(behaviour_logits, model) values = model.value_function() values_time_major = _make_time_major(values) - bootstrap_values_time_major = make_time_major( + bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - T, B = values_time_major.shape - # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
+ _, B = values_time_major.shape + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) values_time_major += bootstrap_values_time_major if self.is_recurrent(): @@ -196,9 +188,7 @@ def reduce_mean_valid(t): reduce_mean_valid = torch.mean if self.config["vtrace"]: - logger.debug( - "Using V-Trace surrogate loss (vtrace=True)" - ) + logger.debug("Using V-Trace surrogate loss (vtrace=True)") old_policy_behaviour_logits = target_model_out.detach() old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) @@ -228,15 +218,11 @@ def reduce_mean_valid(t): # Compute vtrace on the CPU for better perf. vtrace_returns = vtrace.multi_from_logits( - behaviour_policy_logits=_make_time_major( - unpacked_behaviour_logits - ), + behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits), target_policy_logits=_make_time_major( unpacked_old_policy_behaviour_logits ), - actions=torch.unbind( - _make_time_major(loss_actions), dim=2 - ), + actions=torch.unbind(_make_time_major(loss_actions), dim=2), discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), @@ -248,12 +234,8 @@ def reduce_mean_valid(t): clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], ) - actions_logp = _make_time_major( - action_dist.logp(actions) - ) - prev_actions_logp = _make_time_major( - prev_action_dist.logp(actions) - ) + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) old_policy_actions_logp = _make_time_major( old_policy_action_dist.logp(actions) ) @@ -283,9 +265,7 @@ def reduce_mean_valid(t): mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. - mean_entropy = reduce_mean_valid( - _make_time_major(action_dist.entropy()) - ) + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) else: logger.debug("Using PPO surrogate loss (vtrace=False)") @@ -417,6 +397,7 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + return sample_batch @override(TorchPolicyV2) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index c9959c670db7..9418649969d4 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -78,7 +78,7 @@ def compute_loss_for_module( # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. shape = tf.shape(values_time_major) - T, B = shape[0], shape[1] + B = shape[1] # Augment `values_time_major` by one timestep at the end (all zeros). 
values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 74cb92f80149..114af959474b 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -5,9 +5,11 @@ import numpy as np import logging import gymnasium as gym -from typing import Dict, List, Type, Union +from typing import Dict, List, Optional, Type, Union from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 @@ -346,6 +348,24 @@ def make_time_major(*args, **kw): ) unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1) values = model.value_function() + values_time_major = make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + shape = tf.shape(values_time_major) + B = shape[1] + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -370,8 +390,8 @@ def make_time_major(*args, **kw): target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=make_time_major(rewards), - values=make_time_major(values), - bootstrap_value=make_time_major(values)[-1],#TODO + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=Categorical if is_multidiscrete else dist_class, model=model, valid_mask=make_time_major(mask), @@ -408,6 +428,23 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: ), } + @override(base) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode: Optional["Episode"] = None, + ): + # Call super's postprocess_trajectory first. 
+ sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + + if self.config["vtrace"]: + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + @override(base) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 3cc6fefeffc0..6048de80f8f3 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -1,9 +1,11 @@ import gymnasium as gym import logging import numpy as np -from typing import Dict, List, Type, Union +from typing import Dict, List, Optional, Type, Union import ray +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.torch.torch_action_dist import TorchCategorical @@ -261,6 +263,23 @@ def _make_time_major(*args, **kw): ) unpacked_outputs = torch.chunk(model_out, output_hidden_shape, dim=1) values = model.value_function() + values_time_major = _make_time_major(values) + bootstrap_values_time_major = _make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + _, B = values_time_major.shape + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -275,24 +294,16 @@ def _make_time_major(*args, **kw): # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. 
loss = VTraceLoss( actions=_make_time_major(loss_actions), - actions_logp=_make_time_major( - action_dist.logp(actions) - ), - actions_entropy=_make_time_major( - action_dist.entropy() - ), + actions_logp=_make_time_major(action_dist.logp(actions)), + actions_entropy=_make_time_major(action_dist.entropy()), dones=_make_time_major(dones), - behaviour_action_logp=_make_time_major( - behaviour_action_logp - ), - behaviour_logits=_make_time_major( - unpacked_behaviour_logits - ), + behaviour_action_logp=_make_time_major(behaviour_action_logp), + behaviour_logits=_make_time_major(unpacked_behaviour_logits), target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=_make_time_major(rewards), - values=_make_time_major(values), - bootstrap_value=_make_time_major(bootstrap_values)[-1], + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, valid_mask=_make_time_major(mask), @@ -343,6 +354,23 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: } ) + @override(TorchPolicyV2) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode: Optional["Episode"] = None, + ): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + + if self.config["vtrace"]: + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + @override(TorchPolicyV2) def extra_grad_process( self, optimizer: "torch.optim.Optimizer", loss: TensorType diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 754cb256e39a..4b70b03a425d 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -197,7 +197,7 @@ def compute_gae_for_sample_batch( @DeveloperAPI def compute_bootstrap_value(sample_batch, policy): - """TODO (sven): docstr """ + """TODO (sven): docstr""" # Trajectory is actually complete -> last r=0.0. if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 From c2451bb55099be04396acd34f9479c854773e61f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 11:51:24 +0200 Subject: [PATCH 06/22] fix test Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_tf_policy.py | 4 +++- rllib/evaluation/tests/test_trajectory_view_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 114af959474b..670081682225 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import explained_variance -from ray.rllib.policy.tf_mixins import GradStatsMixin +from ray.rllib.policy.tf_mixins import GradStatsMixin, ValueNetworkMixin from ray.rllib.utils.typing import ( LocalOptimizer, ModelGradients, @@ -273,6 +273,7 @@ class ImpalaTFPolicy( LearningRateSchedule, EntropyCoeffSchedule, GradStatsMixin, + ValueNetworkMixin, base, ): def __init__( @@ -295,6 +296,7 @@ def __init__( existing_inputs=existing_inputs, existing_model=existing_model, ) + ValueNetworkMixin.__init__(self, config) # If Learner API is used, we don't need any loss-specific mixins. 
# However, we also would like to avoid creating special Policy-subclasses diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 6bcf6225444a..0a7d76e6959f 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -129,7 +129,7 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): view_req_policy = policy.view_requirements # 7=obs, prev-a + r, 2x state-in, 2x state-out. assert len(view_req_model) == 7, view_req_model - assert len(view_req_policy) == 22, (len(view_req_policy), view_req_policy) + assert len(view_req_policy) == 23, (len(view_req_policy), view_req_policy) for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, From 5f0b92c8f8997a5c9e55d0098ee423c80ce76c24 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 12:34:39 +0200 Subject: [PATCH 07/22] fix test Signed-off-by: sven1977 --- rllib/policy/tf_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index fe5e23a330e8..49ba23a0f039 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -293,9 +293,9 @@ class ValueNetworkMixin: """ def __init__(self, config): - # When doing GAE, we need the value function estimate on the + # When doing GAE or vtrace, we need the value function estimate on the # observation. - if config["use_gae"]: + if config.get("use_gae") or config.get("vtrace"): # Input dict is provided to us automatically via the Model's # requirements. It's a single-timestep (last one in trajectory) From 605cf6ef3d4d484b6dc89a0029baa80bf0db0165 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 21:09:52 +0200 Subject: [PATCH 08/22] fix test Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 3 +++ rllib/algorithms/appo/appo_torch_policy.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index de1713c366d6..b23c5bf45a1e 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -390,6 +391,8 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 167c4070e134..6a1e8f865b34 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -389,14 +390,17 @@ def postprocess_trajectory( sample_batch = super().postprocess_trajectory( sample_batch, other_agent_batches, episode ) - if not self.config["vtrace"]: - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak - # in torch (issue #6962). - with torch.no_grad(): + + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak + # in torch (issue #6962). 
+ with torch.no_grad(): + if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch From 8a0e5c9e25ebcdb06e9cf4dbdc68966e8ce35429 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 5 Jun 2023 11:11:23 +0200 Subject: [PATCH 09/22] wip Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_torch_policy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 6048de80f8f3..bde4cfbb6234 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -13,6 +13,7 @@ from ray.rllib.policy.torch_mixins import ( EntropyCoeffSchedule, LearningRateSchedule, + ValueNetworkMixin, ) from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 from ray.rllib.utils.annotations import override @@ -190,6 +191,7 @@ class ImpalaTorchPolicy( VTraceOptimizer, LearningRateSchedule, EntropyCoeffSchedule, + ValueNetworkMixin, TorchPolicyV2, ): """PyTorch policy class used with Impala.""" @@ -220,6 +222,8 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) + ValueNetworkMixin.__init__(self, config) + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) From 95bd73f3b8800135ed16bdc045179e6921bfb2ed Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 5 Jun 2023 14:55:44 +0200 Subject: [PATCH 10/22] fix Signed-off-by: sven1977 --- rllib/policy/torch_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index b258c1d74560..359eb31089a5 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -126,7 +126,7 @@ class ValueNetworkMixin: def __init__(self, config): # When doing GAE, we need the value function estimate on the # observation. - if config["use_gae"]: + if config.get("use_gae") or config.get("vtrace"): # Input dict is provided to us automatically via the Model's # requirements. It's a single-timestep (last one in trajectory) # input_dict. From 957e185000d76d0f5fa8ebfffac5194575da5778 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 9 Jun 2023 12:54:36 +0200 Subject: [PATCH 11/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 12 ++--- rllib/algorithms/appo/appo_torch_policy.py | 10 ++-- rllib/algorithms/impala/impala.py | 9 +++- rllib/algorithms/impala/impala_tf_policy.py | 12 ++--- .../algorithms/impala/impala_torch_policy.py | 10 ++-- rllib/evaluation/postprocessing.py | 49 ++++++++++++++++++- 6 files changed, 77 insertions(+), 25 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index b23c5bf45a1e..c228b31489ab 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -162,12 +162,10 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. 
+ # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + # details on the following computation to yield correct t=1 to T+1 + # trajectories, with T being the rollout length (max trajectory len). shape = tf.shape(values_time_major) B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) @@ -392,6 +390,8 @@ def postprocess_trajectory( self, sample_batch, other_agent_batches, episode ) else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6a1e8f865b34..51820ad086fd 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -162,12 +162,10 @@ def _make_time_major(*args, **kwargs): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details + # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. _, B = values_time_major.shape values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) bootstrap_values_time_major = torch.cat( @@ -400,6 +398,8 @@ def postprocess_trajectory( self, sample_batch, other_agent_batches, episode ) else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 3990d36b20ad..af2b8f937ff9 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -284,7 +284,14 @@ def training( This updated AlgorithmConfig object. """ if vtrace_drop_last_ts is not None: - deprecation_warning(old="vtrace_drop_last_ts", error=False) + deprecation_warning( + old="vtrace_drop_last_ts", + help="The v-trace operations in RLlib have been enhanced and we are " + "now using proper value bootstrapping at the end of each " + "trajectory, such that no timesteps in our loss functions have to " + "be dropped anymore.", + error=True, + ) # Pass kwargs onto super's `training()` method. super().training(**kwargs) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 670081682225..30f726458c90 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -355,12 +355,10 @@ def make_time_major(*args, **kw): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. 
+ # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + # details on the following computation to yield correct t=1 to T+1 + # trajectories, with T being the rollout length (max trajectory len). shape = tf.shape(values_time_major) B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) @@ -443,6 +441,8 @@ def postprocess_trajectory( ) if self.config["vtrace"]: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index bde4cfbb6234..8ade5bee4272 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -272,12 +272,10 @@ def _make_time_major(*args, **kw): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details + # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. _, B = values_time_major.shape values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) bootstrap_values_time_major = torch.cat( @@ -371,6 +369,8 @@ def postprocess_trajectory( ) if self.config["vtrace"]: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 4b70b03a425d..685d8b900fe2 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -179,6 +179,8 @@ def compute_gae_for_sample_batch( Returns: The postprocessed, modified SampleBatch (or a new one). """ + # Compute the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need for the + # following `last_r` arg in `compute_advantages()`. sample_batch = compute_bootstrap_value(sample_batch, policy) # Adds the policy logits, VF preds, and advantages to the batch, @@ -196,8 +198,51 @@ def compute_gae_for_sample_batch( @DeveloperAPI -def compute_bootstrap_value(sample_batch, policy): - """TODO (sven): docstr""" +def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> SampleBatch: + """Performs a value function computation at the end of a trajectory. + + If the trajectory is terminated (not truncated), will not use the value function, + but assume that the value of the last timestep is 0.0. + In all other cases, will use the given policy's value function to compute the + "bootstrapped" value estimate at the end of the given trajectory. To do so, the + very last observation (sample_batch[NEXT_OBS][-1]) and - if applicable - + the very last state output (sample_batch[STATE_OUT][-1]) wil be used as inputs to + the value function. + + The thus computed value estimate will be stored in a new column of the + `sample_batch`: SampleBatch.VALUES_BOOTSTRAPPED. 
Thereby, values at all timesteps + in this column are set to 0.0, except or the last timestep, which receives the + computed bootstrapped value. + This is done, such that in any loss function (which processes raw, intact + trajectories, such as those of IMPALA and APPO) can use this new column as follows: + + Example: numbers=ts in episode, '|'=episode boundary (terminal), + X=bootstrapped value (!= 0.0 b/c ts=12 is not a terminal). + ts=5 is NOT a terminal. + T: 8 9 10 11 12 <- no terminal + VF_PREDS: . . . . . + VALUES_BOOTSTRAPPED: 0 0 0 0 X + + Then in the loss, we can now shift VALUES_BOOTSTRAPPED by one to the right -> + T: 8 9 10 11 12 13 <- next timestep + VF_PREDS: . . . . . 0.0 <- fill right side ts (13) with 0.0 + VALUES_BOOTSTRAPPED: 0.0 0 0 0 0 X <- fill left side ts (8) with 0.0 + + Then add the two columns together to yield the correct value estimates for each + timestep. + T: 8 9 10 11 12 13 <- next (bootstrapped) timestep + RESULTING VALUES: . . . . . 0+X + + Args: + sample_batch: The SampleBatch (single trajectory) for which to compute the + bootstrap value at the end. This SampleBatch will be altered in place + (by adding a new column: SampleBatch.VALUES_BOOTSTRAPPED). + policy: The Policy object, whose value function to use. + + Returns: + The altered SampleBatch (with the extra SampleBatch.VALUES_BOOTSTRAPPED + column). + """ # Trajectory is actually complete -> last r=0.0. if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 From a2730a46ec060377b1c5eeef377699c55be7e444 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 9 Jun 2023 13:12:10 +0200 Subject: [PATCH 12/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/tf/appo_tf_learner.py | 4 +++- rllib/algorithms/appo/torch/appo_torch_learner.py | 6 ++++-- rllib/algorithms/impala/impala.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 9418649969d4..abfbea25b34b 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -72,7 +72,9 @@ def compute_loss_for_module( recurrent_seq_len=hps.recurrent_seq_len, ) bootstrap_values_time_major = make_time_major( - batch[SampleBatch.VALUES_BOOTSTRAPPED] + batch[SampleBatch.VALUES_BOOTSTRAPPED], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, ) # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 80fa01ffc3db..79e89ad41906 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -85,12 +85,14 @@ def compute_loss_for_module( recurrent_seq_len=hps.recurrent_seq_len, ) bootstrap_values_time_major = make_time_major( - batch[SampleBatch.VALUES_BOOTSTRAPPED] + batch[SampleBatch.VALUES_BOOTSTRAPPED], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, ) # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - T, B = values_time_major.shape + _, B = values_time_major.shape # Augment `values_time_major` by one timestep at the end (all zeros). 
values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index af2b8f937ff9..670934d8f03f 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -287,9 +287,9 @@ def training( deprecation_warning( old="vtrace_drop_last_ts", help="The v-trace operations in RLlib have been enhanced and we are " - "now using proper value bootstrapping at the end of each " - "trajectory, such that no timesteps in our loss functions have to " - "be dropped anymore.", + "now using proper value bootstrapping at the end of each " + "trajectory, such that no timesteps in our loss functions have to " + "be dropped anymore.", error=True, ) From 4305ffab98d9b99e956d6f42512e879accb6c72b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 15:14:10 +0200 Subject: [PATCH 13/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 38 ++++++++++--------- rllib/algorithms/appo/appo_torch_policy.py | 29 +++++++------- rllib/algorithms/appo/tf/appo_tf_learner.py | 22 ++++++----- .../appo/torch/appo_torch_learner.py | 20 +++++----- rllib/algorithms/impala/impala_tf_policy.py | 25 ++++++------ .../algorithms/impala/impala_torch_policy.py | 23 +++++------ 6 files changed, 83 insertions(+), 74 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index c228b31489ab..2afb021e6720 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -162,17 +162,19 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - # details on the following computation to yield correct t=1 to T+1 - # trajectories, with T being the rollout length (max trajectory len). - shape = tf.shape(values_time_major) - B = shape[1] - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) - values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] + + ## See docstring of: + ## `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + ## details on the following computation to yield correct t=1 to T+1 + ## trajectories, with T being the rollout length (max trajectory len). + #shape = tf.shape(values_time_major) + #B = shape[1] + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) + #values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -223,8 +225,8 @@ def reduce_mean_valid(t): ) * self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -266,7 +268,7 @@ def reduce_mean_valid(t): # The value function loss. 
value_targets = vtrace_returns.vs - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. @@ -305,7 +307,7 @@ def reduce_mean_valid(t): value_targets = make_time_major( train_batch[Postprocessing.VALUE_TARGETS] ) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. @@ -381,9 +383,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 51820ad086fd..6256e3bbef6f 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -166,12 +166,13 @@ def _make_time_major(*args, **kwargs): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - _, B = values_time_major.shape - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) - values_time_major += bootstrap_values_time_major + #_, B = values_time_major.shape + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -225,8 +226,8 @@ def reduce_mean_valid(t): discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -260,7 +261,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = vtrace_returns.vs.to(values_time_major.device) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -292,7 +293,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. 
@@ -319,7 +320,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape(values_time_major[:-1], [-1]), + torch.reshape(values_time_major, [-1]), ) return total_loss @@ -385,9 +386,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) # Do all post-processing always with no_grad(). # Not using this here will introduce a memory leak diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index abfbea25b34b..d4386008f026 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -76,18 +76,20 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] + # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - shape = tf.shape(values_time_major) - B = shape[1] + #shape = tf.shape(values_time_major) + #B = shape[1] # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning # (all zeros). - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are @@ -95,7 +97,7 @@ def compute_loss_for_module( # Adding values and bootstrap_values yields the correct values+bootstrap # configuration, from which we can then take t=-1 (last timestep) to get # the bootstrap_value arg for the vtrace function below. - values_time_major += bootstrap_values_time_major + #values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -117,8 +119,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -151,7 +153,7 @@ def compute_loss_for_module( mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) # The baseline loss. - delta = values_time_major[:-1] - vtrace_adjusted_target_values + delta = values_time_major - vtrace_adjusted_target_values mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) # The entropy loss. 
diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 79e89ad41906..039851701ecf 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -89,17 +89,19 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] + # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - _, B = values_time_major.shape + #_, B = values_time_major.shape # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning # (all zeros). - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are @@ -107,7 +109,7 @@ def compute_loss_for_module( # Adding values and bootstrap_values yields the correct values+bootstrap # configuration, from which we can then take t=-1 (last timestep) to get # the bootstrap_value arg for the vtrace function below. - values_time_major += bootstrap_values_time_major + #values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -126,8 +128,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -156,7 +158,7 @@ def compute_loss_for_module( mean_pi_loss = -torch.mean(surrogate_loss) # The baseline loss. - delta = values_time_major[:-1] - vtrace_adjusted_target_values + delta = values_time_major - vtrace_adjusted_target_values mean_vf_loss = 0.5 * torch.mean(delta**2) # The entropy loss. diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 30f726458c90..519fd9022e62 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -359,13 +359,14 @@ def make_time_major(*args, **kw): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for # details on the following computation to yield correct t=1 to T+1 # trajectories, with T being the rollout length (max trajectory len). 
- shape = tf.shape(values_time_major) - B = shape[1] - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) - values_time_major += bootstrap_values_time_major + #shape = tf.shape(values_time_major) + #B = shape[1] + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -390,8 +391,8 @@ def make_time_major(*args, **kw): target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, valid_mask=make_time_major(mask), @@ -436,9 +437,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 8ade5bee4272..839e42631d19 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -276,12 +276,13 @@ def _make_time_major(*args, **kw): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - _, B = values_time_major.shape - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) - values_time_major += bootstrap_values_time_major + #_, B = values_time_major.shape + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -304,8 +305,8 @@ def _make_time_major(*args, **kw): target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, valid_mask=_make_time_major(mask), @@ -364,9 +365,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need From 644098e93ad7617a04c1fffc1fff3b51e43cc52d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 15:40:35 +0200 Subject: [PATCH 14/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 16 ++----------- rllib/algorithms/appo/appo_torch_policy.py | 15 ++---------- rllib/algorithms/appo/tf/appo_tf_learner.py | 23 +------------------ .../appo/torch/appo_torch_learner.py | 22 +----------------- rllib/algorithms/impala/impala_tf_policy.py | 16 ++----------- .../algorithms/impala/impala_torch_policy.py | 15 ++---------- .../algorithms/impala/tf/impala_tf_learner.py | 11 ++++++--- .../impala/torch/impala_torch_learner.py | 11 ++++++--- 8 files changed, 26 insertions(+), 103 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 2afb021e6720..fa1909869c94 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -164,18 +164,6 @@ def make_time_major(*args, **kw): ) bootstrap_value = bootstrap_values_time_major[-1] - ## See docstring of: - ## `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - ## details on the following computation to yield correct t=1 to T+1 - ## trajectories, with T being the rollout length (max trajectory len). - #shape = tf.shape(values_time_major) - #B = shape[1] - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - #values_time_major += bootstrap_values_time_major - if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) @@ -383,9 +371,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6256e3bbef6f..448e7705862c 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -161,17 +161,6 @@ def _make_time_major(*args, **kwargs): bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details - # on the following computation to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - #_, B = values_time_major.shape - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -386,9 +375,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) # Do all post-processing always with no_grad(). # Not using this here will introduce a memory leak diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index d4386008f026..3c949d1a090f 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -78,28 +78,7 @@ def compute_loss_for_module( ) bootstrap_value = bootstrap_values_time_major[-1] - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - #shape = tf.shape(values_time_major) - #B = shape[1] - # Augment `values_time_major` by one timestep at the end (all zeros). - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. - #values_time_major += bootstrap_values_time_major - - # the discount factor that is used should be gamma except for timesteps where + # The discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. discounts_time_major = ( 1.0 diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 039851701ecf..91b86a989264 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -91,27 +91,7 @@ def compute_loss_for_module( ) bootstrap_value = bootstrap_values_time_major[-1] - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - #_, B = values_time_major.shape - # Augment `values_time_major` by one timestep at the end (all zeros). - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
- #values_time_major += bootstrap_values_time_major - - # the discount factor that is used should be gamma except for timesteps where + # The discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. discounts_time_major = ( 1.0 diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 519fd9022e62..5e2700cb3639 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -354,18 +354,6 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - # details on the following computation to yield correct t=1 to T+1 - # trajectories, with T being the rollout length (max trajectory len). - #shape = tf.shape(values_time_major) - #B = shape[1] - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -437,9 +425,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 839e42631d19..80b39f64f065 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -271,17 +271,6 @@ def _make_time_major(*args, **kw): bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details - # on the following computation to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - #_, B = values_time_major.shape - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -365,9 +354,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 87ddc2cd938f..03a960c691e3 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -48,17 +48,22 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, + ) values_time_major = make_time_major( values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index 9a5d25f32ed6..8aa7b8b3d07d 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -58,17 +58,22 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, + ) values_time_major = make_time_major( values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. From c8cc8ae01a549f25066f58b610af3d114ebbc3e2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 17:21:17 +0200 Subject: [PATCH 15/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 685d8b900fe2..3afa125078a6 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -223,16 +223,6 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample VF_PREDS: . . . . . VALUES_BOOTSTRAPPED: 0 0 0 0 X - Then in the loss, we can now shift VALUES_BOOTSTRAPPED by one to the right -> - T: 8 9 10 11 12 13 <- next timestep - VF_PREDS: . . . . . 
0.0 <- fill right side ts (13) with 0.0 - VALUES_BOOTSTRAPPED: 0.0 0 0 0 0 X <- fill left side ts (8) with 0.0 - - Then add the two columns together to yield the correct value estimates for each - timestep. - T: 8 9 10 11 12 13 <- next (bootstrapped) timestep - RESULTING VALUES: . . . . . 0+X - Args: sample_batch: The SampleBatch (single trajectory) for which to compute the bootstrap value at the end. This SampleBatch will be altered in place From 56a3dea96a170e62df00ab28c93ac61b5a85be7d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 13:42:53 +0200 Subject: [PATCH 16/22] wip Signed-off-by: sven1977 --- .../impala/tests/test_impala_learner.py | 3 +++ rllib/evaluation/postprocessing.py | 9 +++++--- .../appo/stateless_cartpole_appo_vtrace.py | 21 +++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 7f130ca93997..1e3672074c37 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -31,6 +31,9 @@ SampleBatch.VF_PREDS: np.array( list(reversed(range(frag_length))), dtype=np.float32 ), + SampleBatch.VALUES_BOOTSTRAPPED: np.array( + list(reversed(range(frag_length))), dtype=np.float32 + ), SampleBatch.ACTION_LOGP: np.log( np.random.uniform(low=0, high=1, size=(frag_length,)) ).astype(np.float32), diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 3afa125078a6..fed2863392cb 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,9 +273,12 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. 
- values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) - values_bootstrapped[-1] = last_r - sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( + [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 + ) + #values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) + #values_bootstrapped[-1] = last_r + #sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped return sample_batch diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py b/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py new file mode 100644 index 000000000000..93ed841c2746 --- /dev/null +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py @@ -0,0 +1,21 @@ +from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole + + +config = ( + APPOConfig() + .environment(StatelessCartPole) + .resources(num_gpus=0) + .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter")#TODO rollout workers=5 + .training(lr=0.0003, num_sgd_iter=6, vf_loss_coeff=0.01, model={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + "use_lstm": True, + }) +) + +stop = { + "timesteps_total": 500000, + "sampler_results/episode_reward_mean": 150.0, +} From bf0bb3644629a62ffb4d9afede09f12e6887e394 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 14:46:40 +0200 Subject: [PATCH 17/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 10 ++++++++++ rllib/algorithms/appo/appo_tf_policy.py | 3 +-- rllib/algorithms/appo/appo_torch_policy.py | 5 +---- rllib/evaluation/postprocessing.py | 5 +---- ...ppo_vtrace.py => stateless-cartpole-appo-vtrace.py} | 0 5 files changed, 13 insertions(+), 10 deletions(-) rename rllib/tuned_examples/appo/{stateless_cartpole_appo_vtrace.py => stateless-cartpole-appo-vtrace.py} (100%) diff --git a/rllib/BUILD b/rllib/BUILD index 0963a733e149..a11b7f189ef6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -301,6 +301,16 @@ py_test( args = ["--dir=tuned_examples/appo"] ) +py_test( + name = "learning_tests_stateless_cartpole_appo_vtrace", + main = "tests/run_regression_tests.py", + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], + size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer + srcs = ["tests/run_regression_tests.py"], + data = ["tuned_examples/appo/stateless-cartpole-appo-vtrace.py"], + args = ["--dir=tuned_examples/appo"] +) + # ARS py_test( name = "learning_tests_cartpole_ars", diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index fa1909869c94..16579f18eb7a 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -389,8 +389,7 @@ def postprocess_trajectory( @override(base) def extra_action_out_fn(self) -> Dict[str, TensorType]: extra_action_fetches = super().extra_action_out_fn() - if not self.config["vtrace"]: - extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() + extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() return extra_action_fetches @override(base) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 448e7705862c..a97446c44281 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -362,10 
+362,7 @@ def extra_action_out( model: TorchModelV2, action_dist: TorchDistributionWrapper, ) -> Dict[str, TensorType]: - out = {} - if not self.config["vtrace"]: - out[SampleBatch.VF_PREDS] = model.value_function() - return out + return {SampleBatch.VF_PREDS: model.value_function()} @override(TorchPolicyV2) def postprocess_trajectory( diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index fed2863392cb..47b2531dfc33 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -270,15 +270,12 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample else: last_r = policy._value(**input_dict) - # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 ) - #values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) - #values_bootstrapped[-1] = last_r - #sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped return sample_batch diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py similarity index 100% rename from rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py rename to rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py From 9c8c7e2b6e3ffc9dcf7374be582e345b5d025c9e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 15:52:43 +0200 Subject: [PATCH 18/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 7 +++--- .../appo/stateless-cartpole-appo-vtrace.py | 22 +++++++++++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 47b2531dfc33..1b6f026d330e 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,9 +273,10 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. 
- sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( - [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 - ) + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate([ + convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), + np.array([convert_to_numpy(last_r)], dtype=np.float32), + ], axis=0) return sample_batch diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index 93ed841c2746..50293322a4c3 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -6,13 +6,21 @@ APPOConfig() .environment(StatelessCartPole) .resources(num_gpus=0) - .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter")#TODO rollout workers=5 - .training(lr=0.0003, num_sgd_iter=6, vf_loss_coeff=0.01, model={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "use_lstm": True, - }) + .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter") + .training( + lr=0.0003, + num_sgd_iter=6, + vf_loss_coeff=0.01, + model={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + "use_lstm": True, + }, + # TODO: Switch over to new stack. + # _enable_learner_api=True, + ) + # .rl_module(_enable_rl_module_api=True) ) stop = { From d2d0eb3524f07c1b0b3c5eff8ade4dcd8cc6348d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 16:16:59 +0200 Subject: [PATCH 19/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 11 +++++++---- .../appo/stateless-cartpole-appo-vtrace.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 1b6f026d330e..525d375a67af 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,10 +273,13 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. - sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate([ - convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), - np.array([convert_to_numpy(last_r)], dtype=np.float32), - ], axis=0) + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( + [ + convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), + np.array([convert_to_numpy(last_r)], dtype=np.float32), + ], + axis=0, + ) return sample_batch diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index 50293322a4c3..fc3c37e1d9dd 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -17,7 +17,7 @@ "vf_share_layers": True, "use_lstm": True, }, - # TODO: Switch over to new stack. + # TODO: Switch over to new stack once it supports LSTMs. 
# _enable_learner_api=True, ) # .rl_module(_enable_rl_module_api=True) From 0ab57036ef7acf00c4b6f472a9593248f602fd3c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 16:58:05 +0200 Subject: [PATCH 20/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 2 +- rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 0fbd2f1ff50f..d8d833165a6e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3720,7 +3720,7 @@ py_test( ) # Taking out this test for now: Mixed torch- and tf- policies within the same -# Algorothm never really worked. +# Algorithm never really worked. # py_test( # name = "examples/multi_agent_two_trainers_mixed_torch_tf", # main = "examples/multi_agent_two_trainers.py", diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index fc3c37e1d9dd..e127da1e28c3 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -6,7 +6,7 @@ APPOConfig() .environment(StatelessCartPole) .resources(num_gpus=0) - .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter") + .rollouts(num_rollout_workers=1, observation_filter="MeanStdFilter") .training( lr=0.0003, num_sgd_iter=6, From 17be7ccf3c81d219109bbed0f09b4e5cb6541043 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 18:31:16 +0200 Subject: [PATCH 21/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 4 ++-- rllib/algorithms/appo/tests/test_appo_learner.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index d8d833165a6e..feab865a38dc 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -304,8 +304,8 @@ py_test( py_test( name = "learning_tests_stateless_cartpole_appo_vtrace", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], - size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], + size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/appo/stateless-cartpole-appo-vtrace.py"], args = ["--dir=tuned_examples/appo"] diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index 19bcacdc89be..6a018ca40344 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -36,6 +36,9 @@ SampleBatch.VF_PREDS: np.array( list(reversed(range(frag_length))), dtype=np.float32 ), + SampleBatch.VALUES_BOOTSTRAPPED: np.array( + list(reversed(range(frag_length))), dtype=np.float32 + ), SampleBatch.ACTION_LOGP: np.log( np.random.uniform(low=0, high=1, size=(frag_length,)) ).astype(np.float32), From f55532c10702fba53e6df4e67a35769c44daa52b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 22 Jun 2023 12:17:28 +0200 Subject: [PATCH 22/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 16579f18eb7a..ec4dd78c295e 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -386,12 +386,6 @@ def postprocess_trajectory( return sample_batch - 
@override(base) - def extra_action_out_fn(self) -> Dict[str, TensorType]: - extra_action_fetches = super().extra_action_out_fn() - extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() - return extra_action_fetches - @override(base) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"]
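
For reference, a minimal sketch of the bootstrap-value convention this series ends up with (illustrative only, not part of the diffs above; all numbers below are made up): `compute_bootstrap_value()` writes a `SampleBatch.VALUES_BOOTSTRAPPED` column equal to `VF_PREDS` shifted left by one timestep, with the value estimate for the step one past the trajectory end appended (0.0 if the trajectory terminated, otherwise the policy's value head evaluated on `NEXT_OBS[-1]` and, for recurrent models, `STATE_OUT[-1]`). The IMPALA/APPO losses then pass v-trace the full time-major value tensor plus `bootstrap_value = make_time_major(VALUES_BOOTSTRAPPED)[-1]`, so no timesteps need to be dropped anymore and `vtrace_drop_last_ts` can be hard-deprecated.

    # Illustrative numpy mock of the above; the hard-coded 1.4 stands in for the
    # value head's output on the last next-observation (policy._value(...)).
    import numpy as np

    vf_preds = np.array([0.9, 1.1, 1.0, 1.2, 1.3], dtype=np.float32)  # VF_PREDS, T=5
    terminated = False  # trajectory was truncated, not terminated

    # compute_bootstrap_value(): last_r is 0.0 for terminals, else the value
    # estimate for the observation following the trajectory's last timestep.
    last_r = 0.0 if terminated else 1.4

    values_bootstrapped = np.concatenate(
        [vf_preds[1:], np.array([last_r], dtype=np.float32)]
    )
    # -> [1.1, 1.0, 1.2, 1.3, 1.4], i.e. VF_PREDS shifted by one timestep plus the
    # bootstrapped value for the timestep one past the end of the trajectory.

    # In the loss (after make_time_major), v-trace receives the full value tensor
    # and the last bootstrapped entry; no more values_time_major[:-1] slicing.
    values = vf_preds
    bootstrap_value = values_bootstrapped[-1]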