[RLlib] Remove vtrace_drop_last_ts option and add proper vf bootstrapping to IMPALA and APPO. #36013

Merged

Changes shown from 5 commits.

Commits (29 total):
d53832a  wip (sven1977, Jun 2, 2023)
faab512  wip (sven1977, Jun 2, 2023)
935b7f7  wip (sven1977, Jun 2, 2023)
4a44f70  wip (sven1977, Jun 2, 2023)
7b3d7b5  wip (sven1977, Jun 2, 2023)
c2451bb  fix test (sven1977, Jun 3, 2023)
5f0b92c  fix test (sven1977, Jun 3, 2023)
605cf6e  fix test (sven1977, Jun 3, 2023)
8a0e5c9  wip (sven1977, Jun 5, 2023)
95bd73f  fix (sven1977, Jun 5, 2023)
957e185  wip (sven1977, Jun 9, 2023)
20de58c  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 9, 2023)
a2730a4  wip (sven1977, Jun 9, 2023)
1752f21  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 14, 2023)
72f4375  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 20, 2023)
4305ffa  wip (sven1977, Jun 20, 2023)
644098e  wip (sven1977, Jun 20, 2023)
c8cc8ae  wip (sven1977, Jun 20, 2023)
65bb4be  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 21, 2023)
56a3dea  wip (sven1977, Jun 21, 2023)
bf0bb36  wip (sven1977, Jun 21, 2023)
9c8c7e2  wip (sven1977, Jun 21, 2023)
d2d0eb3  wip (sven1977, Jun 21, 2023)
3718254  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 21, 2023)
5f43198  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 21, 2023)
0ab5703  wip (sven1977, Jun 21, 2023)
17be7cc  wip (sven1977, Jun 21, 2023)
8e46813  Merge branch 'master' of https://github.com/ray-project/ray into fix_… (sven1977, Jun 22, 2023)
f55532c  wip (sven1977, Jun 22, 2023)
74 changes: 36 additions & 38 deletions rllib/algorithms/appo/appo_tf_policy.py
@@ -144,7 +144,6 @@ def loss(
is_multidiscrete = False
output_hidden_shape = 1

# TODO: (sven) deprecate this when trajectory view API gets activated.
def make_time_major(*args, **kw):
return _make_time_major(
self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw
@@ -159,12 +158,28 @@ def make_time_major(*args, **kw):
prev_action_dist = dist_class(behaviour_logits, self.model)
values = self.model.value_function()
values_time_major = make_time_major(values)
bootstrap_values_time_major = make_time_major(
train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
)
# Add values to bootstrap values to yield correct t=1 to T+1 trajectories,
# with T being the rollout length (max trajectory len).
Member:

I'm finding this comment confusing. Can you annotate what the shapes of values_time_major and bootstrap_values_time_major are supposed to be?

Contributor Author:

I expanded the docstring of ray.rllib.evaluation.postprocessing.compute_bootstrap_value() and added the computation example there (how to add the two columns together to get the final vtrace-usable value estimates). I then linked to this new docstring from each of the four loss functions (IMPALA and APPO, tf and torch). A minimal shape example is sketched below.
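For illustration, a minimal NumPy sketch of the shapes involved and of the concat-and-add described above (T=3 timesteps, B=2 trajectories; all numbers are made up):

```python
import numpy as np

T, B = 3, 2  # rollout length (time-major) and batch size; values are hypothetical.

# V(s_t) for t=0..T-1, shape (T, B).
values_time_major = np.array(
    [[0.1, 0.4],
     [0.2, 0.5],
     [0.3, 0.6]]
)
# VALUES_BOOTSTRAPPED is zero everywhere except at each trajectory's last
# timestep, where it holds the value estimate for the step one past the end.
bootstrap_values_time_major = np.array(
    [[0.0, 0.0],
     [0.0, 0.0],
     [0.7, 0.8]]
)

# Append a zero row to the values, prepend a zero row to the bootstrapped
# values, then add them -> a (T+1, B) tensor whose rows 0..T-1 are V(s_t)
# and whose last row holds the bootstrap values.
values_ext = np.concatenate([values_time_major, np.zeros((1, B))], axis=0)
boot_ext = np.concatenate([np.zeros((1, B)), bootstrap_values_time_major], axis=0)
combined = values_ext + boot_ext  # shape (T+1, B)

assert combined.shape == (T + 1, B)
# combined[:-1] -> the `values` argument for vtrace; combined[-1] -> `bootstrap_value`.
print(combined)
```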

# Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded
# ONLY at the last ts of a trajectory (for the following timestep,
# which is one past(!) the last ts). All other values in that tensor are
# zero.
shape = tf.shape(values_time_major)
B = shape[1]
values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0)
bootstrap_values_time_major = tf.concat(
[tf.zeros((1, B)), bootstrap_values_time_major], axis=0
)
values_time_major += bootstrap_values_time_major

if self.is_recurrent():
max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS])
mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
mask = tf.reshape(mask, [-1])
mask = make_time_major(mask, drop_last=self.config["vtrace"])
mask = make_time_major(mask)

def reduce_mean_valid(t):
return tf.reduce_mean(tf.boolean_mask(t, mask))
@@ -173,11 +188,7 @@ def reduce_mean_valid(t):
reduce_mean_valid = tf.reduce_mean

if self.config["vtrace"]:
drop_last = self.config["vtrace_drop_last_ts"]
logger.debug(
"Using V-Trace surrogate loss (vtrace=True; "
f"drop_last={drop_last})"
)
logger.debug("Using V-Trace surrogate loss (vtrace=True)")

# Prepare actions for loss.
loss_actions = (
@@ -188,9 +199,7 @@ def reduce_mean_valid(t):
old_policy_action_dist = dist_class(old_policy_behaviour_logits, model)

# Prepare KL for Loss
mean_kl = make_time_major(
old_policy_action_dist.multi_kl(action_dist), drop_last=drop_last
)
mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist))

unpacked_behaviour_logits = tf.split(
behaviour_logits, output_hidden_shape, axis=1
@@ -203,25 +212,19 @@ def reduce_mean_valid(t):
with tf.device("/cpu:0"):
vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=make_time_major(
unpacked_behaviour_logits, drop_last=drop_last
unpacked_behaviour_logits
),
target_policy_logits=make_time_major(
unpacked_old_policy_behaviour_logits, drop_last=drop_last
),
actions=tf.unstack(
make_time_major(loss_actions, drop_last=drop_last), axis=2
unpacked_old_policy_behaviour_logits
Member:

Do you not need to keep the tf.unstack logic? I guess if the vtrace tests are passing, then no...

),
actions=tf.unstack(make_time_major(loss_actions), axis=2),
discounts=tf.cast(
~make_time_major(
tf.cast(dones, tf.bool), drop_last=drop_last
),
~make_time_major(tf.cast(dones, tf.bool)),
tf.float32,
)
* self.config["gamma"],
rewards=make_time_major(rewards, drop_last=drop_last),
values=values_time_major[:-1]
if drop_last
else values_time_major,
rewards=make_time_major(rewards),
values=values_time_major[:-1],
bootstrap_value=values_time_major[-1],
dist_class=Categorical if is_multidiscrete else dist_class,
model=model,
@@ -233,14 +236,10 @@ def reduce_mean_valid(t):
),
)

actions_logp = make_time_major(
action_dist.logp(actions), drop_last=drop_last
)
prev_actions_logp = make_time_major(
prev_action_dist.logp(actions), drop_last=drop_last
)
actions_logp = make_time_major(action_dist.logp(actions))
prev_actions_logp = make_time_major(prev_action_dist.logp(actions))
old_policy_actions_logp = make_time_major(
old_policy_action_dist.logp(actions), drop_last=drop_last
old_policy_action_dist.logp(actions)
)

is_ratio = tf.clip_by_value(
@@ -267,17 +266,12 @@ def reduce_mean_valid(t):
mean_policy_loss = -reduce_mean_valid(surrogate_loss)

# The value function loss.
if drop_last:
delta = values_time_major[:-1] - vtrace_returns.vs
else:
delta = values_time_major - vtrace_returns.vs
value_targets = vtrace_returns.vs
delta = values_time_major[:-1] - value_targets
mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))

# The entropy loss.
actions_entropy = make_time_major(
action_dist.multi_entropy(), drop_last=True
)
actions_entropy = make_time_major(action_dist.multi_entropy())
mean_entropy = reduce_mean_valid(actions_entropy)

else:
@@ -312,7 +306,7 @@ def reduce_mean_valid(t):
value_targets = make_time_major(
train_batch[Postprocessing.VALUE_TARGETS]
)
delta = values_time_major - value_targets
delta = values_time_major[:-1] - value_targets
Member:

Can we rename values_time_major to something like values_time_major_w_bootstrap_value?

mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))

# The entropy loss.
@@ -353,7 +347,6 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]:
self,
train_batch.get(SampleBatch.SEQ_LENS),
self.model.value_function(),
drop_last=self.config["vtrace"] and self.config["vtrace_drop_last_ts"],
)

stats_dict = {
@@ -388,6 +381,11 @@ def postprocess_trajectory(
other_agent_batches: Optional[SampleBatch] = None,
episode: Optional["Episode"] = None,
):
# Call super's postprocess_trajectory first.
sample_batch = super().postprocess_trajectory(
sample_batch, other_agent_batches, episode
)

if not self.config["vtrace"]:
sample_batch = compute_gae_for_sample_batch(
self, sample_batch, other_agent_batches, episode
68 changes: 32 additions & 36 deletions rllib/algorithms/appo/appo_torch_policy.py
Member:

Same comments as in appo_tf_policy apply here.

@@ -157,14 +157,28 @@ def _make_time_major(*args, **kwargs):
prev_action_dist = dist_class(behaviour_logits, model)
values = model.value_function()
values_time_major = _make_time_major(values)
bootstrap_values_time_major = _make_time_major(
train_batch[SampleBatch.VALUES_BOOTSTRAPPED]
)

drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"]
# Add values to bootstrap values to yield correct t=1 to T+1 trajectories,
# with T being the rollout length (max trajectory len).
# Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded
# ONLY at the last ts of a trajectory (for the following timestep,
# which is one past(!) the last ts). All other values in that tensor are
# zero.
_, B = values_time_major.shape
values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0)
bootstrap_values_time_major = torch.cat(
[torch.zeros((1, B)), bootstrap_values_time_major], dim=0
)
values_time_major += bootstrap_values_time_major

if self.is_recurrent():
max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS])
mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len)
mask = torch.reshape(mask, [-1])
mask = _make_time_major(mask, drop_last=drop_last)
mask = _make_time_major(mask)
num_valid = torch.sum(mask)

def reduce_mean_valid(t):
@@ -174,9 +188,7 @@ def reduce_mean_valid(t):
reduce_mean_valid = torch.mean

if self.config["vtrace"]:
logger.debug(
"Using V-Trace surrogate loss (vtrace=True; " f"drop_last={drop_last})"
)
logger.debug("Using V-Trace surrogate loss (vtrace=True)")

old_policy_behaviour_logits = target_model_out.detach()
old_policy_action_dist = dist_class(old_policy_behaviour_logits, model)
@@ -202,40 +214,30 @@ def reduce_mean_valid(t):
)

# Prepare KL for loss.
action_kl = _make_time_major(
old_policy_action_dist.kl(action_dist), drop_last=drop_last
)
action_kl = _make_time_major(old_policy_action_dist.kl(action_dist))

# Compute vtrace on the CPU for better perf.
vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=_make_time_major(
unpacked_behaviour_logits, drop_last=drop_last
),
behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits),
target_policy_logits=_make_time_major(
unpacked_old_policy_behaviour_logits, drop_last=drop_last
),
actions=torch.unbind(
_make_time_major(loss_actions, drop_last=drop_last), dim=2
unpacked_old_policy_behaviour_logits
),
discounts=(1.0 - _make_time_major(dones, drop_last=drop_last).float())
actions=torch.unbind(_make_time_major(loss_actions), dim=2),
discounts=(1.0 - _make_time_major(dones).float())
* self.config["gamma"],
rewards=_make_time_major(rewards, drop_last=drop_last),
values=values_time_major[:-1] if drop_last else values_time_major,
rewards=_make_time_major(rewards),
values=values_time_major[:-1],
bootstrap_value=values_time_major[-1],
dist_class=TorchCategorical if is_multidiscrete else dist_class,
model=model,
clip_rho_threshold=self.config["vtrace_clip_rho_threshold"],
clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"],
)

actions_logp = _make_time_major(
action_dist.logp(actions), drop_last=drop_last
)
prev_actions_logp = _make_time_major(
prev_action_dist.logp(actions), drop_last=drop_last
)
actions_logp = _make_time_major(action_dist.logp(actions))
prev_actions_logp = _make_time_major(prev_action_dist.logp(actions))
old_policy_actions_logp = _make_time_major(
old_policy_action_dist.logp(actions), drop_last=drop_last
old_policy_action_dist.logp(actions)
)
is_ratio = torch.clamp(
torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0
@@ -259,16 +261,11 @@ def reduce_mean_valid(t):

# The value function loss.
value_targets = vtrace_returns.vs.to(values_time_major.device)
if drop_last:
delta = values_time_major[:-1] - value_targets
else:
delta = values_time_major - value_targets
delta = values_time_major[:-1] - value_targets
mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0))

# The entropy loss.
mean_entropy = reduce_mean_valid(
_make_time_major(action_dist.entropy(), drop_last=drop_last)
)
mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy()))

else:
logger.debug("Using PPO surrogate loss (vtrace=False)")
@@ -296,7 +293,7 @@ def reduce_mean_valid(t):

# The value function loss.
value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS])
delta = values_time_major - value_targets
delta = values_time_major[:-1] - value_targets
mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0))

# The entropy loss.
@@ -323,9 +320,7 @@ def reduce_mean_valid(t):
model.tower_stats["value_targets"] = value_targets
model.tower_stats["vf_explained_var"] = explained_variance(
torch.reshape(value_targets, [-1]),
torch.reshape(
values_time_major[:-1] if drop_last else values_time_major, [-1]
),
torch.reshape(values_time_major[:-1], [-1]),
)

return total_loss
@@ -402,6 +397,7 @@ def postprocess_trajectory(
sample_batch = compute_gae_for_sample_batch(
self, sample_batch, other_agent_batches, episode
)

return sample_batch

@override(TorchPolicyV2)
38 changes: 30 additions & 8 deletions rllib/algorithms/appo/tf/appo_tf_learner.py
@@ -61,17 +61,39 @@ def compute_loss_for_module(
trajectory_len=hps.rollout_frag_or_episode_len,
recurrent_seq_len=hps.recurrent_seq_len,
)
values_time_major = make_time_major(
values,
rewards_time_major = make_time_major(
batch[SampleBatch.REWARDS],
trajectory_len=hps.rollout_frag_or_episode_len,
recurrent_seq_len=hps.recurrent_seq_len,
)
bootstrap_value = values_time_major[-1]
rewards_time_major = make_time_major(
batch[SampleBatch.REWARDS],
values_time_major = make_time_major(
values,
trajectory_len=hps.rollout_frag_or_episode_len,
recurrent_seq_len=hps.recurrent_seq_len,
)
bootstrap_values_time_major = make_time_major(
batch[SampleBatch.VALUES_BOOTSTRAPPED]
)
# Then add the shifted-by-one bootstrapped values to that to yield the final
# value tensor. Use the last ts in that resulting tensor as the
# "bootstrapped" values for vtrace.
shape = tf.shape(values_time_major)
B = shape[1]
# Augment `values_time_major` by one timestep at the end (all zeros).
values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0)
# Augment `bootstrap_values_time_major` by one timestep at the beginning
# (all zeros).
bootstrap_values_time_major = tf.concat(
[tf.zeros((1, B)), bootstrap_values_time_major], axis=0
)
# Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded
# ONLY at the last ts of a trajectory (for the following timestep,
# which is one past(!) the last ts). All other values in that tensor are
# zero.
# Adding values and bootstrap_values yields the correct values+bootstrap
# configuration, from which we can then take t=-1 (last timestep) to get
# the bootstrap_value arg for the vtrace function below.
values_time_major += bootstrap_values_time_major
Member:

I don't think we need to do this. Below we end up passing in values_time_major[:-1] and values_time_major[-1] separately, which means we never needed to expand values_time_major and could instead have passed in the bootstrap value directly.

Contributor Author:

I do think we still need this, for the following reason:
In APPO and IMPALA, we build/preprocess the train batch along rollout_fragment_len (or max_seq_len if LSTM) boundaries. This means that when a rollout ends within an episode, that rollout's last trajectory ends up in the train batch with a zero-padded right side, and the bootstrapped value for this fragment therefore sits in the middle of the train batch, NOT at time-axis index -1!
So adding these two tensors together covers that particular case as well. Most bootstrapped values will land at the end of the train batch rows, but some (where the rollout ended within an episode) will sit in the middle of the rows. Here, "row" means a trajectory (along the T-axis) within the (B, T, ...) train batch. A small numeric sketch of this case follows below.

Member:

I'm struggling to understand how compute_bootstrap_value handles this situation where the rollout ends within an episode. Reading its code, on the surface I don't see anything that searches the middle of an episode for the reward at the terminated timestep; it looks like we're only checking the last timestep in the sample batch.

Contributor Author:

Note that compute_bootstrap_value is only ever called by a Policy's postprocess_trajectory method, which, by design, is only called once a rollout has ended (either within or at the terminal of an episode).
If at a terminal: assume the value to be 0.0 (no value computation necessary).
If NOT at a terminal: use the Policy's vf to compute the value at the last timestep of the trajectory. This is the "bootstrap" value to be used in the losses (instead of dropping the last ts and using that ts as the "bootstrapped" value). A schematic sketch of this logic follows below.
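A schematic Python sketch of the logic described above. This is not the actual RLlib implementation; the dict keys and the value_fn argument are illustrative stand-ins for the SampleBatch columns and the policy's value function:

```python
import numpy as np

def compute_bootstrap_value_sketch(sample_batch, value_fn):
    """Records a single bootstrap value at the last timestep of a rollout.

    If the rollout ended at a terminal, the bootstrap value is 0.0; otherwise
    the policy's value function is queried for the observation one step past
    the end of the trajectory.
    """
    T = len(sample_batch["rewards"])
    if sample_batch["terminateds"][-1]:
        bootstrap_value = 0.0  # Episode is done: no future return to estimate.
    else:
        # Rollout was cut mid-episode: estimate the return from the next obs.
        bootstrap_value = value_fn(sample_batch["new_obs"][-1])

    # Only the last timestep carries the bootstrap value; all others stay zero.
    values_bootstrapped = np.zeros(T, dtype=np.float32)
    values_bootstrapped[-1] = bootstrap_value
    sample_batch["values_bootstrapped"] = values_bootstrapped
    return sample_batch
```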


# the discount factor that is used should be gamma except for timesteps where
# the episode is terminated. In that case, the discount factor should be 0.
@@ -93,8 +115,8 @@ def compute_loss_for_module(
behaviour_action_log_probs=behaviour_actions_logp_time_major,
discounts=discounts_time_major,
rewards=rewards_time_major,
values=values_time_major,
bootstrap_value=bootstrap_value,
values=values_time_major[:-1],
bootstrap_value=values_time_major[-1],
clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold,
clip_rho_threshold=hps.vtrace_clip_rho_threshold,
)
@@ -127,7 +149,7 @@ def compute_loss_for_module(
mean_pi_loss = -tf.math.reduce_mean(surrogate_loss)

# The baseline loss.
delta = values_time_major - vtrace_adjusted_target_values
delta = values_time_major[:-1] - vtrace_adjusted_target_values
mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2)

# The entropy loss.