From d53832aa8f407236da61f03eef6522d694fbf60e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 07:08:50 +0200 Subject: [PATCH 01/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 54 ++++++++------------- rllib/algorithms/impala/impala.py | 21 +++----- rllib/algorithms/impala/impala_tf_policy.py | 40 +++++---------- rllib/evaluation/postprocessing.py | 38 ++++++++++----- rllib/policy/sample_batch.py | 9 ++-- 5 files changed, 71 insertions(+), 91 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 8441f8032ede..1869388f6795 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -144,7 +145,6 @@ def loss( is_multidiscrete = False output_hidden_shape = 1 - # TODO: (sven) deprecate this when trajectory view API gets activated. def make_time_major(*args, **kw): return _make_time_major( self, train_batch.get(SampleBatch.SEQ_LENS), *args, **kw @@ -159,12 +159,14 @@ def make_time_major(*args, **kw): prev_action_dist = dist_class(behaviour_logits, self.model) values = self.model.value_function() values_time_major = make_time_major(values) + bootstrap_value = train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + bootstrap_value = make_time_major(bootstrap_value)[-1] if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) mask = tf.reshape(mask, [-1]) - mask = make_time_major(mask, drop_last=self.config["vtrace"]) + mask = make_time_major(mask) def reduce_mean_valid(t): return tf.reduce_mean(tf.boolean_mask(t, mask)) @@ -173,11 +175,7 @@ def reduce_mean_valid(t): reduce_mean_valid = tf.reduce_mean if self.config["vtrace"]: - drop_last = self.config["vtrace_drop_last_ts"] - logger.debug( - "Using V-Trace surrogate loss (vtrace=True; " - f"drop_last={drop_last})" - ) + logger.debug("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss. 
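            # (For a simple Discrete action space, an extra axis is added here so
            # the actions tensor has the same rank as in the multi-discrete case
            # and can be unstacked uniformly further below.)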
loss_actions = ( @@ -189,7 +187,7 @@ def reduce_mean_valid(t): # Prepare KL for Loss mean_kl = make_time_major( - old_policy_action_dist.multi_kl(action_dist), drop_last=drop_last + old_policy_action_dist.multi_kl(action_dist) ) unpacked_behaviour_logits = tf.split( @@ -203,26 +201,22 @@ def reduce_mean_valid(t): with tf.device("/cpu:0"): vtrace_returns = vtrace.multi_from_logits( behaviour_policy_logits=make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), target_policy_logits=make_time_major( - unpacked_old_policy_behaviour_logits, drop_last=drop_last + unpacked_old_policy_behaviour_logits ), actions=tf.unstack( - make_time_major(loss_actions, drop_last=drop_last), axis=2 + make_time_major(loss_actions), axis=2 ), discounts=tf.cast( - ~make_time_major( - tf.cast(dones, tf.bool), drop_last=drop_last - ), + ~make_time_major(tf.cast(dones, tf.bool)), tf.float32, ) * self.config["gamma"], - rewards=make_time_major(rewards, drop_last=drop_last), - values=values_time_major[:-1] - if drop_last - else values_time_major, - bootstrap_value=values_time_major[-1], + rewards=make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -233,14 +227,10 @@ def reduce_mean_valid(t): ), ) - actions_logp = make_time_major( - action_dist.logp(actions), drop_last=drop_last - ) - prev_actions_logp = make_time_major( - prev_action_dist.logp(actions), drop_last=drop_last - ) + actions_logp = make_time_major(action_dist.logp(actions)) + prev_actions_logp = make_time_major(prev_action_dist.logp(actions)) old_policy_actions_logp = make_time_major( - old_policy_action_dist.logp(actions), drop_last=drop_last + old_policy_action_dist.logp(actions) ) is_ratio = tf.clip_by_value( @@ -267,17 +257,12 @@ def reduce_mean_valid(t): mean_policy_loss = -reduce_mean_valid(surrogate_loss) # The value function loss. - if drop_last: - delta = values_time_major[:-1] - vtrace_returns.vs - else: - delta = values_time_major - vtrace_returns.vs + delta = values_time_major - vtrace_returns.vs value_targets = vtrace_returns.vs mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. - actions_entropy = make_time_major( - action_dist.multi_entropy(), drop_last=True - ) + actions_entropy = make_time_major(action_dist.multi_entropy()) mean_entropy = reduce_mean_valid(actions_entropy) else: @@ -353,7 +338,6 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: self, train_batch.get(SampleBatch.SEQ_LENS), self.model.value_function(), - drop_last=self.config["vtrace"] and self.config["vtrace_drop_last_ts"], ) stats_dict = { @@ -392,6 +376,8 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 4403936aab15..3990d36b20ad 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -112,11 +112,6 @@ def __init__(self, algo_class=None): self.vtrace = True self.vtrace_clip_rho_threshold = 1.0 self.vtrace_clip_pg_rho_threshold = 1.0 - # TODO (sven): Deprecate this setting. It makes no sense to drop the last ts. - # It's actually dangerous if there are important rewards "hiding" in that ts. 
- # This setting is already ignored (always False) on the new Learner API - # (if _enable_learner_api=True). - self.vtrace_drop_last_ts = False self.num_multi_gpu_tower_stacks = 1 self.minibatch_buffer_size = 1 self.num_sgd_iter = 1 @@ -171,6 +166,7 @@ def __init__(self, algo_class=None): # Deprecated value. self.num_data_loader_buffers = DEPRECATED_VALUE + self.vtrace_drop_last_ts = DEPRECATED_VALUE @override(AlgorithmConfig) def training( @@ -179,7 +175,6 @@ def training( vtrace: Optional[bool] = NotProvided, vtrace_clip_rho_threshold: Optional[float] = NotProvided, vtrace_clip_pg_rho_threshold: Optional[float] = NotProvided, - vtrace_drop_last_ts: Optional[bool] = NotProvided, gamma: Optional[float] = NotProvided, num_multi_gpu_tower_stacks: Optional[int] = NotProvided, minibatch_buffer_size: Optional[int] = NotProvided, @@ -206,6 +201,8 @@ def training( _separate_vf_optimizer: Optional[bool] = NotProvided, _lr_vf: Optional[float] = NotProvided, after_train_step: Optional[Callable[[dict], None]] = NotProvided, + # deprecated. + vtrace_drop_last_ts=None, **kwargs, ) -> "ImpalaConfig": """Sets the training related configuration. @@ -214,13 +211,6 @@ def training( vtrace: V-trace params (see vtrace_tf/torch.py). vtrace_clip_rho_threshold: vtrace_clip_pg_rho_threshold: - vtrace_drop_last_ts: If True, drop the last timestep for the vtrace - calculations, such that all data goes into the calculations as [B x T-1] - (+ the bootstrap value). This is the default and legacy RLlib behavior, - however, could potentially have a destabilizing effect on learning, - especially in sparse reward or reward-at-goal environments. - False for not dropping the last timestep. - System params. gamma: Float specifying the discount factor of the Markov Decision process. num_multi_gpu_tower_stacks: For each stack of multi-GPU towers, how many slots should we reserve for parallel data loading? Set this to >1 to @@ -293,6 +283,9 @@ def training( Returns: This updated AlgorithmConfig object. """ + if vtrace_drop_last_ts is not None: + deprecation_warning(old="vtrace_drop_last_ts", error=False) + # Pass kwargs onto super's `training()` method. super().training(**kwargs) @@ -302,8 +295,6 @@ def training( self.vtrace_clip_rho_threshold = vtrace_clip_rho_threshold if vtrace_clip_pg_rho_threshold is not NotProvided: self.vtrace_clip_pg_rho_threshold = vtrace_clip_pg_rho_threshold - if vtrace_drop_last_ts is not NotProvided: - self.vtrace_drop_last_ts = vtrace_drop_last_ts if num_multi_gpu_tower_stacks is not NotProvided: self.num_multi_gpu_tower_stacks = num_multi_gpu_tower_stacks if minibatch_buffer_size is not NotProvided: diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index d8b830ef7653..74cb92f80149 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -131,14 +131,13 @@ def __init__( self.total_loss += self.vf_loss * vf_loss_coeff -def _make_time_major(policy, seq_lens, tensor, drop_last=False): +def _make_time_major(policy, seq_lens, tensor): """Swaps batch and trajectory axis. Args: policy: Policy reference seq_lens: Sequence lengths if recurrent or None tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last trajectory item. Returns: @@ -146,7 +145,7 @@ def _make_time_major(policy, seq_lens, tensor, drop_last=False): swapped axes. 
""" if isinstance(tensor, list): - return [_make_time_major(policy, seq_lens, t, drop_last) for t in tensor] + return [_make_time_major(policy, seq_lens, t) for t in tensor] if policy.is_recurrent(): B = tf.shape(seq_lens)[0] @@ -163,8 +162,6 @@ def _make_time_major(policy, seq_lens, tensor, drop_last=False): # swap B and T axes res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] return res @@ -363,30 +360,21 @@ def make_time_major(*args, **kw): ) # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. - drop_last = self.config["vtrace_drop_last_ts"] self.vtrace_loss = VTraceLoss( - actions=make_time_major(loss_actions, drop_last=drop_last), - actions_logp=make_time_major( - action_dist.logp(actions), drop_last=drop_last - ), - actions_entropy=make_time_major( - action_dist.multi_entropy(), drop_last=drop_last - ), - dones=make_time_major(dones, drop_last=drop_last), - behaviour_action_logp=make_time_major( - behaviour_action_logp, drop_last=drop_last - ), - behaviour_logits=make_time_major( - unpacked_behaviour_logits, drop_last=drop_last - ), - target_logits=make_time_major(unpacked_outputs, drop_last=drop_last), + actions=make_time_major(loss_actions), + actions_logp=make_time_major(action_dist.logp(actions)), + actions_entropy=make_time_major(action_dist.multi_entropy()), + dones=make_time_major(dones), + behaviour_action_logp=make_time_major(behaviour_action_logp), + behaviour_logits=make_time_major(unpacked_behaviour_logits), + target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], - rewards=make_time_major(rewards, drop_last=drop_last), - values=make_time_major(values, drop_last=drop_last), - bootstrap_value=make_time_major(values)[-1], + rewards=make_time_major(rewards), + values=make_time_major(values), + bootstrap_value=make_time_major(values)[-1],#TODO dist_class=Categorical if is_multidiscrete else dist_class, model=model, - valid_mask=make_time_major(mask, drop_last=drop_last), + valid_mask=make_time_major(mask), config=self.config, vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.entropy_coeff, @@ -401,12 +389,10 @@ def make_time_major(*args, **kw): @override(base) def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: - drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"] values_batched = _make_time_major( self, train_batch.get(SampleBatch.SEQ_LENS), self.model.value_function(), - drop_last=drop_last, ) return { diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 21355cefd455..09ee8cf13e3c 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -179,10 +179,29 @@ def compute_gae_for_sample_batch( Returns: The postprocessed, modified SampleBatch (or a new one). """ + sample_batch = compute_bootstrap_value(sample_batch, policy) + # Adds the policy logits, VF preds, and advantages to the batch, + # using GAE ("generalized advantage estimation") or not. + batch = compute_advantages( + rollout=sample_batch, + last_r=sample_batch[SampleBatch.VALUES_BOOTSTRAPPED][-1], + gamma=policy.config["gamma"], + lambda_=policy.config["lambda"], + use_gae=policy.config["use_gae"], + use_critic=policy.config.get("use_critic", True), + ) + + return batch + + +@DeveloperAPI +def compute_bootstrap_value(sample_batch, policy): + """TODO (sven): docstr """ # Trajectory is actually complete -> last r=0.0. 
if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 + # Trajectory has been truncated -> last r=VF estimate of last obs. else: # Input dict is provided to us automatically via the Model's @@ -192,7 +211,6 @@ def compute_gae_for_sample_batch( input_dict = sample_batch.get_single_step_input_dict( policy.model.view_requirements, index="last" ) - if policy.config.get("_enable_rl_module_api"): # Note: During sampling you are using the parameters at the beginning of # the sampling process. If I'll be using this advantages during training @@ -217,18 +235,14 @@ def compute_gae_for_sample_batch( else: last_r = policy._value(**input_dict) - # Adds the policy logits, VF preds, and advantages to the batch, - # using GAE ("generalized advantage estimation") or not. - batch = compute_advantages( - sample_batch, - last_r, - policy.config["gamma"], - policy.config["lambda"], - use_gae=policy.config["use_gae"], - use_critic=policy.config.get("use_critic", True), - ) + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all zeros, except for the + # very last timestep (where this bootstrapping value is actually needed), which + # we set to the computed `last_r`. + values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) + values_bootstrapped[-1] = last_r + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped - return batch + return sample_batch @DeveloperAPI diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index ccc0ca0147e7..3d7e5b3af18a 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -150,6 +150,10 @@ class SampleBatch(dict): # Value function predictions emitted by the behaviour policy. VF_PREDS = "vf_preds" + # Values one ts beyond the last ts taken. These are usually calculated via the value + # function network using the final observation (and in case of an RNN: the last + # returned internal state). + VALUES_BOOTSTRAPPED = "values_bootstrapped" # RE 3 # This is only computed and used when RE3 exploration strategy is enabled. @@ -161,9 +165,8 @@ class SampleBatch(dict): # Deprecated keys: - # SampleBatches must already not be constructed anymore by setting this key - # directly. Instead, the values under this key are auto-computed via the values of - # the new TERMINATEDS and TRUNCATEDS keys. + # Do not set this key directly. Instead, the values under this key are + # auto-computed via the values of the TERMINATEDS and TRUNCATEDS keys. DONES = "dones" # Use SampleBatch.OBS instead. 
CUR_OBS = "obs" From faab512f5831025143ce2f9865d3be7a4f0588fb Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 10:03:41 +0200 Subject: [PATCH 02/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_torch_policy.py | 41 ++++++++----------- .../algorithms/impala/impala_torch_policy.py | 32 ++++++--------- rllib/algorithms/impala/tf/vtrace_tf_v2.py | 7 +--- .../impala/torch/vtrace_torch_v2.py | 7 +--- 4 files changed, 31 insertions(+), 56 deletions(-) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 4a7754830f32..6f30d32edfce 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -158,13 +158,11 @@ def _make_time_major(*args, **kwargs): values = model.value_function() values_time_major = _make_time_major(values) - drop_last = self.config["vtrace"] and self.config["vtrace_drop_last_ts"] - if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) mask = sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) mask = torch.reshape(mask, [-1]) - mask = _make_time_major(mask, drop_last=drop_last) + mask = _make_time_major(mask) num_valid = torch.sum(mask) def reduce_mean_valid(t): @@ -175,7 +173,7 @@ def reduce_mean_valid(t): if self.config["vtrace"]: logger.debug( - "Using V-Trace surrogate loss (vtrace=True; " f"drop_last={drop_last})" + "Using V-Trace surrogate loss (vtrace=True)" ) old_policy_behaviour_logits = target_model_out.detach() @@ -202,26 +200,24 @@ def reduce_mean_valid(t): ) # Prepare KL for loss. - action_kl = _make_time_major( - old_policy_action_dist.kl(action_dist), drop_last=drop_last - ) + action_kl = _make_time_major(old_policy_action_dist.kl(action_dist)) # Compute vtrace on the CPU for better perf. vtrace_returns = vtrace.multi_from_logits( behaviour_policy_logits=_make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), target_policy_logits=_make_time_major( - unpacked_old_policy_behaviour_logits, drop_last=drop_last + unpacked_old_policy_behaviour_logits ), actions=torch.unbind( - _make_time_major(loss_actions, drop_last=drop_last), dim=2 + _make_time_major(loss_actions), dim=2 ), - discounts=(1.0 - _make_time_major(dones, drop_last=drop_last).float()) + discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], - rewards=_make_time_major(rewards, drop_last=drop_last), - values=values_time_major[:-1] if drop_last else values_time_major, - bootstrap_value=values_time_major[-1], + rewards=_make_time_major(rewards), + values=values_time_major, + bootstrap_value=bootstrap_values[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -229,13 +225,13 @@ def reduce_mean_valid(t): ) actions_logp = _make_time_major( - action_dist.logp(actions), drop_last=drop_last + action_dist.logp(actions) ) prev_actions_logp = _make_time_major( - prev_action_dist.logp(actions), drop_last=drop_last + prev_action_dist.logp(actions) ) old_policy_actions_logp = _make_time_major( - old_policy_action_dist.logp(actions), drop_last=drop_last + old_policy_action_dist.logp(actions) ) is_ratio = torch.clamp( torch.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0 @@ -259,15 +255,12 @@ def reduce_mean_valid(t): # The value function loss. 
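            # (I.e., 0.5 x the mean squared difference between the value
            # function outputs and the v-trace value targets `vs`; for
            # recurrent policies, padded timesteps are masked out of the mean.)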
value_targets = vtrace_returns.vs.to(values_time_major.device) - if drop_last: - delta = values_time_major[:-1] - value_targets - else: - delta = values_time_major - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. mean_entropy = reduce_mean_valid( - _make_time_major(action_dist.entropy(), drop_last=drop_last) + _make_time_major(action_dist.entropy()) ) else: @@ -323,9 +316,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape( - values_time_major[:-1] if drop_last else values_time_major, [-1] - ), + torch.reshape(values_time_major, [-1]), ) return total_loss diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 71aed0320601..3cc6fefeffc0 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -126,22 +126,20 @@ def __init__( ) -def make_time_major(policy, seq_lens, tensor, drop_last=False): +def make_time_major(policy, seq_lens, tensor): """Swaps batch and trajectory axis. Args: policy: Policy reference seq_lens: Sequence lengths if recurrent or None tensor: A tensor or list of tensors to reshape. - drop_last: A bool indicating whether to drop the last - trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. """ if isinstance(tensor, (list, tuple)): - return [make_time_major(policy, seq_lens, t, drop_last) for t in tensor] + return [make_time_major(policy, seq_lens, t) for t in tensor] if policy.is_recurrent(): B = seq_lens.shape[0] @@ -158,8 +156,6 @@ def make_time_major(policy, seq_lens, tensor, drop_last=False): # Swap B and T axes. res = torch.transpose(rs, 1, 0) - if drop_last: - return res[:-1] return res @@ -277,30 +273,29 @@ def _make_time_major(*args, **kw): loss_actions = actions if is_multidiscrete else torch.unsqueeze(actions, dim=1) # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. 
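        # Illustrative example (hypothetical values, non-recurrent case): with
        # B=2 rollouts of length T=3, the flat [B * T] input
        #   [b0_t0, b0_t1, b0_t2, b1_t0, b1_t1, b1_t2]
        # is reshaped to [B, T] and then transposed to time-major [T, B]:
        #   [[b0_t0, b1_t0], [b0_t1, b1_t1], [b0_t2, b1_t2]].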
- drop_last = self.config["vtrace_drop_last_ts"] loss = VTraceLoss( - actions=_make_time_major(loss_actions, drop_last=drop_last), + actions=_make_time_major(loss_actions), actions_logp=_make_time_major( - action_dist.logp(actions), drop_last=drop_last + action_dist.logp(actions) ), actions_entropy=_make_time_major( - action_dist.entropy(), drop_last=drop_last + action_dist.entropy() ), - dones=_make_time_major(dones, drop_last=drop_last), + dones=_make_time_major(dones), behaviour_action_logp=_make_time_major( - behaviour_action_logp, drop_last=drop_last + behaviour_action_logp ), behaviour_logits=_make_time_major( - unpacked_behaviour_logits, drop_last=drop_last + unpacked_behaviour_logits ), - target_logits=_make_time_major(unpacked_outputs, drop_last=drop_last), + target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], - rewards=_make_time_major(rewards, drop_last=drop_last), - values=_make_time_major(values, drop_last=drop_last), - bootstrap_value=_make_time_major(values)[-1], + rewards=_make_time_major(rewards), + values=_make_time_major(values), + bootstrap_value=_make_time_major(bootstrap_values)[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, - valid_mask=_make_time_major(mask, drop_last=drop_last), + valid_mask=_make_time_major(mask), config=self.config, vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.entropy_coeff, @@ -320,7 +315,6 @@ def _make_time_major(*args, **kw): self, train_batch.get(SampleBatch.SEQ_LENS), values, - drop_last=self.config["vtrace"] and drop_last, ) model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(loss.value_targets, [-1]), torch.reshape(values_batched, [-1]) diff --git a/rllib/algorithms/impala/tf/vtrace_tf_v2.py b/rllib/algorithms/impala/tf/vtrace_tf_v2.py index 5f878ddbc1c1..4aecc6c65549 100644 --- a/rllib/algorithms/impala/tf/vtrace_tf_v2.py +++ b/rllib/algorithms/impala/tf/vtrace_tf_v2.py @@ -9,7 +9,6 @@ def make_time_major( *, trajectory_len: int = None, recurrent_seq_len: int = None, - drop_last: bool = False, ): """Swaps batch and trajectory axis. @@ -21,8 +20,6 @@ def make_time_major( If None then `recurrent_seq_len` must be set. recurrent_seq_len: Sequence lengths if recurrent. If None then `trajectory_len` must be set. - drop_last: A bool indicating whether to drop the last - trajectory item. Note: Either `trajectory_len` or `recurrent_seq_len` must be set. `trajectory_len` should be used in cases where tensor is not produced from a @@ -33,7 +30,7 @@ def make_time_major( """ if isinstance(tensor, list): return [ - make_time_major(_tensor, trajectory_len, recurrent_seq_len, drop_last) + make_time_major(_tensor, trajectory_len, recurrent_seq_len) for _tensor in tensor ] @@ -52,8 +49,6 @@ def make_time_major( # swap B and T axes res = tf.transpose(rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) - if drop_last: - return res[:-1] return res diff --git a/rllib/algorithms/impala/torch/vtrace_torch_v2.py b/rllib/algorithms/impala/torch/vtrace_torch_v2.py index 1f4b6f9411fa..83ba8879d955 100644 --- a/rllib/algorithms/impala/torch/vtrace_torch_v2.py +++ b/rllib/algorithms/impala/torch/vtrace_torch_v2.py @@ -9,7 +9,6 @@ def make_time_major( *, trajectory_len: int = None, recurrent_seq_len: int = None, - drop_last: bool = False, ): """Swaps batch and trajectory axis. @@ -21,8 +20,6 @@ def make_time_major( If None then `recurrent_seq_len` must be set. recurrent_seq_len: Sequence lengths if recurrent. If None then `trajectory_len` must be set. 
- drop_last: A bool indicating whether to drop the last - trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with @@ -30,7 +27,7 @@ def make_time_major( """ if isinstance(tensor, (list, tuple)): return [ - make_time_major(_tensor, trajectory_len, recurrent_seq_len, drop_last) + make_time_major(_tensor, trajectory_len, recurrent_seq_len) for _tensor in tensor ] @@ -53,8 +50,6 @@ def make_time_major( # Swap B and T axes. res = torch.transpose(rs, 1, 0) - if drop_last: - return res[:-1] return res From 935b7f77d2cecdcbc92f9ee982ed055bdf4344a7 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 12:54:04 +0200 Subject: [PATCH 03/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 32 +++++++++++++++++++++---- rllib/evaluation/postprocessing.py | 2 +- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 1869388f6795..9c12654b8877 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -159,8 +159,30 @@ def make_time_major(*args, **kw): prev_action_dist = dist_class(behaviour_logits, self.model) values = self.model.value_function() values_time_major = make_time_major(values) - bootstrap_value = train_batch[SampleBatch.VALUES_BOOTSTRAPPED] - bootstrap_value = make_time_major(bootstrap_value)[-1] + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + shape = tf.shape(values_time_major) + T, B = shape[0], shape[1] + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -215,8 +237,8 @@ def reduce_mean_valid(t): ) * self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -257,8 +279,8 @@ def reduce_mean_valid(t): mean_policy_loss = -reduce_mean_valid(surrogate_loss) # The value function loss. - delta = values_time_major - vtrace_returns.vs value_targets = vtrace_returns.vs + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. 
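A minimal, illustrative NumPy sketch (made-up numbers, B=1, T=3) of the value/bootstrap-value merge performed in the hunk above:

import numpy as np

values_tm = np.array([[0.5], [0.6], [0.7]])     # value preds, time-major [T, B]
bootstrap_tm = np.array([[0.0], [0.0], [0.9]])  # VALUES_BOOTSTRAPPED, only last ts non-zero

B = values_tm.shape[1]
# Append a zero row to the values, prepend a zero row to the bootstrap values.
values_tm = np.concatenate([values_tm, np.zeros((1, B))], axis=0)        # [T+1, B]
bootstrap_tm = np.concatenate([np.zeros((1, B)), bootstrap_tm], axis=0)  # [T+1, B]

merged = values_tm + bootstrap_tm
assert (merged[:-1] == [[0.5], [0.6], [0.7]]).all()  # `values` arg for v-trace
assert merged[-1] == [0.9]                           # `bootstrap_value` arg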
diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 09ee8cf13e3c..754cb256e39a 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -235,7 +235,7 @@ def compute_bootstrap_value(sample_batch, policy): else: last_r = policy._value(**input_dict) - # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all zeros, except for the + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) From 4a44f70a8d676de15e3d187e20d5f11fa47eedcd Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 13:20:40 +0200 Subject: [PATCH 04/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 2 +- rllib/algorithms/appo/appo_torch_policy.py | 34 ++++++++++++++--- rllib/algorithms/appo/tf/appo_tf_learner.py | 38 +++++++++++++++---- .../appo/torch/appo_torch_learner.py | 37 ++++++++++++++---- 4 files changed, 89 insertions(+), 22 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 9c12654b8877..064ff0b9cf7f 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -319,7 +319,7 @@ def reduce_mean_valid(t): value_targets = make_time_major( train_batch[Postprocessing.VALUE_TARGETS] ) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6f30d32edfce..fe4a092045fa 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -157,6 +158,29 @@ def _make_time_major(*args, **kwargs): prev_action_dist = dist_class(behaviour_logits, model) values = model.value_function() values_time_major = _make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + T, B = values_time_major.shape + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. 
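+        # E.g. (illustrative values), for a single sequence of length T=3:
+        #   values (zero-padded at the end):     [v0, v1, v2, 0.0]
+        #   bootstrap values (shifted by one):   [0.0, 0.0, 0.0, v_boot]
+        #   sum:                                 [v0, v1, v2, v_boot]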
+ values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -216,8 +240,8 @@ def reduce_mean_valid(t): discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major, - bootstrap_value=bootstrap_values[-1], + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -255,7 +279,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = vtrace_returns.vs.to(values_time_major.device) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -289,7 +313,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) - delta = values_time_major - value_targets + delta = values_time_major[:-1] - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -316,7 +340,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape(values_time_major, [-1]), + torch.reshape(values_time_major[:-1], [-1]), ) return total_loss diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 84fc8f4714cc..c9959c670db7 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -61,17 +61,39 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - values_time_major = make_time_major( - values, + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + values_time_major = make_time_major( + values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + shape = tf.shape(values_time_major) + T, B = shape[0], shape[1] + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. 
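+        # (Shape note, illustrative: after the two concats, both tensors are
+        # [T+1, B]; the sum's [:-1] slice is the [T, B] `values` input for
+        # v-trace and its [-1] row is the [B]-shaped `bootstrap_value`.)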
+ values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -93,8 +115,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -127,7 +149,7 @@ def compute_loss_for_module( mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values + delta = values_time_major[:-1] - vtrace_adjusted_target_values mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) # The entropy loss. diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index e1f170a092dc..80fa01ffc3db 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -74,17 +74,38 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - values_time_major = make_time_major( - values, + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + values_time_major = make_time_major( + values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + # Then add the shifted-by-one bootstrapped values to that to yield the final + # value tensor. Use the last ts in that resulting tensor as the + # "bootstrapped" values for vtrace. + T, B = values_time_major.shape + # Augment `values_time_major` by one timestep at the end (all zeros). + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + # Augment `bootstrap_values_time_major` by one timestep at the beginning + # (all zeros). + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + # Adding values and bootstrap_values yields the correct values+bootstrap + # configuration, from which we can then take t=-1 (last timestep) to get + # the bootstrap_value arg for the vtrace function below. + values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. 
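        # (Illustratively, this amounts to something like
        #   discounts = (1.0 - terminateds.float()) * gamma
        # so that bootstrapping is cut off exactly at true episode ends;
        # names here are schematic, not the actual variable names.)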
@@ -103,8 +124,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major, - bootstrap_value=bootstrap_value, + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -133,7 +154,7 @@ def compute_loss_for_module( mean_pi_loss = -torch.mean(surrogate_loss) # The baseline loss. - delta = values_time_major - vtrace_adjusted_target_values + delta = values_time_major[:-1] - vtrace_adjusted_target_values mean_vf_loss = 0.5 * torch.mean(delta**2) # The entropy loss. From 7b3d7b5b1b155e6c42f5d9a8fb753bcf5591263d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 2 Jun 2023 15:27:07 +0200 Subject: [PATCH 05/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 38 +++++------- rllib/algorithms/appo/appo_torch_policy.py | 49 +++++----------- rllib/algorithms/appo/tf/appo_tf_learner.py | 2 +- rllib/algorithms/impala/impala_tf_policy.py | 43 +++++++++++++- .../algorithms/impala/impala_torch_policy.py | 58 ++++++++++++++----- rllib/evaluation/postprocessing.py | 2 +- 6 files changed, 114 insertions(+), 78 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 064ff0b9cf7f..de1713c366d6 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,7 +19,6 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( - compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -162,26 +161,18 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. shape = tf.shape(values_time_major) - T, B = shape[0], shape[1] - # Augment `values_time_major` by one timestep at the end (all zeros). + B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). bootstrap_values_time_major = tf.concat( [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 ) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
values_time_major += bootstrap_values_time_major if self.is_recurrent(): @@ -208,9 +199,7 @@ def reduce_mean_valid(t): old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) # Prepare KL for Loss - mean_kl = make_time_major( - old_policy_action_dist.multi_kl(action_dist) - ) + mean_kl = make_time_major(old_policy_action_dist.multi_kl(action_dist)) unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1 @@ -228,9 +217,7 @@ def reduce_mean_valid(t): target_policy_logits=make_time_major( unpacked_old_policy_behaviour_logits ), - actions=tf.unstack( - make_time_major(loss_actions), axis=2 - ), + actions=tf.unstack(make_time_major(loss_actions), axis=2), discounts=tf.cast( ~make_time_major(tf.cast(dones, tf.bool)), tf.float32, @@ -394,12 +381,15 @@ def postprocess_trajectory( other_agent_batches: Optional[SampleBatch] = None, episode: Optional["Episode"] = None, ): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) - else: - sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index fe4a092045fa..167c4070e134 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,7 +19,6 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( - compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -158,28 +157,21 @@ def _make_time_major(*args, **kwargs): prev_action_dist = dist_class(behaviour_logits, model) values = model.value_function() values_time_major = _make_time_major(values) - bootstrap_values_time_major = make_time_major( + bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - T, B = values_time_major.shape - # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
+ _, B = values_time_major.shape + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) values_time_major += bootstrap_values_time_major if self.is_recurrent(): @@ -196,9 +188,7 @@ def reduce_mean_valid(t): reduce_mean_valid = torch.mean if self.config["vtrace"]: - logger.debug( - "Using V-Trace surrogate loss (vtrace=True)" - ) + logger.debug("Using V-Trace surrogate loss (vtrace=True)") old_policy_behaviour_logits = target_model_out.detach() old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) @@ -228,15 +218,11 @@ def reduce_mean_valid(t): # Compute vtrace on the CPU for better perf. vtrace_returns = vtrace.multi_from_logits( - behaviour_policy_logits=_make_time_major( - unpacked_behaviour_logits - ), + behaviour_policy_logits=_make_time_major(unpacked_behaviour_logits), target_policy_logits=_make_time_major( unpacked_old_policy_behaviour_logits ), - actions=torch.unbind( - _make_time_major(loss_actions), dim=2 - ), + actions=torch.unbind(_make_time_major(loss_actions), dim=2), discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), @@ -248,12 +234,8 @@ def reduce_mean_valid(t): clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"], ) - actions_logp = _make_time_major( - action_dist.logp(actions) - ) - prev_actions_logp = _make_time_major( - prev_action_dist.logp(actions) - ) + actions_logp = _make_time_major(action_dist.logp(actions)) + prev_actions_logp = _make_time_major(prev_action_dist.logp(actions)) old_policy_actions_logp = _make_time_major( old_policy_action_dist.logp(actions) ) @@ -283,9 +265,7 @@ def reduce_mean_valid(t): mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. - mean_entropy = reduce_mean_valid( - _make_time_major(action_dist.entropy()) - ) + mean_entropy = reduce_mean_valid(_make_time_major(action_dist.entropy())) else: logger.debug("Using PPO surrogate loss (vtrace=False)") @@ -417,6 +397,7 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + return sample_batch @override(TorchPolicyV2) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index c9959c670db7..9418649969d4 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -78,7 +78,7 @@ def compute_loss_for_module( # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. shape = tf.shape(values_time_major) - T, B = shape[0], shape[1] + B = shape[1] # Augment `values_time_major` by one timestep at the end (all zeros). 
values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 74cb92f80149..114af959474b 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -5,9 +5,11 @@ import numpy as np import logging import gymnasium as gym -from typing import Dict, List, Type, Union +from typing import Dict, List, Optional, Type, Union from ray.rllib.algorithms.impala import vtrace_tf as vtrace +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_action_dist import Categorical, TFActionDistribution from ray.rllib.policy.dynamic_tf_policy_v2 import DynamicTFPolicyV2 @@ -346,6 +348,24 @@ def make_time_major(*args, **kw): ) unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1) values = model.value_function() + values_time_major = make_time_major(values) + bootstrap_values_time_major = make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + shape = tf.shape(values_time_major) + B = shape[1] + values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + bootstrap_values_time_major = tf.concat( + [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + ) + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -370,8 +390,8 @@ def make_time_major(*args, **kw): target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=make_time_major(rewards), - values=make_time_major(values), - bootstrap_value=make_time_major(values)[-1],#TODO + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=Categorical if is_multidiscrete else dist_class, model=model, valid_mask=make_time_major(mask), @@ -408,6 +428,23 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: ), } + @override(base) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode: Optional["Episode"] = None, + ): + # Call super's postprocess_trajectory first. 
+ sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + + if self.config["vtrace"]: + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + @override(base) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"] diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 3cc6fefeffc0..6048de80f8f3 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -1,9 +1,11 @@ import gymnasium as gym import logging import numpy as np -from typing import Dict, List, Type, Union +from typing import Dict, List, Optional, Type, Union import ray +from ray.rllib.evaluation.episode import Episode +from ray.rllib.evaluation.postprocessing import compute_bootstrap_value from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.torch.torch_action_dist import TorchCategorical @@ -261,6 +263,23 @@ def _make_time_major(*args, **kw): ) unpacked_outputs = torch.chunk(model_out, output_hidden_shape, dim=1) values = model.value_function() + values_time_major = _make_time_major(values) + bootstrap_values_time_major = _make_time_major( + train_batch[SampleBatch.VALUES_BOOTSTRAPPED] + ) + + # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # with T being the rollout length (max trajectory len). + # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded + # ONLY at the last ts of a trajectory (for the following timestep, + # which is one past(!) the last ts). All other values in that tensor are + # zero. + _, B = values_time_major.shape + values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + bootstrap_values_time_major = torch.cat( + [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + ) + values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -275,24 +294,16 @@ def _make_time_major(*args, **kw): # Inputs are reshaped from [B * T] => [(T|T-1), B] for V-trace calc. 
loss = VTraceLoss( actions=_make_time_major(loss_actions), - actions_logp=_make_time_major( - action_dist.logp(actions) - ), - actions_entropy=_make_time_major( - action_dist.entropy() - ), + actions_logp=_make_time_major(action_dist.logp(actions)), + actions_entropy=_make_time_major(action_dist.entropy()), dones=_make_time_major(dones), - behaviour_action_logp=_make_time_major( - behaviour_action_logp - ), - behaviour_logits=_make_time_major( - unpacked_behaviour_logits - ), + behaviour_action_logp=_make_time_major(behaviour_action_logp), + behaviour_logits=_make_time_major(unpacked_behaviour_logits), target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=_make_time_major(rewards), - values=_make_time_major(values), - bootstrap_value=_make_time_major(bootstrap_values)[-1], + values=values_time_major[:-1], + bootstrap_value=values_time_major[-1], dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, valid_mask=_make_time_major(mask), @@ -343,6 +354,23 @@ def stats_fn(self, train_batch: SampleBatch) -> Dict[str, TensorType]: } ) + @override(TorchPolicyV2) + def postprocess_trajectory( + self, + sample_batch: SampleBatch, + other_agent_batches: Optional[SampleBatch] = None, + episode: Optional["Episode"] = None, + ): + # Call super's postprocess_trajectory first. + sample_batch = super().postprocess_trajectory( + sample_batch, other_agent_batches, episode + ) + + if self.config["vtrace"]: + sample_batch = compute_bootstrap_value(sample_batch, self) + + return sample_batch + @override(TorchPolicyV2) def extra_grad_process( self, optimizer: "torch.optim.Optimizer", loss: TensorType diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 754cb256e39a..4b70b03a425d 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -197,7 +197,7 @@ def compute_gae_for_sample_batch( @DeveloperAPI def compute_bootstrap_value(sample_batch, policy): - """TODO (sven): docstr """ + """TODO (sven): docstr""" # Trajectory is actually complete -> last r=0.0. if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 From c2451bb55099be04396acd34f9479c854773e61f Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 11:51:24 +0200 Subject: [PATCH 06/22] fix test Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_tf_policy.py | 4 +++- rllib/evaluation/tests/test_trajectory_view_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 114af959474b..670081682225 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import override from ray.rllib.utils.framework import try_import_tf from ray.rllib.utils.tf_utils import explained_variance -from ray.rllib.policy.tf_mixins import GradStatsMixin +from ray.rllib.policy.tf_mixins import GradStatsMixin, ValueNetworkMixin from ray.rllib.utils.typing import ( LocalOptimizer, ModelGradients, @@ -273,6 +273,7 @@ class ImpalaTFPolicy( LearningRateSchedule, EntropyCoeffSchedule, GradStatsMixin, + ValueNetworkMixin, base, ): def __init__( @@ -295,6 +296,7 @@ def __init__( existing_inputs=existing_inputs, existing_model=existing_model, ) + ValueNetworkMixin.__init__(self, config) # If Learner API is used, we don't need any loss-specific mixins. 
# However, we also would like to avoid creating special Policy-subclasses diff --git a/rllib/evaluation/tests/test_trajectory_view_api.py b/rllib/evaluation/tests/test_trajectory_view_api.py index 6bcf6225444a..0a7d76e6959f 100644 --- a/rllib/evaluation/tests/test_trajectory_view_api.py +++ b/rllib/evaluation/tests/test_trajectory_view_api.py @@ -129,7 +129,7 @@ def test_traj_view_lstm_prev_actions_and_rewards(self): view_req_policy = policy.view_requirements # 7=obs, prev-a + r, 2x state-in, 2x state-out. assert len(view_req_model) == 7, view_req_model - assert len(view_req_policy) == 22, (len(view_req_policy), view_req_policy) + assert len(view_req_policy) == 23, (len(view_req_policy), view_req_policy) for key in [ SampleBatch.OBS, SampleBatch.ACTIONS, From 5f0b92c8f8997a5c9e55d0098ee423c80ce76c24 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 12:34:39 +0200 Subject: [PATCH 07/22] fix test Signed-off-by: sven1977 --- rllib/policy/tf_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/policy/tf_mixins.py b/rllib/policy/tf_mixins.py index fe5e23a330e8..49ba23a0f039 100644 --- a/rllib/policy/tf_mixins.py +++ b/rllib/policy/tf_mixins.py @@ -293,9 +293,9 @@ class ValueNetworkMixin: """ def __init__(self, config): - # When doing GAE, we need the value function estimate on the + # When doing GAE or vtrace, we need the value function estimate on the # observation. - if config["use_gae"]: + if config.get("use_gae") or config.get("vtrace"): # Input dict is provided to us automatically via the Model's # requirements. It's a single-timestep (last one in trajectory) From 605cf6ef3d4d484b6dc89a0029baa80bf0db0165 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Sat, 3 Jun 2023 21:09:52 +0200 Subject: [PATCH 08/22] fix test Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 3 +++ rllib/algorithms/appo/appo_torch_policy.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index de1713c366d6..b23c5bf45a1e 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -390,6 +391,8 @@ def postprocess_trajectory( sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 167c4070e134..6a1e8f865b34 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -19,6 +19,7 @@ ) from ray.rllib.evaluation.episode import Episode from ray.rllib.evaluation.postprocessing import ( + compute_bootstrap_value, compute_gae_for_sample_batch, Postprocessing, ) @@ -389,14 +390,17 @@ def postprocess_trajectory( sample_batch = super().postprocess_trajectory( sample_batch, other_agent_batches, episode ) - if not self.config["vtrace"]: - # Do all post-processing always with no_grad(). - # Not using this here will introduce a memory leak - # in torch (issue #6962). - with torch.no_grad(): + + # Do all post-processing always with no_grad(). + # Not using this here will introduce a memory leak + # in torch (issue #6962). 
+ with torch.no_grad(): + if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( self, sample_batch, other_agent_batches, episode ) + else: + sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch From 8a0e5c9e25ebcdb06e9cf4dbdc68966e8ce35429 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 5 Jun 2023 11:11:23 +0200 Subject: [PATCH 09/22] wip Signed-off-by: sven1977 --- rllib/algorithms/impala/impala_torch_policy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 6048de80f8f3..bde4cfbb6234 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -13,6 +13,7 @@ from ray.rllib.policy.torch_mixins import ( EntropyCoeffSchedule, LearningRateSchedule, + ValueNetworkMixin, ) from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2 from ray.rllib.utils.annotations import override @@ -190,6 +191,7 @@ class ImpalaTorchPolicy( VTraceOptimizer, LearningRateSchedule, EntropyCoeffSchedule, + ValueNetworkMixin, TorchPolicyV2, ): """PyTorch policy class used with Impala.""" @@ -220,6 +222,8 @@ def __init__(self, observation_space, action_space, config): max_seq_len=config["model"]["max_seq_len"], ) + ValueNetworkMixin.__init__(self, config) + self._initialize_loss_from_dummy_batch() @override(TorchPolicyV2) From 95bd73f3b8800135ed16bdc045179e6921bfb2ed Mon Sep 17 00:00:00 2001 From: sven1977 Date: Mon, 5 Jun 2023 14:55:44 +0200 Subject: [PATCH 10/22] fix Signed-off-by: sven1977 --- rllib/policy/torch_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rllib/policy/torch_mixins.py b/rllib/policy/torch_mixins.py index b258c1d74560..359eb31089a5 100644 --- a/rllib/policy/torch_mixins.py +++ b/rllib/policy/torch_mixins.py @@ -126,7 +126,7 @@ class ValueNetworkMixin: def __init__(self, config): # When doing GAE, we need the value function estimate on the # observation. - if config["use_gae"]: + if config.get("use_gae") or config.get("vtrace"): # Input dict is provided to us automatically via the Model's # requirements. It's a single-timestep (last one in trajectory) # input_dict. From 957e185000d76d0f5fa8ebfffac5194575da5778 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 9 Jun 2023 12:54:36 +0200 Subject: [PATCH 11/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 12 ++--- rllib/algorithms/appo/appo_torch_policy.py | 10 ++-- rllib/algorithms/impala/impala.py | 9 +++- rllib/algorithms/impala/impala_tf_policy.py | 12 ++--- .../algorithms/impala/impala_torch_policy.py | 10 ++-- rllib/evaluation/postprocessing.py | 49 ++++++++++++++++++- 6 files changed, 77 insertions(+), 25 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index b23c5bf45a1e..c228b31489ab 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -162,12 +162,10 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. 
+ # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + # details on the following computation to yield correct t=1 to T+1 + # trajectories, with T being the rollout length (max trajectory len). shape = tf.shape(values_time_major) B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) @@ -392,6 +390,8 @@ def postprocess_trajectory( self, sample_batch, other_agent_batches, episode ) else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6a1e8f865b34..51820ad086fd 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -162,12 +162,10 @@ def _make_time_major(*args, **kwargs): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details + # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. _, B = values_time_major.shape values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) bootstrap_values_time_major = torch.cat( @@ -400,6 +398,8 @@ def postprocess_trajectory( self, sample_batch, other_agent_batches, episode ) else: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index 3990d36b20ad..af2b8f937ff9 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -284,7 +284,14 @@ def training( This updated AlgorithmConfig object. """ if vtrace_drop_last_ts is not None: - deprecation_warning(old="vtrace_drop_last_ts", error=False) + deprecation_warning( + old="vtrace_drop_last_ts", + help="The v-trace operations in RLlib have been enhanced and we are " + "now using proper value bootstrapping at the end of each " + "trajectory, such that no timesteps in our loss functions have to " + "be dropped anymore.", + error=True, + ) # Pass kwargs onto super's `training()` method. super().training(**kwargs) diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 670081682225..30f726458c90 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -355,12 +355,10 @@ def make_time_major(*args, **kw): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. 
+ # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + # details on the following computation to yield correct t=1 to T+1 + # trajectories, with T being the rollout length (max trajectory len). shape = tf.shape(values_time_major) B = shape[1] values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) @@ -443,6 +441,8 @@ def postprocess_trajectory( ) if self.config["vtrace"]: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index bde4cfbb6234..8ade5bee4272 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -272,12 +272,10 @@ def _make_time_major(*args, **kw): train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # Add values to bootstrap values to yield correct t=1 to T+1 trajectories, + # See docstring of: + # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details + # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. _, B = values_time_major.shape values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) bootstrap_values_time_major = torch.cat( @@ -371,6 +369,8 @@ def postprocess_trajectory( ) if self.config["vtrace"]: + # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need + # inside the loss for vtrace calculations. sample_batch = compute_bootstrap_value(sample_batch, self) return sample_batch diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 4b70b03a425d..685d8b900fe2 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -179,6 +179,8 @@ def compute_gae_for_sample_batch( Returns: The postprocessed, modified SampleBatch (or a new one). """ + # Compute the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need for the + # following `last_r` arg in `compute_advantages()`. sample_batch = compute_bootstrap_value(sample_batch, policy) # Adds the policy logits, VF preds, and advantages to the batch, @@ -196,8 +198,51 @@ def compute_gae_for_sample_batch( @DeveloperAPI -def compute_bootstrap_value(sample_batch, policy): - """TODO (sven): docstr""" +def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> SampleBatch: + """Performs a value function computation at the end of a trajectory. + + If the trajectory is terminated (not truncated), will not use the value function, + but assume that the value of the last timestep is 0.0. + In all other cases, will use the given policy's value function to compute the + "bootstrapped" value estimate at the end of the given trajectory. To do so, the + very last observation (sample_batch[NEXT_OBS][-1]) and - if applicable - + the very last state output (sample_batch[STATE_OUT][-1]) wil be used as inputs to + the value function. + + The thus computed value estimate will be stored in a new column of the + `sample_batch`: SampleBatch.VALUES_BOOTSTRAPPED. 
Thereby, values at all timesteps + in this column are set to 0.0, except or the last timestep, which receives the + computed bootstrapped value. + This is done, such that in any loss function (which processes raw, intact + trajectories, such as those of IMPALA and APPO) can use this new column as follows: + + Example: numbers=ts in episode, '|'=episode boundary (terminal), + X=bootstrapped value (!= 0.0 b/c ts=12 is not a terminal). + ts=5 is NOT a terminal. + T: 8 9 10 11 12 <- no terminal + VF_PREDS: . . . . . + VALUES_BOOTSTRAPPED: 0 0 0 0 X + + Then in the loss, we can now shift VALUES_BOOTSTRAPPED by one to the right -> + T: 8 9 10 11 12 13 <- next timestep + VF_PREDS: . . . . . 0.0 <- fill right side ts (13) with 0.0 + VALUES_BOOTSTRAPPED: 0.0 0 0 0 0 X <- fill left side ts (8) with 0.0 + + Then add the two columns together to yield the correct value estimates for each + timestep. + T: 8 9 10 11 12 13 <- next (bootstrapped) timestep + RESULTING VALUES: . . . . . 0+X + + Args: + sample_batch: The SampleBatch (single trajectory) for which to compute the + bootstrap value at the end. This SampleBatch will be altered in place + (by adding a new column: SampleBatch.VALUES_BOOTSTRAPPED). + policy: The Policy object, whose value function to use. + + Returns: + The altered SampleBatch (with the extra SampleBatch.VALUES_BOOTSTRAPPED + column). + """ # Trajectory is actually complete -> last r=0.0. if sample_batch[SampleBatch.TERMINATEDS][-1]: last_r = 0.0 From a2730a46ec060377b1c5eeef377699c55be7e444 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Fri, 9 Jun 2023 13:12:10 +0200 Subject: [PATCH 12/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/tf/appo_tf_learner.py | 4 +++- rllib/algorithms/appo/torch/appo_torch_learner.py | 6 ++++-- rllib/algorithms/impala/impala.py | 6 +++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index 9418649969d4..abfbea25b34b 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -72,7 +72,9 @@ def compute_loss_for_module( recurrent_seq_len=hps.recurrent_seq_len, ) bootstrap_values_time_major = make_time_major( - batch[SampleBatch.VALUES_BOOTSTRAPPED] + batch[SampleBatch.VALUES_BOOTSTRAPPED], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, ) # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 80fa01ffc3db..79e89ad41906 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -85,12 +85,14 @@ def compute_loss_for_module( recurrent_seq_len=hps.recurrent_seq_len, ) bootstrap_values_time_major = make_time_major( - batch[SampleBatch.VALUES_BOOTSTRAPPED] + batch[SampleBatch.VALUES_BOOTSTRAPPED], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, ) # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - T, B = values_time_major.shape + _, B = values_time_major.shape # Augment `values_time_major` by one timestep at the end (all zeros). 
values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning diff --git a/rllib/algorithms/impala/impala.py b/rllib/algorithms/impala/impala.py index af2b8f937ff9..670934d8f03f 100644 --- a/rllib/algorithms/impala/impala.py +++ b/rllib/algorithms/impala/impala.py @@ -287,9 +287,9 @@ def training( deprecation_warning( old="vtrace_drop_last_ts", help="The v-trace operations in RLlib have been enhanced and we are " - "now using proper value bootstrapping at the end of each " - "trajectory, such that no timesteps in our loss functions have to " - "be dropped anymore.", + "now using proper value bootstrapping at the end of each " + "trajectory, such that no timesteps in our loss functions have to " + "be dropped anymore.", error=True, ) From 4305ffab98d9b99e956d6f42512e879accb6c72b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 15:14:10 +0200 Subject: [PATCH 13/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 38 ++++++++++--------- rllib/algorithms/appo/appo_torch_policy.py | 29 +++++++------- rllib/algorithms/appo/tf/appo_tf_learner.py | 22 ++++++----- .../appo/torch/appo_torch_learner.py | 20 +++++----- rllib/algorithms/impala/impala_tf_policy.py | 25 ++++++------ .../algorithms/impala/impala_torch_policy.py | 23 +++++------ 6 files changed, 83 insertions(+), 74 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index c228b31489ab..2afb021e6720 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -162,17 +162,19 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - # details on the following computation to yield correct t=1 to T+1 - # trajectories, with T being the rollout length (max trajectory len). - shape = tf.shape(values_time_major) - B = shape[1] - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) - values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] + + ## See docstring of: + ## `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for + ## details on the following computation to yield correct t=1 to T+1 + ## trajectories, with T being the rollout length (max trajectory len). + #shape = tf.shape(values_time_major) + #B = shape[1] + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) + #values_time_major += bootstrap_values_time_major if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -223,8 +225,8 @@ def reduce_mean_valid(t): ) * self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=tf.cast( @@ -266,7 +268,7 @@ def reduce_mean_valid(t): # The value function loss. 
value_targets = vtrace_returns.vs - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. @@ -305,7 +307,7 @@ def reduce_mean_valid(t): value_targets = make_time_major( train_batch[Postprocessing.VALUE_TARGETS] ) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta)) # The entropy loss. @@ -381,9 +383,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 51820ad086fd..6256e3bbef6f 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -166,12 +166,13 @@ def _make_time_major(*args, **kwargs): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - _, B = values_time_major.shape - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) - values_time_major += bootstrap_values_time_major + #_, B = values_time_major.shape + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -225,8 +226,8 @@ def reduce_mean_valid(t): discounts=(1.0 - _make_time_major(dones).float()) * self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], @@ -260,7 +261,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = vtrace_returns.vs.to(values_time_major.device) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. @@ -292,7 +293,7 @@ def reduce_mean_valid(t): # The value function loss. value_targets = _make_time_major(train_batch[Postprocessing.VALUE_TARGETS]) - delta = values_time_major[:-1] - value_targets + delta = values_time_major - value_targets mean_vf_loss = 0.5 * reduce_mean_valid(torch.pow(delta, 2.0)) # The entropy loss. 
@@ -319,7 +320,7 @@ def reduce_mean_valid(t): model.tower_stats["value_targets"] = value_targets model.tower_stats["vf_explained_var"] = explained_variance( torch.reshape(value_targets, [-1]), - torch.reshape(values_time_major[:-1], [-1]), + torch.reshape(values_time_major, [-1]), ) return total_loss @@ -385,9 +386,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) # Do all post-processing always with no_grad(). # Not using this here will introduce a memory leak diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index abfbea25b34b..d4386008f026 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -76,18 +76,20 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] + # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - shape = tf.shape(values_time_major) - B = shape[1] + #shape = tf.shape(values_time_major) + #B = shape[1] # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning # (all zeros). - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are @@ -95,7 +97,7 @@ def compute_loss_for_module( # Adding values and bootstrap_values yields the correct values+bootstrap # configuration, from which we can then take t=-1 (last timestep) to get # the bootstrap_value arg for the vtrace function below. - values_time_major += bootstrap_values_time_major + #values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -117,8 +119,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -151,7 +153,7 @@ def compute_loss_for_module( mean_pi_loss = -tf.math.reduce_mean(surrogate_loss) # The baseline loss. - delta = values_time_major[:-1] - vtrace_adjusted_target_values + delta = values_time_major - vtrace_adjusted_target_values mean_vf_loss = 0.5 * tf.math.reduce_mean(delta**2) # The entropy loss. 
diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 79e89ad41906..039851701ecf 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -89,17 +89,19 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] + # Then add the shifted-by-one bootstrapped values to that to yield the final # value tensor. Use the last ts in that resulting tensor as the # "bootstrapped" values for vtrace. - _, B = values_time_major.shape + #_, B = values_time_major.shape # Augment `values_time_major` by one timestep at the end (all zeros). - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) # Augment `bootstrap_values_time_major` by one timestep at the beginning # (all zeros). - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded # ONLY at the last ts of a trajectory (for the following timestep, # which is one past(!) the last ts). All other values in that tensor are @@ -107,7 +109,7 @@ def compute_loss_for_module( # Adding values and bootstrap_values yields the correct values+bootstrap # configuration, from which we can then take t=-1 (last timestep) to get # the bootstrap_value arg for the vtrace function below. - values_time_major += bootstrap_values_time_major + #values_time_major += bootstrap_values_time_major # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. @@ -126,8 +128,8 @@ def compute_loss_for_module( behaviour_action_log_probs=behaviour_actions_logp_time_major, discounts=discounts_time_major, rewards=rewards_time_major, - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, clip_pg_rho_threshold=hps.vtrace_clip_pg_rho_threshold, clip_rho_threshold=hps.vtrace_clip_rho_threshold, ) @@ -156,7 +158,7 @@ def compute_loss_for_module( mean_pi_loss = -torch.mean(surrogate_loss) # The baseline loss. - delta = values_time_major[:-1] - vtrace_adjusted_target_values + delta = values_time_major - vtrace_adjusted_target_values mean_vf_loss = 0.5 * torch.mean(delta**2) # The entropy loss. diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 30f726458c90..519fd9022e62 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -359,13 +359,14 @@ def make_time_major(*args, **kw): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for # details on the following computation to yield correct t=1 to T+1 # trajectories, with T being the rollout length (max trajectory len). 
- shape = tf.shape(values_time_major) - B = shape[1] - values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - bootstrap_values_time_major = tf.concat( - [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - ) - values_time_major += bootstrap_values_time_major + #shape = tf.shape(values_time_major) + #B = shape[1] + #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) + #bootstrap_values_time_major = tf.concat( + # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) @@ -390,8 +391,8 @@ def make_time_major(*args, **kw): target_logits=make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=Categorical if is_multidiscrete else dist_class, model=model, valid_mask=make_time_major(mask), @@ -436,9 +437,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 8ade5bee4272..839e42631d19 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -276,12 +276,13 @@ def _make_time_major(*args, **kw): # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details # on the following computation to yield correct t=1 to T+1 trajectories, # with T being the rollout length (max trajectory len). - _, B = values_time_major.shape - values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - bootstrap_values_time_major = torch.cat( - [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - ) - values_time_major += bootstrap_values_time_major + #_, B = values_time_major.shape + #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) + #bootstrap_values_time_major = torch.cat( + # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 + #) + #values_time_major += bootstrap_values_time_major + bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): max_seq_len = torch.max(train_batch[SampleBatch.SEQ_LENS]) @@ -304,8 +305,8 @@ def _make_time_major(*args, **kw): target_logits=_make_time_major(unpacked_outputs), discount=self.config["gamma"], rewards=_make_time_major(rewards), - values=values_time_major[:-1], - bootstrap_value=values_time_major[-1], + values=values_time_major, + bootstrap_value=bootstrap_value, dist_class=TorchCategorical if is_multidiscrete else dist_class, model=model, valid_mask=_make_time_major(mask), @@ -364,9 +365,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- sample_batch = super().postprocess_trajectory( - sample_batch, other_agent_batches, episode - ) + #sample_batch = super().postprocess_trajectory( + # sample_batch, other_agent_batches, episode + #) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need From 644098e93ad7617a04c1fffc1fff3b51e43cc52d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 15:40:35 +0200 Subject: [PATCH 14/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 16 ++----------- rllib/algorithms/appo/appo_torch_policy.py | 15 ++---------- rllib/algorithms/appo/tf/appo_tf_learner.py | 23 +------------------ .../appo/torch/appo_torch_learner.py | 22 +----------------- rllib/algorithms/impala/impala_tf_policy.py | 16 ++----------- .../algorithms/impala/impala_torch_policy.py | 15 ++---------- .../algorithms/impala/tf/impala_tf_learner.py | 11 ++++++--- .../impala/torch/impala_torch_learner.py | 11 ++++++--- 8 files changed, 26 insertions(+), 103 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 2afb021e6720..fa1909869c94 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -164,18 +164,6 @@ def make_time_major(*args, **kw): ) bootstrap_value = bootstrap_values_time_major[-1] - ## See docstring of: - ## `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - ## details on the following computation to yield correct t=1 to T+1 - ## trajectories, with T being the rollout length (max trajectory len). - #shape = tf.shape(values_time_major) - #B = shape[1] - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - #values_time_major += bootstrap_values_time_major - if self.is_recurrent(): max_seq_len = tf.reduce_max(train_batch[SampleBatch.SEQ_LENS]) mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS], max_seq_len) @@ -383,9 +371,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if not self.config["vtrace"]: sample_batch = compute_gae_for_sample_batch( diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 6256e3bbef6f..448e7705862c 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -161,17 +161,6 @@ def _make_time_major(*args, **kwargs): bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details - # on the following computation to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - #_, B = values_time_major.shape - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -386,9 +375,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) # Do all post-processing always with no_grad(). # Not using this here will introduce a memory leak diff --git a/rllib/algorithms/appo/tf/appo_tf_learner.py b/rllib/algorithms/appo/tf/appo_tf_learner.py index d4386008f026..3c949d1a090f 100644 --- a/rllib/algorithms/appo/tf/appo_tf_learner.py +++ b/rllib/algorithms/appo/tf/appo_tf_learner.py @@ -78,28 +78,7 @@ def compute_loss_for_module( ) bootstrap_value = bootstrap_values_time_major[-1] - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - #shape = tf.shape(values_time_major) - #B = shape[1] - # Augment `values_time_major` by one timestep at the end (all zeros). - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. - #values_time_major += bootstrap_values_time_major - - # the discount factor that is used should be gamma except for timesteps where + # The discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. discounts_time_major = ( 1.0 diff --git a/rllib/algorithms/appo/torch/appo_torch_learner.py b/rllib/algorithms/appo/torch/appo_torch_learner.py index 039851701ecf..91b86a989264 100644 --- a/rllib/algorithms/appo/torch/appo_torch_learner.py +++ b/rllib/algorithms/appo/torch/appo_torch_learner.py @@ -91,27 +91,7 @@ def compute_loss_for_module( ) bootstrap_value = bootstrap_values_time_major[-1] - # Then add the shifted-by-one bootstrapped values to that to yield the final - # value tensor. Use the last ts in that resulting tensor as the - # "bootstrapped" values for vtrace. - #_, B = values_time_major.shape - # Augment `values_time_major` by one timestep at the end (all zeros). - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - # Augment `bootstrap_values_time_major` by one timestep at the beginning - # (all zeros). - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - # Note that the `SampleBatch.VALUES_BOOTSTRAPPED` values are always recorded - # ONLY at the last ts of a trajectory (for the following timestep, - # which is one past(!) the last ts). All other values in that tensor are - # zero. - # Adding values and bootstrap_values yields the correct values+bootstrap - # configuration, from which we can then take t=-1 (last timestep) to get - # the bootstrap_value arg for the vtrace function below. 
- #values_time_major += bootstrap_values_time_major - - # the discount factor that is used should be gamma except for timesteps where + # The discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. discounts_time_major = ( 1.0 diff --git a/rllib/algorithms/impala/impala_tf_policy.py b/rllib/algorithms/impala/impala_tf_policy.py index 519fd9022e62..5e2700cb3639 100644 --- a/rllib/algorithms/impala/impala_tf_policy.py +++ b/rllib/algorithms/impala/impala_tf_policy.py @@ -354,18 +354,6 @@ def make_time_major(*args, **kw): bootstrap_values_time_major = make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for - # details on the following computation to yield correct t=1 to T+1 - # trajectories, with T being the rollout length (max trajectory len). - #shape = tf.shape(values_time_major) - #B = shape[1] - #values_time_major = tf.concat([values_time_major, tf.zeros((1, B))], axis=0) - #bootstrap_values_time_major = tf.concat( - # [tf.zeros((1, B)), bootstrap_values_time_major], axis=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -437,9 +425,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. - #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/impala_torch_policy.py b/rllib/algorithms/impala/impala_torch_policy.py index 839e42631d19..80b39f64f065 100644 --- a/rllib/algorithms/impala/impala_torch_policy.py +++ b/rllib/algorithms/impala/impala_torch_policy.py @@ -271,17 +271,6 @@ def _make_time_major(*args, **kw): bootstrap_values_time_major = _make_time_major( train_batch[SampleBatch.VALUES_BOOTSTRAPPED] ) - - # See docstring of: - # `ray.rllib.evaluation.postprocessing.compute_bootstrap_value()` for details - # on the following computation to yield correct t=1 to T+1 trajectories, - # with T being the rollout length (max trajectory len). - #_, B = values_time_major.shape - #values_time_major = torch.cat([values_time_major, torch.zeros((1, B))], dim=0) - #bootstrap_values_time_major = torch.cat( - # [torch.zeros((1, B)), bootstrap_values_time_major], dim=0 - #) - #values_time_major += bootstrap_values_time_major bootstrap_value = bootstrap_values_time_major[-1] if self.is_recurrent(): @@ -365,9 +354,9 @@ def postprocess_trajectory( episode: Optional["Episode"] = None, ): # Call super's postprocess_trajectory first. 
- #sample_batch = super().postprocess_trajectory( + # sample_batch = super().postprocess_trajectory( # sample_batch, other_agent_batches, episode - #) + # ) if self.config["vtrace"]: # Add the SampleBatch.VALUES_BOOTSTRAPPED column, which we'll need diff --git a/rllib/algorithms/impala/tf/impala_tf_learner.py b/rllib/algorithms/impala/tf/impala_tf_learner.py index 87ddc2cd938f..03a960c691e3 100644 --- a/rllib/algorithms/impala/tf/impala_tf_learner.py +++ b/rllib/algorithms/impala/tf/impala_tf_learner.py @@ -48,17 +48,22 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, + ) values_time_major = make_time_major( values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. diff --git a/rllib/algorithms/impala/torch/impala_torch_learner.py b/rllib/algorithms/impala/torch/impala_torch_learner.py index 9a5d25f32ed6..8aa7b8b3d07d 100644 --- a/rllib/algorithms/impala/torch/impala_torch_learner.py +++ b/rllib/algorithms/impala/torch/impala_torch_learner.py @@ -58,17 +58,22 @@ def compute_loss_for_module( trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + rewards_time_major = make_time_major( + batch[SampleBatch.REWARDS], + trajectory_len=hps.rollout_frag_or_episode_len, + recurrent_seq_len=hps.recurrent_seq_len, + ) values_time_major = make_time_major( values, trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) - bootstrap_value = values_time_major[-1] - rewards_time_major = make_time_major( - batch[SampleBatch.REWARDS], + bootstrap_values_time_major = make_time_major( + batch[SampleBatch.VALUES_BOOTSTRAPPED], trajectory_len=hps.rollout_frag_or_episode_len, recurrent_seq_len=hps.recurrent_seq_len, ) + bootstrap_value = bootstrap_values_time_major[-1] # the discount factor that is used should be gamma except for timesteps where # the episode is terminated. In that case, the discount factor should be 0. From c8cc8ae01a549f25066f58b610af3d114ebbc3e2 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Tue, 20 Jun 2023 17:21:17 +0200 Subject: [PATCH 15/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 685d8b900fe2..3afa125078a6 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -223,16 +223,6 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample VF_PREDS: . . . . . VALUES_BOOTSTRAPPED: 0 0 0 0 X - Then in the loss, we can now shift VALUES_BOOTSTRAPPED by one to the right -> - T: 8 9 10 11 12 13 <- next timestep - VF_PREDS: . . . . . 
0.0 <- fill right side ts (13) with 0.0 - VALUES_BOOTSTRAPPED: 0.0 0 0 0 0 X <- fill left side ts (8) with 0.0 - - Then add the two columns together to yield the correct value estimates for each - timestep. - T: 8 9 10 11 12 13 <- next (bootstrapped) timestep - RESULTING VALUES: . . . . . 0+X - Args: sample_batch: The SampleBatch (single trajectory) for which to compute the bootstrap value at the end. This SampleBatch will be altered in place From 56a3dea96a170e62df00ab28c93ac61b5a85be7d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 13:42:53 +0200 Subject: [PATCH 16/22] wip Signed-off-by: sven1977 --- .../impala/tests/test_impala_learner.py | 3 +++ rllib/evaluation/postprocessing.py | 9 +++++--- .../appo/stateless_cartpole_appo_vtrace.py | 21 +++++++++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py diff --git a/rllib/algorithms/impala/tests/test_impala_learner.py b/rllib/algorithms/impala/tests/test_impala_learner.py index 7f130ca93997..1e3672074c37 100644 --- a/rllib/algorithms/impala/tests/test_impala_learner.py +++ b/rllib/algorithms/impala/tests/test_impala_learner.py @@ -31,6 +31,9 @@ SampleBatch.VF_PREDS: np.array( list(reversed(range(frag_length))), dtype=np.float32 ), + SampleBatch.VALUES_BOOTSTRAPPED: np.array( + list(reversed(range(frag_length))), dtype=np.float32 + ), SampleBatch.ACTION_LOGP: np.log( np.random.uniform(low=0, high=1, size=(frag_length,)) ).astype(np.float32), diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 3afa125078a6..fed2863392cb 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,9 +273,12 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. 
- values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) - values_bootstrapped[-1] = last_r - sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( + [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 + ) + #values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) + #values_bootstrapped[-1] = last_r + #sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped return sample_batch diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py b/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py new file mode 100644 index 000000000000..93ed841c2746 --- /dev/null +++ b/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py @@ -0,0 +1,21 @@ +from ray.rllib.algorithms.appo.appo import APPOConfig +from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole + + +config = ( + APPOConfig() + .environment(StatelessCartPole) + .resources(num_gpus=0) + .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter")#TODO rollout workers=5 + .training(lr=0.0003, num_sgd_iter=6, vf_loss_coeff=0.01, model={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + "use_lstm": True, + }) +) + +stop = { + "timesteps_total": 500000, + "sampler_results/episode_reward_mean": 150.0, +} From bf0bb3644629a62ffb4d9afede09f12e6887e394 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 14:46:40 +0200 Subject: [PATCH 17/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 10 ++++++++++ rllib/algorithms/appo/appo_tf_policy.py | 3 +-- rllib/algorithms/appo/appo_torch_policy.py | 5 +---- rllib/evaluation/postprocessing.py | 5 +---- ...ppo_vtrace.py => stateless-cartpole-appo-vtrace.py} | 0 5 files changed, 13 insertions(+), 10 deletions(-) rename rllib/tuned_examples/appo/{stateless_cartpole_appo_vtrace.py => stateless-cartpole-appo-vtrace.py} (100%) diff --git a/rllib/BUILD b/rllib/BUILD index 0963a733e149..a11b7f189ef6 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -301,6 +301,16 @@ py_test( args = ["--dir=tuned_examples/appo"] ) +py_test( + name = "learning_tests_stateless_cartpole_appo_vtrace", + main = "tests/run_regression_tests.py", + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], + size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer + srcs = ["tests/run_regression_tests.py"], + data = ["tuned_examples/appo/stateless-cartpole-appo-vtrace.py"], + args = ["--dir=tuned_examples/appo"] +) + # ARS py_test( name = "learning_tests_cartpole_ars", diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index fa1909869c94..16579f18eb7a 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -389,8 +389,7 @@ def postprocess_trajectory( @override(base) def extra_action_out_fn(self) -> Dict[str, TensorType]: extra_action_fetches = super().extra_action_out_fn() - if not self.config["vtrace"]: - extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() + extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() return extra_action_fetches @override(base) diff --git a/rllib/algorithms/appo/appo_torch_policy.py b/rllib/algorithms/appo/appo_torch_policy.py index 448e7705862c..a97446c44281 100644 --- a/rllib/algorithms/appo/appo_torch_policy.py +++ b/rllib/algorithms/appo/appo_torch_policy.py @@ -362,10 
+362,7 @@ def extra_action_out( model: TorchModelV2, action_dist: TorchDistributionWrapper, ) -> Dict[str, TensorType]: - out = {} - if not self.config["vtrace"]: - out[SampleBatch.VF_PREDS] = model.value_function() - return out + return {SampleBatch.VF_PREDS: model.value_function()} @override(TorchPolicyV2) def postprocess_trajectory( diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index fed2863392cb..47b2531dfc33 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -270,15 +270,12 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample else: last_r = policy._value(**input_dict) - # Set the SampleBatch.VALUES_BOOTSTRAPPED field to all 0.0, except for the + # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 ) - #values_bootstrapped = np.zeros_like(sample_batch[SampleBatch.REWARDS]) - #values_bootstrapped[-1] = last_r - #sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = values_bootstrapped return sample_batch diff --git a/rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py similarity index 100% rename from rllib/tuned_examples/appo/stateless_cartpole_appo_vtrace.py rename to rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py From 9c8c7e2b6e3ffc9dcf7374be582e345b5d025c9e Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 15:52:43 +0200 Subject: [PATCH 18/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 7 +++--- .../appo/stateless-cartpole-appo-vtrace.py | 22 +++++++++++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 47b2531dfc33..1b6f026d330e 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,9 +273,10 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. 
- sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( - [sample_batch[SampleBatch.VF_PREDS][1:], np.array([last_r])], axis=0 - ) + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate([ + convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), + np.array([convert_to_numpy(last_r)], dtype=np.float32), + ], axis=0) return sample_batch diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index 93ed841c2746..50293322a4c3 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -6,13 +6,21 @@ APPOConfig() .environment(StatelessCartPole) .resources(num_gpus=0) - .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter")#TODO rollout workers=5 - .training(lr=0.0003, num_sgd_iter=6, vf_loss_coeff=0.01, model={ - "fcnet_hiddens": [32], - "fcnet_activation": "linear", - "vf_share_layers": True, - "use_lstm": True, - }) + .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter") + .training( + lr=0.0003, + num_sgd_iter=6, + vf_loss_coeff=0.01, + model={ + "fcnet_hiddens": [32], + "fcnet_activation": "linear", + "vf_share_layers": True, + "use_lstm": True, + }, + # TODO: Switch over to new stack. + # _enable_learner_api=True, + ) + # .rl_module(_enable_rl_module_api=True) ) stop = { From d2d0eb3524f07c1b0b3c5eff8ade4dcd8cc6348d Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 16:16:59 +0200 Subject: [PATCH 19/22] wip Signed-off-by: sven1977 --- rllib/evaluation/postprocessing.py | 11 +++++++---- .../appo/stateless-cartpole-appo-vtrace.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/rllib/evaluation/postprocessing.py b/rllib/evaluation/postprocessing.py index 1b6f026d330e..525d375a67af 100644 --- a/rllib/evaluation/postprocessing.py +++ b/rllib/evaluation/postprocessing.py @@ -273,10 +273,13 @@ def compute_bootstrap_value(sample_batch: SampleBatch, policy: Policy) -> Sample # Set the SampleBatch.VALUES_BOOTSTRAPPED field to VF_PREDS[1:] + the # very last timestep (where this bootstrapping value is actually needed), which # we set to the computed `last_r`. - sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate([ - convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), - np.array([convert_to_numpy(last_r)], dtype=np.float32), - ], axis=0) + sample_batch[SampleBatch.VALUES_BOOTSTRAPPED] = np.concatenate( + [ + convert_to_numpy(sample_batch[SampleBatch.VF_PREDS][1:]), + np.array([convert_to_numpy(last_r)], dtype=np.float32), + ], + axis=0, + ) return sample_batch diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index 50293322a4c3..fc3c37e1d9dd 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -17,7 +17,7 @@ "vf_share_layers": True, "use_lstm": True, }, - # TODO: Switch over to new stack. + # TODO: Switch over to new stack once it supports LSTMs. 
# _enable_learner_api=True, ) # .rl_module(_enable_rl_module_api=True) From 0ab57036ef7acf00c4b6f472a9593248f602fd3c Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 16:58:05 +0200 Subject: [PATCH 20/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 2 +- rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index 0fbd2f1ff50f..d8d833165a6e 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -3720,7 +3720,7 @@ py_test( ) # Taking out this test for now: Mixed torch- and tf- policies within the same -# Algorothm never really worked. +# Algorithm never really worked. # py_test( # name = "examples/multi_agent_two_trainers_mixed_torch_tf", # main = "examples/multi_agent_two_trainers.py", diff --git a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py index fc3c37e1d9dd..e127da1e28c3 100644 --- a/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py +++ b/rllib/tuned_examples/appo/stateless-cartpole-appo-vtrace.py @@ -6,7 +6,7 @@ APPOConfig() .environment(StatelessCartPole) .resources(num_gpus=0) - .rollouts(num_rollout_workers=0, observation_filter="MeanStdFilter") + .rollouts(num_rollout_workers=1, observation_filter="MeanStdFilter") .training( lr=0.0003, num_sgd_iter=6, From 17be7ccf3c81d219109bbed0f09b4e5cb6541043 Mon Sep 17 00:00:00 2001 From: sven1977 Date: Wed, 21 Jun 2023 18:31:16 +0200 Subject: [PATCH 21/22] wip Signed-off-by: sven1977 --- rllib/BUILD | 4 ++-- rllib/algorithms/appo/tests/test_appo_learner.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rllib/BUILD b/rllib/BUILD index d8d833165a6e..feab865a38dc 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -304,8 +304,8 @@ py_test( py_test( name = "learning_tests_stateless_cartpole_appo_vtrace", main = "tests/run_regression_tests.py", - tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_discrete"], - size = "medium", # bazel may complain about it being too long sometimes - medium is on purpose as some frameworks take longer + tags = ["team:rllib", "exclusive", "learning_tests", "learning_tests_cartpole", "learning_tests_discrete"], + size = "large", srcs = ["tests/run_regression_tests.py"], data = ["tuned_examples/appo/stateless-cartpole-appo-vtrace.py"], args = ["--dir=tuned_examples/appo"] diff --git a/rllib/algorithms/appo/tests/test_appo_learner.py b/rllib/algorithms/appo/tests/test_appo_learner.py index 19bcacdc89be..6a018ca40344 100644 --- a/rllib/algorithms/appo/tests/test_appo_learner.py +++ b/rllib/algorithms/appo/tests/test_appo_learner.py @@ -36,6 +36,9 @@ SampleBatch.VF_PREDS: np.array( list(reversed(range(frag_length))), dtype=np.float32 ), + SampleBatch.VALUES_BOOTSTRAPPED: np.array( + list(reversed(range(frag_length))), dtype=np.float32 + ), SampleBatch.ACTION_LOGP: np.log( np.random.uniform(low=0, high=1, size=(frag_length,)) ).astype(np.float32), From f55532c10702fba53e6df4e67a35769c44daa52b Mon Sep 17 00:00:00 2001 From: sven1977 Date: Thu, 22 Jun 2023 12:17:28 +0200 Subject: [PATCH 22/22] wip Signed-off-by: sven1977 --- rllib/algorithms/appo/appo_tf_policy.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rllib/algorithms/appo/appo_tf_policy.py b/rllib/algorithms/appo/appo_tf_policy.py index 16579f18eb7a..ec4dd78c295e 100644 --- a/rllib/algorithms/appo/appo_tf_policy.py +++ b/rllib/algorithms/appo/appo_tf_policy.py @@ -386,12 +386,6 @@ def postprocess_trajectory( return sample_batch - 
@override(base) - def extra_action_out_fn(self) -> Dict[str, TensorType]: - extra_action_fetches = super().extra_action_out_fn() - extra_action_fetches[SampleBatch.VF_PREDS] = self.model.value_function() - return extra_action_fetches - @override(base) def get_batch_divisibility_req(self) -> int: return self.config["rollout_fragment_length"]
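
For reference, a minimal sketch of the bootstrap-value convention this series ends up with (illustrative only, not part of the diffs above; all numbers below are made up): `compute_bootstrap_value()` writes a `SampleBatch.VALUES_BOOTSTRAPPED` column equal to `VF_PREDS` shifted left by one timestep, with the value estimate for the step one past the trajectory end appended (0.0 if the trajectory terminated, otherwise the policy's value head evaluated on `NEXT_OBS[-1]` and, for recurrent models, `STATE_OUT[-1]`). The IMPALA/APPO losses then pass v-trace the full time-major value tensor plus `bootstrap_value = make_time_major(VALUES_BOOTSTRAPPED)[-1]`, so no timesteps need to be dropped anymore and `vtrace_drop_last_ts` can be hard-deprecated.

    # Illustrative numpy mock of the above; the hard-coded 1.4 stands in for the
    # value head's output on the last next-observation (policy._value(...)).
    import numpy as np

    vf_preds = np.array([0.9, 1.1, 1.0, 1.2, 1.3], dtype=np.float32)  # VF_PREDS, T=5
    terminated = False  # trajectory was truncated, not terminated

    # compute_bootstrap_value(): last_r is 0.0 for terminals, else the value
    # estimate for the observation following the trajectory's last timestep.
    last_r = 0.0 if terminated else 1.4

    values_bootstrapped = np.concatenate(
        [vf_preds[1:], np.array([last_r], dtype=np.float32)]
    )
    # -> [1.1, 1.0, 1.2, 1.3, 1.4], i.e. VF_PREDS shifted by one timestep plus the
    # bootstrapped value for the timestep one past the end of the trajectory.

    # In the loss (after make_time_major), v-trace receives the full value tensor
    # and the last bootstrapped entry; no more values_time_major[:-1] slicing.
    values = vf_preds
    bootstrap_value = values_bootstrapped[-1]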