Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RLlib; Offline RL] - Replace GAE in MARWILOfflinePreLearner with GeneralAdvantageEstimation connector in learner pipeline. #47532

26 changes: 17 additions & 9 deletions rllib/algorithms/marwil/marwil.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,11 @@
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.algorithm_config import AlgorithmConfig, NotProvided
from ray.rllib.algorithms.marwil.marwil_catalog import MARWILCatalog
from ray.rllib.algorithms.marwil.marwil_offline_prelearner import (
MARWILOfflinePreLearner,
)
from ray.rllib.connectors.common.add_observations_from_episodes_to_batch import (
from ray.rllib.connectors.learner import (
AddObservationsFromEpisodesToBatch,
)
from ray.rllib.connectors.learner.add_next_observations_from_episodes_to_train_batch import ( # noqa
AddOneTsToEpisodesAndTruncate,
AddNextObservationsFromEpisodesToTrainBatch,
GeneralAdvantageEstimation,
)
from ray.rllib.core.learner.learner import Learner
from ray.rllib.core.rl_module.rl_module import RLModuleSpec
Expand Down Expand Up @@ -104,9 +101,6 @@ def __init__(self, algo_class=None):

# Override some of AlgorithmConfig's default values with MARWIL-specific values.

# Define the `OfflinePreLearner` class for `MARWIL`.
self.prelearner_class = MARWILOfflinePreLearner

# You should override input_ to point to an offline dataset
# (see algorithm.py and algorithm_config.py).
# The dataset may have an arbitrary number of timesteps
Expand Down Expand Up @@ -283,13 +277,27 @@ def build_learner_connector(
device=device,
)

# Before anything, add one ts to each episode (and record this in the loss
# mask, so that the computations at this extra ts are not used to compute
# the loss).
pipeline.prepend(AddOneTsToEpisodesAndTruncate())

# Prepend the "add-NEXT_OBS-from-episodes-to-train-batch" connector piece (right
# after the corresponding "add-OBS-..." default piece).
pipeline.insert_after(
AddObservationsFromEpisodesToBatch,
AddNextObservationsFromEpisodesToTrainBatch(),
)

# At the end of the pipeline (when the batch is already completed), add the
# GAE connector, which performs a vf forward pass, then computes the GAE
# computations, and puts the results of this (advantages, value targets)
# directly back in the batch. This is then the batch used for
# `forward_train` and `compute_losses`.
pipeline.append(
GeneralAdvantageEstimation(gamma=self.gamma, lambda_=self.lambda_)
)

return pipeline

@override(AlgorithmConfig)
Expand Down
237 changes: 0 additions & 237 deletions rllib/algorithms/marwil/marwil_offline_prelearner.py

This file was deleted.

8 changes: 4 additions & 4 deletions rllib/offline/offline_prelearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, List[EpisodeType]]
episodes = OfflinePreLearner._map_sample_batch_to_episode(
self._is_multi_agent,
batch,
finalize=False,
finalize=True,
schema=SCHEMA | self.config.input_read_schema,
input_compress_columns=self.config.input_compress_columns,
)["episodes"]
Expand All @@ -160,7 +160,7 @@ def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, List[EpisodeType]]
self._is_multi_agent,
batch,
schema=SCHEMA | self.config.input_read_schema,
finalize=False,
finalize=True,
input_compress_columns=self.config.input_compress_columns,
observation_space=self.observation_space,
action_space=self.action_space,
Expand Down Expand Up @@ -285,7 +285,7 @@ def convert(sample, space):
else:
# Build a single-agent episode with a single row of the batch.
episode = SingleAgentEpisode(
id_=batch[schema[Columns.EPS_ID]][i],
id_=str(batch[schema[Columns.EPS_ID]][i]),
agent_id=agent_id,
# Observations might be (a) serialized and/or (b) converted
# to a JSONable (when a composite space was used). We unserialize
Expand Down Expand Up @@ -412,7 +412,7 @@ def _map_sample_batch_to_episode(
)
# Create a `SingleAgentEpisode`.
episode = SingleAgentEpisode(
id_=batch[schema[Columns.EPS_ID]][i][0],
id_=str(batch[schema[Columns.EPS_ID]][i][0]),
agent_id=agent_id,
observations=obs,
infos=(
Expand Down
7 changes: 7 additions & 0 deletions rllib/tuned_examples/cql/pendulum_cql.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@
)
.offline_data(
input_=[data_path.as_posix()],
# Define the number of reading blocks; these should be larger than 1
# and aligned with the data size.
input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
# Concurrency defines the number of processes that run the
# `map_batches` transformations. This should be aligned with the
# 'prefetch_batches' argument in 'iter_batches_kwargs'.
map_batches_kwargs={"concurrency": max(2, args.num_gpus * 2)},
actions_in_input_normalized=True,
dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
)
Expand Down
2 changes: 1 addition & 1 deletion rllib/tuned_examples/marwil/cartpole_marwil.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
# as remote learners.
.offline_data(
input_=[data_path.as_posix()],
input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 1)},
input_read_method_kwargs={"override_num_blocks": max(args.num_gpus, 2)},
prelearner_module_synch_period=20,
dataset_num_iters_per_learner=1 if args.num_gpus == 0 else None,
)
Expand Down